1316 files changed, 86081 insertions, 20340 deletions
diff --git a/.bazelrc b/.bazelrc
index ef214bcfa..4a0671f4a 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -30,10 +30,10 @@ build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools"
 
 # Add a custom platform and toolchain that builds in a privileged docker
 # container, which is required by our syscall tests.
-build:remote --host_platform=//:rbe_ubuntu1604
-build:remote --extra_toolchains=//:cc-toolchain-clang-x86_64-default
-build:remote --extra_execution_platforms=//:rbe_ubuntu1604
-build:remote --platforms=//:rbe_ubuntu1604
+build:remote --host_platform=//tools/bazeldefs:rbe_ubuntu1604
+build:remote --extra_toolchains=//tools/bazeldefs:cc-toolchain-clang-x86_64-default
+build:remote --extra_execution_platforms=//tools/bazeldefs:rbe_ubuntu1604
+build:remote --platforms=//tools/bazeldefs:rbe_ubuntu1604
 build:remote --crosstool_top=@rbe_default//cc:toolchain
 build:remote --jobs=50
 build:remote --remote_timeout=3600
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 000000000..49a1ba697
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,31 @@
+---
+name: Bug report
+about: Create a bug report to help us improve
+title:
+labels:
+  - 'type: bug'
+assignees: ''
+---
+
+**Description**
+
+A clear description of what the bug is. If possible, explicitly indicate the
+expected behavior vs. the observed behavior.
+
+**Steps to reproduce**
+
+If available, please include detailed reproduction steps.
+
+If the bug requires software that is not publicly available, see if it can be
+reproduced with software that is publicly available.
+
+**Environment**
+
+Please include the following details of your environment:
+
+*   `runsc -v`
+*   `docker version` or `docker info` (if available)
+*   `kubectl version` and `kubectl get nodes` (if using Kubernetes)
+*   `uname -a`
+*   `git describe` (if built from source)
+*   `runsc` debug logs (if available)
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..772c9a0ac
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,11 @@
+blank_issues_enabled: false
+contact_links:
+  - name: gVisor Documentation (FAQ)
+    url: https://gvisor.dev/docs/user_guide/faq/
+    about: Please see our documentation for common questions and answers.
+  - name: gVisor Documentation (Debugging)
+    url: https://gvisor.dev/docs/user_guide/debugging/
+    about: Please see our documentation for debugging tips.
+  - name: gVisor User Forum
+    url: https://groups.google.com/g/gvisor-users
+    about: Ask and answer general questions here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 000000000..65f60f385
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,21 @@
+---
+name: Feature request
+about: Suggest an idea or improvement
+title: ''
+labels:
+  - 'type: enhancement'
+assignees: ''
+---
+
+**Description**
+
+A clear description of the feature or enhancement.
+
+**Is this feature related to a specific bug?**
+
+Please include a bug references if yes.
+
+**Do you have a specific solution in mind?**
+
+Please include any details about a solution that you have in mind, including any
+alternatives considered.
diff --git a/.github/issue_template.md b/.github/issue_template.md
deleted file mode 100644
index 77c401d22..000000000
--- a/.github/issue_template.md
+++ /dev/null
@@ -1,20 +0,0 @@
-Before filling an issue, please consult our FAQ:
-https://gvisor.dev/docs/user_guide/faq/
-
-Also check that the issue hasn't been reported before.
-
-If you have a question, please email gvisor-users@googlegroups.com rather than filing a bug.
-
-If you believe you've found a security issue, please email gvisor-security@googlegroups.com rather than filing a bug.
-
-If this is your first time compiling or running gVisor, please make sure that your system meets the minimum requirements: https://github.com/google/gvisor#requirements
-
-For all other issues, please attach debug logs. To get debug logs, follow the
-instructions here: https://gvisor.dev/docs/user_guide/debugging/
-
-Other useful information to include is:
-
-*   `runsc -v`
-*   `docker version` or `docker info` if more relevant
-*   `uname -a` - `git describe`
-*   Detailed reproduction steps
diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 000000000..b6a17051c
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,42 @@
+"arch: arm":
+  - "**/*_arm64.*"
+  - "**/*_aarch64.*"
+"arch: x86_64":
+  - "**/*_amd64.*"
+  - "**/*_x86.*"
+"area: bazel":
+  - "**/BUILD"
+  - "**/*.bzl"
+"area: docs":
+  - "**/g3doc/**"
+  - "**/README.md"
+"area: filesystem":
+  - "pkg/sentry/fs/**"
+  - "pkg/sentry/vfs/**"
+  - "pkg/sentry/fsimpl/**"
+"area: hostinet":
+  - "pkg/sentry/socket/hostinet/**"
+"area: networking":
+  - "pkg/tcpip/**"
+  - "pkg/sentry/socket/**"
+"area: kernel":
+  - "pkg/sentry/arch/**"
+  - "pkg/sentry/kernel/**"
+  - "pkg/sentry/syscalls/**"
+"area: mm":
+  - "pkg/sentry/mm/**"
+"area: tests":
+  - "**/tests/**"
+  - "**/*_test.go"
+  - "**/test/**"
+"area: tooling":
+  - "tools/**"
+"dependencies":
+  - "WORKSPACE"
+  - "go.mod"
+  - "go.sum"
+"platform: kvm":
+  - "pkg/sentry/platform/kvm/**"
+  - "pkg/sentry/platform/ring0/**"
+"platform: ptrace":
+  - "pkg/sentry/platform/ptrace/**"
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 000000000..264b4e9fa
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,5 @@
+* [ ] Have you followed the guidelines in [CONTRIBUTING.md](../blob/master/CONTRIBUTING.md)?
+* [ ] Have you formatted and linted your code?
+* [ ] Have you added relevant tests?
+* [ ] Have you added appropriate Fixes & Updates references?
+* [ ] If yes, please erase all these lines!
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 000000000..cf782a580
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,21 @@
+name: "Build"
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  default:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/cache@v1
+      with:
+        path: ~/.cache/bazel
+        key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }}
+        restore-keys: |
+          ${{ runner.os }}-bazel-
+    - run: make
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
new file mode 100644
index 000000000..10c86f5cd
--- /dev/null
+++ b/.github/workflows/go.yml
@@ -0,0 +1,66 @@
+name: "Go"
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  generate:
+    runs-on: ubuntu-latest
+    steps:
+    - run: |
+        jq -nc '{"state": "pending", "context": "go tests"}' | \
+        curl -sL -X POST -d @- \
+            -H "Content-Type: application/json" \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            "${{ github.event.pull_request.statuses_url }}"
+      if: github.event_name == 'pull_request'
+    - uses: actions/checkout@v2
+      if: github.event_name == 'push'
+      with:
+        fetch-depth: 0
+        token: '${{ secrets.GO_TOKEN }}'
+    - uses: actions/checkout@v2
+      if: github.event_name == 'pull_request'
+      with:
+        fetch-depth: 0
+    - uses: actions/setup-go@v2
+      with:
+        go-version: 1.14
+    - uses: actions/cache@v1
+      with:
+        path: ~/go/pkg/mod
+        key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
+        restore-keys: |
+          ${{ runner.os }}-go-
+    - uses: actions/cache@v1
+      with:
+        path: ~/.cache/bazel
+        key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }}
+        restore-keys: |
+          ${{ runner.os }}-bazel-
+    - run: make build TARGETS="//:gopath"
+    - run: tools/go_branch.sh
+    - run: git checkout go && git clean -f
+    - run: go build ./...
+    - if: github.event_name == 'push'
+      run: |
+        git remote add upstream "https://github.com/${{ github.repository }}"
+        git push upstream go:go
+    - if: ${{ success() && github.event_name == 'pull_request' }}
+      run: |
+        jq -nc '{"state": "success", "context": "go tests"}' | \
+        curl -sL  -X POST -d @- \
+            -H "Content-Type: application/json" \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            "${{ github.event.pull_request.statuses_url }}"
+    - if: ${{ failure() && github.event_name == 'pull_request' }}
+      run: |
+        jq -nc '{"state": "failure", "context": "go tests"}' | \
+        curl -sL  -X POST -d @- \
+            -H "Content-Type: application/json" \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            "${{ github.event.pull_request.statuses_url }}"
diff --git a/.github/workflows/issue_reviver.yml b/.github/workflows/issue_reviver.yml
new file mode 100644
index 000000000..5e0254111
--- /dev/null
+++ b/.github/workflows/issue_reviver.yml
@@ -0,0 +1,14 @@
+name: "Issue reviver"
+on:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  label:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - run: make run TARGETS="//tools/issue_reviver"
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        GITHUB_REPOSITORY: ${{ github.repository }}
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 000000000..c09f7eb36
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,12 @@
+name: "Labeler"
+on:
+- pull_request
+
+jobs:
+  label:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/labeler@v2
+      if: github.base_ref == null
+      with:
+        repo-token: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
new file mode 100644
index 000000000..0b31fecf5
--- /dev/null
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,20 @@
+name: "Close stale issues"
+on:
+  schedule:
+  - cron: "0 0 * * *"
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/stale@v3
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        stale-issue-label: 'stale'
+        stale-pr-label: 'stale'
+        exempt-issue-labels: 'exported, type: bug, type: cleanup, type: enhancement, type: process, type: proposal, type: question'
+        exempt-pr-labels: 'ready to pull'
+        stale-issue-message: 'This issue is stale because it has been open 90 days with no activity. Remove the stale label or comment or this will be closed in 30 days.'
+        stale-pr-message: 'This pull request is stale because it has been open 90 days with no activity. Remove the stale label or comment or this will be closed in 30 days.'
+        days-before-stale: 90
+        days-before-close: 30
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 000000000..fbc0e46d7
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,23 @@
+language: shell
+dist: xenial
+cache:
+  directories:
+    - /home/travis/.cache/bazel/
+os: linux
+services:
+  - docker
+jobs:
+  include:
+   - os: linux
+     arch: amd64
+   - os: linux
+     arch: arm64
+script:
+   # On arm64, we need to create our own pipes for stderr and stdout,
+   # otherwise we will not be able to open /dev/stderr. This is probably
+   # due to AppArmor rules.
+   - uname -a && make smoke-test 2>&1 | cat
+branches:
+  except:
+  # Skip copybara branches.
+  - /^test\/cl.*$/
diff --git a/BUILD b/BUILD
index 5fd929378..962d54821 100644
--- a/BUILD
+++ b/BUILD
@@ -1,13 +1,62 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_path", "nogo")
-load("@bazel_gazelle//:def.bzl", "gazelle")
+load("//tools:defs.bzl", "build_test", "gazelle", "go_path")
+load("//website:defs.bzl", "doc")
 
 package(licenses = ["notice"])
 
+exports_files(["LICENSE"])
+
+doc(
+    name = "contributing",
+    src = "CONTRIBUTING.md",
+    category = "Project",
+    permalink = "/contributing/",
+    visibility = ["//website:__pkg__"],
+    weight = "20",
+)
+
+doc(
+    name = "security",
+    src = "SECURITY.md",
+    category = "Project",
+    permalink = "/security/",
+    visibility = ["//website:__pkg__"],
+    weight = "30",
+)
+
+doc(
+    name = "governance",
+    src = "GOVERNANCE.md",
+    category = "Project",
+    permalink = "/community/governance/",
+    subcategory = "Community",
+    visibility = ["//website:__pkg__"],
+    weight = "91",
+)
+
+doc(
+    name = "code_of_conduct",
+    src = "CODE_OF_CONDUCT.md",
+    category = "Project",
+    permalink = "/community/code_of_conduct/",
+    subcategory = "Community",
+    visibility = ["//website:__pkg__"],
+    weight = "99",
+)
+
 # The sandbox filegroup is used for sandbox-internal dependencies.
 package_group(
     name = "sandbox",
-    packages = [
-        "//...",
+    packages = ["//..."],
+)
+
+# For targets that will not normally build internally, we ensure that they are
+# least build by a static BUILD test.
+build_test(
+    name = "build_test",
+    targets = [
+        "//test/e2e:integration_test",
+        "//test/image:image_test",
+        "//test/root:root_test",
     ],
 )
 
@@ -43,58 +92,3 @@ go_path(
 # To update the WORKSPACE from go.mod, use:
 #   bazel run //:gazelle -- update-repos -from_file=go.mod
 gazelle(name = "gazelle")
-
-# nogo applies checks to all Go source in this repository, enforcing code
-# guidelines and restrictions. Note that the tool libraries themselves should
-# live in the tools subdirectory (unless they are standard).
-nogo(
-    name = "nogo",
-    config = "//tools:nogo.js",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tools/checkunsafe",
-    ],
-)
-
-# We need to define a bazel platform and toolchain to specify dockerPrivileged
-# and dockerRunAsRoot options, they are required to run tests on the RBE
-# cluster in Kokoro.
-alias(
-    name = "rbe_ubuntu1604",
-    actual = ":rbe_ubuntu1604_r346485",
-)
-
-platform(
-    name = "rbe_ubuntu1604_r346485",
-    constraint_values = [
-        "@bazel_tools//platforms:x86_64",
-        "@bazel_tools//platforms:linux",
-        "@bazel_tools//tools/cpp:clang",
-        "@bazel_toolchains//constraints:xenial",
-        "@bazel_toolchains//constraints/sanitizers:support_msan",
-    ],
-    remote_execution_properties = """
-        properties: {
-          name: "container-image"
-          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
-        }
-        properties: {
-          name: "dockerAddCapabilities"
-          value: "SYS_ADMIN"
-        }
-        properties: {
-          name: "dockerPrivileged"
-          value: "true"
-        }
-    """,
-)
-
-toolchain(
-    name = "cc-toolchain-clang-x86_64-default",
-    exec_compatible_with = [
-    ],
-    target_compatible_with = [
-    ],
-    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
-    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
-)
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index eb6c8edae..fbf517fe5 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -87,6 +87,5 @@ harassment or threats to anyone's safety, we may take action without notice.
 
 ## Attribution
 
-This Code of Conduct is adapted from the Contributor Covenant, version 1.4,
-available at
-https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+This Code of Conduct is adapted from the
+[Contributor Covenant, version 1.4](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html).
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 71650a4b8..3f8f4c985 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -32,12 +32,15 @@ will need to be added to the appropriate `BUILD` files, and the `:gopath` target
 will need to be re-run to generate appropriate symlinks in the `GOPATH`
 directory tree.
 
+Dependencies can be added by using `go mod get`. In order to keep the
+`WORKSPACE` file in sync, run `tools/go_mod.sh` in place of `go mod`.
+
 ### Coding Guidelines
 
 All Go code should conform to the [Go style guidelines][gostyle]. C++ code
 should conform to the [Google C++ Style Guide][cppstyle] and the guidelines
-described for [tests][teststyle]. Note that code may be automatically formatted
-per the guidelines when merged.
+described for tests. Note that code may be automatically formatted per the
+guidelines when merged.
 
 As a secure runtime, we need to maintain the safety of all of code included in
 gVisor. The following rules help mitigate issues.
@@ -105,32 +108,15 @@ ignored.
 
 ### Build and test with Docker
 
-`scripts/dev.sh` is a convenient script that builds and installs `runsc` as a
-new Docker runtime for you. The scripts tries to extract the runtime name from
-your local environment and will print it at the end. You can also customize it.
-The script creates one regular runtime and another with debug flags enabled.
-Here are a few examples:
+Running `make dev` is a convenient way to build and install `runsc` as a Docker
+runtime. The output of this command will show the runtimes installed.
+
+You may use `make refresh` to refresh the binary after any changes. For example:
 
 ```bash
-# Default case (inside branch my-branch)
-$ scripts/dev.sh
-...
-Runtimes my-branch and my-branch-d (debug enabled) setup.
-Use --runtime=my-branch with your Docker command.
-  docker run --rm --runtime=my-branch --rm hello-world
-
-If you rebuild, use scripts/dev.sh --refresh.
-Logs are in: /tmp/my-branch/logs
-
-# --refresh just updates the runtime binary and doesn't restart docker.
-$ git/my_branch> scripts/dev.sh --refresh
-
-# Using a custom runtime name
-$ git/my_branch> scripts/dev.sh my-runtime
-...
-Runtimes my-runtime and my-runtime-d (debug enabled) setup.
-Use --runtime=my-runtime with your Docker command.
-  docker run --rm --runtime=my-runtime --rm hello-world
+make dev
+docker run --rm --runtime=my-branch --rm hello-world
+make refresh
 ```
 
 ### The small print
@@ -145,4 +131,3 @@ one above, the
 [github]: https://github.com/google/gvisor/compare
 [gvisor-dev-list]: https://groups.google.com/forum/#!forum/gvisor-dev
 [gostyle]: https://github.com/golang/go/wiki/CodeReviewComments
-[teststyle]: ./test/
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 738623023..000000000
--- a/Dockerfile
+++ /dev/null
@@ -1,8 +0,0 @@
-FROM ubuntu:bionic
-
-RUN apt-get update && apt-get install -y curl gnupg2 git python python3 python3-distutils python3-pip
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
-RUN apt-get update && apt-get install -y bazel && apt-get clean
-
-WORKDIR /gvisor
diff --git a/GOVERNANCE.md b/GOVERNANCE.md
new file mode 100644
index 000000000..40846bc2f
--- /dev/null
+++ b/GOVERNANCE.md
@@ -0,0 +1,113 @@
+# Governance
+
+## Projects
+
+A *project* is the primary unit of collaboration. Each project may have its own
+repository and contribution process.
+
+All projects are covered by the [Code of Conduct](CODE_OF_CONDUCT.md), and
+should include an up-to-date copy in the project repository or a link here.
+
+## Contributors
+
+Anyone can be a *contributor* to a project, provided they have signed relevant
+Contributor License Agreements (CLAs) and follow the project's contribution
+guidelines. Contributions will be reviewed by a maintainer, and must pass all
+applicable tests.
+
+Reviews check for code quality and style, including documentation, and enforce
+other policies. Contributions may be rejected for reasons unrelated to the code
+in question. For example, a change may be too complex to maintain or duplicate
+existing functionality.
+
+Note that contributions are not limited to code alone. Bugs, documentation,
+experience reports or public advocacy are all valuable ways to contribute to a
+project and build trust in the community.
+
+## Maintainers
+
+Each project has one or more *maintainers*. Maintainers set technical direction,
+facilitate contributions and exercise overall stewardship.
+
+Maintainers have write access to the project repository. Maintainers review and
+approve changes. They can also assign issues and add additional reviewers.
+
+Note that some repositories may not allow direct commit access, which is
+reserved for administrators or automated processes. In this case, maintainers
+have approval rights, and a separate process exists for merging a change.
+
+Maintainers are responsible for upholding the code of conduct in interactions
+via project communication channels. If comments or exchanges are in violation,
+they may remove them at their discretion.
+
+### Repositories requiring synchronization
+
+For some projects initiated by Google, the infrastructure which synchronizes and
+merges internal and external changes requires that merges are performed by a
+Google employee. In such cases, Google will initiate a rotation to merge changes
+once they pass tests and are approved by a maintainer. This does not preclude
+non-Google contributors from becoming maintainers, in which case the maintainer
+holds approval rights and the merge is an automated process. In some cases,
+Google-internal tests may fail and have to be fixed: the Google employee will
+work with the submitter to achieve this.
+
+### Becoming a maintainer
+
+The list of maintainers is defined by the list of people with commit access or
+approval authority on a repository, typically via a Gerrit group or a GitHub
+team.
+
+Existing maintainers may elevate a contributor to maintainer status on evidence
+of previous contributions and established trust. This decision is based on lazy
+consensus from existing maintainers. While contributors may ask maintainers to
+make this decision, existing maintainers will also pro-actively identify
+contributors who have demonstrated a sustained track record of technical
+leadership and direct contributions.
+
+## Special Interest Groups (SIGs)
+
+From time-to-time, a SIG may be formed in order to solve larger, more complex
+problems across one or more projects. There are many avenues for collaboration
+outside a SIG, but a SIG can provide structure for collaboration on a single
+topic.
+
+Each group will be established by a charter, and governed by the Code of
+Conduct. Some resources may be provided to the group, such as mailing lists or
+meeting space, and archives will be public.
+
+## Security disclosure
+
+Projects may maintain security mailing lists for vulnerability reports and
+internal project audits may occasionally reveal security issues. Access to these
+lists and audits will be limited to project *maintainers*; individual
+maintainers should opt to participate in these lists based on need and
+expertise. Once maintainers become aware of a potential security issue, they
+will assess the scope and potential impact. If reported externally, maintainers
+will determine a reasonable embargo period with the reporter.
+
+During the embargo period, the maintainers will prioritize a fix for the
+security issue. They may choose to disclose the issue to additional trusted
+contributors in order to facilitate a fix, subjecting them to the embargo, or
+notify affected users in order to give them an advanced opportunity to mitigate
+the issue. The inclusion of specific users in this disclosure is left to the
+discretion of the maintainers and contributors involved, and depends on the
+scale of known project use and exposure.
+
+Once a fix is widely available or the embargo period ends, the maintainers will
+make technical details about the vulnerability and associated fixes available.
+
+## Mailing lists
+
+There are four key mailing lists that span projects.
+
+*   [gvisor-users](mailto:gvisor-users@googlegroups.com): general purpose user
+    list.
+*   [gvisor-dev](mailto:gvisor-dev@googlegroups.com): general purpose
+    development list.
+*   [gvisor-security](mailto:gvisor-security@googlegroups.com): private security
+    list. Access to this list is restricted to maintainers of the core gVisor
+    project, subject to the security disclosure policy described above.
+*   [gvisor-syzkaller](mailto:gvisor-syzkaller@googlegroups.com): private
+    syzkaller bug tracking list. Access to this list is not limited to
+    maintainers, but will be granted to those who can credibly contribute to
+    fixes.
diff --git a/Makefile b/Makefile
index a73bc0c36..85818ebea 100644
--- a/Makefile
+++ b/Makefile
@@ -1,47 +1,254 @@
-UID := $(shell id -u ${USER})
-GID := $(shell id -g ${USER})
-GVISOR_BAZEL_CACHE := $(shell readlink -f ~/.cache/bazel/)
+#!/usr/bin/make -f
 
-all: runsc
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-docker-build:
-	docker build -t gvisor-bazel .
+# Described below.
+OPTIONS :=
+STARTUP_OPTIONS :=
+TARGETS := //runsc
+ARGS    :=
 
-bazel-shutdown:
-	docker exec -i gvisor-bazel bazel shutdown && \
-	docker kill gvisor-bazel
+default: runsc
+.PHONY: default
 
-bazel-server-start: docker-build
-	mkdir -p "$(GVISOR_BAZEL_CACHE)" && \
-	docker run -d --rm --name gvisor-bazel \
-		--user 0:0 \
-		-v "$(GVISOR_BAZEL_CACHE):$(HOME)/.cache/bazel/" \
-		-v "$(CURDIR):$(CURDIR)" \
-		--workdir "$(CURDIR)" \
-		--tmpfs /tmp:rw,exec \
-		--privileged \
-		gvisor-bazel \
-		sh -c "while :; do sleep 100; done" && \
-	docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --non-unique --gid $(GID) -d $(HOME) gvisor"
+## usage: make <target>
+##         or
+##        make <build|test|copy|run|sudo> STARTUP_OPTIONS="..." OPTIONS="..." TARGETS="..." ARGS="..."
+##
+## Basic targets.
+##
+##   This Makefile wraps basic build and test targets for ease-of-use. Bazel
+##   is run inside a canonical Docker container in order to simplify up-front
+##   requirements.
+##
+##   There are common arguments that may be passed to targets. These are:
+##     STARTUP_OPTIONS - Bazel startup options.
+##     OPTIONS - Build or test options.
+##     TARGETS - The bazel targets.
+##     ARGS    - Arguments for run or sudo.
+##
+##   Additionally, the copy target expects a DESTINATION to be provided.
+##
+##   For example, to build runsc using this Makefile, you can run:
+##     make build OPTIONS="" TARGETS="//runsc"'
+##
+help: ## Shows all targets and help from the Makefile (this message).
+	@grep --no-filename -E '^([a-z.A-Z_-]+:.*?|)##' $(MAKEFILE_LIST) | \
+		awk 'BEGIN {FS = "(:.*?|)## ?"}; { \
+			if (length($$1) > 0) { \
+				printf "  \033[36m%-20s\033[0m %s\n", $$1, $$2; \
+			} else { \
+				printf "%s\n", $$2; \
+			} \
+		}'
+build: ## Builds the given $(TARGETS) with the given $(OPTIONS). E.g. make build TARGETS=runsc
+test:  ## Tests the given $(TARGETS) with the given $(OPTIONS). E.g. make test TARGETS=pkg/buffer:buffer_test
+copy:  ## Copies the given $(TARGETS) to the given $(DESTINATION). E.g. make copy TARGETS=runsc DESTINATION=/tmp
+run:   ## Runs the given $(TARGETS), built with $(OPTIONS), using $(ARGS). E.g. make run TARGETS=runsc ARGS=-version
+sudo:  ## Runs the given $(TARGETS) as per run, but using "sudo -E". E.g. make sudo TARGETS=test/root:root_test ARGS=-test.v
+.PHONY: help build test copy run sudo
 
-bazel-server:
-	docker exec gvisor-bazel true || \
-	$(MAKE) bazel-server-start
+# Load all bazel wrappers.
+#
+# This file should define the basic "build", "test", "run" and "sudo" rules, in
+# addition to the $(BRANCH_NAME) variable.
+ifneq (,$(wildcard tools/google.mk))
+include tools/google.mk
+else
+include tools/bazel.mk
+endif
 
-BAZEL_OPTIONS := build runsc
-bazel: bazel-server
-	docker exec -u $(UID):$(GID) -i gvisor-bazel bazel $(BAZEL_OPTIONS)
+##
+## Docker image targets.
+##
+##   Images used by the tests must also be built and available locally.
+##   The canonical test targets defined below will automatically load
+##   relevant images. These can be loaded or built manually via these
+##   targets.
+##
+##   (*) Note that you may provide an ARCH parameter in order to build
+##   and load images from an alternate archiecture (using qemu). When
+##   bazel is run as a server, this has the effect of running an full
+##   cross-architecture chain, and can produce cross-compiled binaries.
+##
+define images
+$(1)-%: ## Image tool: $(1) a given image (also may use 'all-images').
+	@$(MAKE) -C images $$@
+endef
+rebuild-...: ## Rebuild the given image. Also may use 'rebuild-all-images'.
+$(eval $(call images,rebuild))
+push-...: ## Push the given image. Also may use 'push-all-images'.
+$(eval $(call images,pull))
+pull-...: ## Pull the given image. Also may use 'pull-all-images'.
+$(eval $(call images,push))
+load-...: ## Load (pull or rebuild) the given image. Also may use 'load-all-images'.
+$(eval $(call images,load))
+list-images: ## List all available images.
+	@$(MAKE) -C images $$@
 
-bazel-alias:
-	@echo "alias bazel='docker exec -u $(UID):$(GID) -i gvisor-bazel bazel'"
+##
+## Canonical build and test targets.
+##
+##   These targets are used by continuous integration and provide
+##   convenient entrypoints for testing changes. If you're adding a
+##   new subsystem or workflow, consider adding a new target here.
+##
+runsc: ## Builds the runsc binary.
+	@$(MAKE) build TARGETS="//runsc"
+.PHONY: runsc
 
-runsc:
-	$(MAKE) BAZEL_OPTIONS="build runsc" bazel
+smoke-test: ## Runs a simple smoke test after build runsc.
+	@$(MAKE) run DOCKER_PRIVILEGED="" ARGS="--alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do true"
+.PHONY: smoke-tests
 
-tests:
-	$(MAKE) BAZEL_OPTIONS="test --test_tag_filters runsc_ptrace //test/syscalls/..." bazel
+unit-tests: ## Runs all unit tests in pkg runsc and tools.
+	@$(MAKE) test OPTIONS="pkg/... runsc/... tools/..."
+.PHONY: unit-tests
 
-unit-tests:
-	$(MAKE) BAZEL_OPTIONS="test //pkg/... //runsc/... //tools/..." bazel
+tests: ## Runs all local ptrace system call tests.
+	@$(MAKE) test OPTIONS="--test_tag_filters runsc_ptrace test/syscalls/..."
+.PHONY: tests
 
-.PHONY: docker-build bazel-shutdown bazel-server-start bazel-server bazel runsc tests
+##
+## Website & documentation helpers.
+##
+##   The website is built from repository documentation and wrappers, using
+##   using a locally-defined Docker image (see images/jekyll). The following
+##   variables may be set when using website-push:
+##     WEBSITE_IMAGE   - The name of the container image.
+##     WEBSITE_SERVICE - The backend service.
+##     WEBSITE_PROJECT - The project id to use.
+##     WEBSITE_REGION  - The region to deploy to.
+##
+WEBSITE_IMAGE   := gcr.io/gvisordev/gvisordev
+WEBSITE_SERVICE := gvisordev
+WEBSITE_PROJECT := gvisordev
+WEBSITE_REGION  := us-central1
+
+website-build: load-jekyll ## Build the site image locally.
+	@$(MAKE) run TARGETS="//website:website"
+.PHONY: website-build
+
+website-server: website-build ## Run a local server for development.
+	@docker run -i -p 8080:8080 gvisor.dev/images/website
+.PHONY: website-server
+
+website-push: website-build ## Push a new image and update the service.
+	@docker tag gvisor.dev/images/website $(WEBSITE_IMAGE) && docker push $(WEBSITE_IMAGE)
+.PHONY: website-push
+
+website-deploy: website-push ## Deploy a new version of the website.
+	@gcloud run deploy $(WEBSITE_SERVICE) --platform=managed --region=$(WEBSITE_REGION) --project=$(WEBSITE_PROJECT) --image=$(WEBSITE_IMAGE)
+.PHONY: website-push
+
+##
+## Repository builders.
+##
+##   This builds a local apt repository. The following variables may be set:
+##     RELEASE_ROOT    - The repository root (default: "repo" directory).
+##     RELEASE_KEY     - The repository GPG private key file (default: dummy key is created).
+##     RELEASE_NIGHTLY - Set to true if a nightly release (default: false).
+##     RELEASE_COMMIT  - The commit or Change-Id for the release (needed for tag).
+##     RELEASE_NAME    - The name of the release in the proper format (needed for tag).
+##     RELEASE_NOTES   - The file containing release notes (needed for tag).
+##
+RELEASE_ROOT    := $(CURDIR)/repo
+RELEASE_KEY     := repo.key
+RELEASE_NIGHTLY := false
+RELEASE_COMMIT  :=
+RELEASE_NAME    :=
+RELEASE_NOTES   :=
+
+GPG_TEST_OPTIONS := $(shell if gpg --pinentry-mode loopback --version >/dev/null 2>&1; then echo --pinentry-mode loopback; fi)
+$(RELEASE_KEY):
+	@echo "WARNING: Generating a key for testing ($@); don't use this."
+	T=$$(mktemp /tmp/keyring.XXXXXX); \
+	C=$$(mktemp /tmp/config.XXXXXX); \
+	echo Key-Type: DSA >> $$C && \
+	echo Key-Length: 1024 >> $$C && \
+	echo Name-Real: Test >> $$C && \
+	echo Name-Email: test@example.com >> $$C && \
+	echo Expire-Date: 0 >> $$C && \
+	echo %commit >> $$C && \
+	gpg --batch $(GPG_TEST_OPTIONS) --passphrase '' --no-default-keyring --keyring $$T --no-tty --gen-key $$C && \
+	gpg --batch $(GPG_TEST_OPTIONS) --export-secret-keys --no-default-keyring --keyring $$T --secret-keyring $$T > $@; \
+	rc=$$?; rm -f $$T $$C; exit $$rc
+
+release: $(RELEASE_KEY) ## Builds a release.
+	@mkdir -p $(RELEASE_ROOT)
+	@T=$$(mktemp -d /tmp/release.XXXXXX); \
+	  $(MAKE) copy TARGETS="runsc" DESTINATION=$$T && \
+	  $(MAKE) copy TARGETS="runsc:runsc-debian" DESTINATION=$$T && \
+	  NIGHTLY=$(RELEASE_NIGHTLY) tools/make_release.sh $(RELEASE_KEY) $(RELEASE_ROOT) $$T/*; \
+	rc=$$?; rm -rf $$T; exit $$rc
+.PHONY: release
+
+tag: ## Creates and pushes a release tag.
+	@tools/tag_release.sh "$(RELEASE_COMMIT)" "$(RELEASE_NAME)" "$(RELEASE_NOTES)"
+.PHONY: tag
+
+##
+## Development helpers and tooling.
+##
+##   These targets faciliate local development by automatically
+##   installing and configuring a runtime. Several variables may
+##   be used here to tweak the installation:
+##     RUNTIME         - The name of the installed runtime (default: branch).
+##     RUNTIME_DIR     - Where the runtime will be installed (default: temporary directory with the $RUNTIME).
+##     RUNTIME_BIN     - The runtime binary (default: $RUNTIME_DIR/runsc).
+##     RUNTIME_LOG_DIR - The logs directory (default: $RUNTIME_DIR/logs).
+##     RUNTIME_LOGS    - The log pattern (default: $RUNTIME_LOG_DIR/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND%).
+##
+ifeq (,$(BRANCH_NAME))
+RUNTIME     := runsc
+RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/runsc
+else
+RUNTIME     := $(BRANCH_NAME)
+RUNTIME_DIR := $(shell dirname $(shell mktemp -u))/$(BRANCH_NAME)
+endif
+RUNTIME_BIN     := $(RUNTIME_DIR)/runsc
+RUNTIME_LOG_DIR := $(RUNTIME_DIR)/logs
+RUNTIME_LOGS    := $(RUNTIME_LOG_DIR)/runsc.log.%TEST%.%TIMESTAMP%.%COMMAND%
+
+dev: ## Installs a set of local runtimes. Requires sudo.
+	@$(MAKE) refresh ARGS="--net-raw"
+	@$(MAKE) configure RUNTIME="$(RUNTIME)" ARGS="--net-raw"
+	@$(MAKE) configure RUNTIME="$(RUNTIME)-d" ARGS="--net-raw --debug --strace --log-packets"
+	@$(MAKE) configure RUNTIME="$(RUNTIME)-p" ARGS="--net-raw --profile"
+	@$(MAKE) configure RUNTIME="$(RUNTIME)-vfs2-d" ARGS="--net-raw --debug --strace --log-packets --vfs2"
+	@sudo systemctl restart docker
+.PHONY: dev
+
+refresh: ## Refreshes the runtime binary (for development only). Must have called 'dev' or 'test-install' first.
+	@mkdir -p "$(RUNTIME_DIR)"
+	@$(MAKE) copy TARGETS=runsc DESTINATION="$(RUNTIME_BIN)" && chmod 0755 "$(RUNTIME_BIN)"
+.PHONY: install
+
+test-install: ## Installs the runtime for testing. Requires sudo.
+	@$(MAKE) refresh ARGS="--net-raw --TESTONLY-test-name-env=RUNSC_TEST_NAME --debug --strace --log-packets $(ARGS)"
+	@$(MAKE) configure
+	@sudo systemctl restart docker
+.PHONY: install-test
+
+configure: ## Configures a single runtime. Requires sudo. Typically called from dev or test-install.
+	@sudo sudo "$(RUNTIME_BIN)" install --experimental=true --runtime="$(RUNTIME)" -- --debug-log "$(RUNTIME_LOGS)" $(ARGS)
+	@echo "Installed runtime \"$(RUNTIME)\" @ $(RUNTIME_BIN)"
+	@echo "Logs are in: $(RUNTIME_LOG_DIR)"
+	@sudo rm -rf "$(RUNTIME_LOG_DIR)" && mkdir -p "$(RUNTIME_LOG_DIR)"
+.PHONY: configure
+
+test-runtime: ## A convenient wrapper around test that provides the runtime argument. Target must still be provided.
+	@$(MAKE) test OPTIONS="$(OPTIONS) --test_arg=--runtime=$(RUNTIME)"
+.PHONY: runtime-test
diff --git a/README.md b/README.md
index de3e06f4e..d72d1dac4 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 ![gVisor](g3doc/logo.png)
 
-[![Status](https://storage.googleapis.com/gvisor-build-badges/build.svg)](https://storage.googleapis.com/gvisor-build-badges/build.html)
+![](https://github.com/google/gvisor/workflows/Build/badge.svg)
 [![gVisor chat](https://badges.gitter.im/gvisor/community.png)](https://gitter.im/gvisor/community)
 
 ## What is gVisor?
 
-**gVisor** is a user-space kernel, written in Go, that implements a substantial
-portion of the Linux system surface. It includes an
+**gVisor** is an application kernel, written in Go, that implements a
+substantial portion of the Linux system surface. It includes an
 [Open Container Initiative (OCI)][oci] runtime called `runsc` that provides an
 isolation boundary between the application and the host kernel. The `runsc`
 runtime integrates with Docker and Kubernetes, making it simple to run sandboxed
@@ -15,16 +15,17 @@ containers.
 ## Why does gVisor exist?
 
 Containers are not a [**sandbox**][sandbox]. While containers have
-revolutionized how we develop, package, and deploy applications, running
-untrusted or potentially malicious code without additional isolation is not a
-good idea. The efficiency and performance gains from using a single, shared
-kernel also mean that container escape is possible with a single vulnerability.
-
-gVisor is a user-space kernel for containers. It limits the host kernel surface
-accessible to the application while still giving the application access to all
-the features it expects. Unlike most kernels, gVisor does not assume or require
-a fixed set of physical resources; instead, it leverages existing host kernel
-functionality and runs as a normal user-space process. In other words, gVisor
+revolutionized how we develop, package, and deploy applications, using them to
+run untrusted or potentially malicious code without additional isolation is not
+a good idea. While using a single, shared kernel allows for efficiency and
+performance gains, it also means that container escape is possible with a single
+vulnerability.
+
+gVisor is an application kernel for containers. It limits the host kernel
+surface accessible to the application while still giving the application access
+to all the features it expects. Unlike most kernels, gVisor does not assume or
+require a fixed set of physical resources; instead, it leverages existing host
+kernel functionality and runs as a normal process. In other words, gVisor
 implements Linux by way of Linux.
 
 gVisor should not be confused with technologies and tools to harden containers
@@ -39,75 +40,44 @@ be found at [gvisor.dev][gvisor-dev].
 
 ## Installing from source
 
-gVisor currently requires x86\_64 Linux to build, though support for other
-architectures may become available in the future.
+gVisor builds on x86_64 and ARM64. Other architectures may become available in
+the future.
+
+For the purposes of these instructions, [bazel][bazel] and other build
+dependencies are wrapped in a build container. It is possible to use
+[bazel][bazel] directly, or type `make help` for standard targets.
 
 ### Requirements
 
 Make sure the following dependencies are installed:
 
 *   Linux 4.14.77+ ([older linux][old-linux])
-*   [git][git]
-*   [Bazel][bazel] 1.2+
-*   [Python][python]
 *   [Docker version 17.09.0 or greater][docker]
-*   C++ toolchain supporting C++17 (GCC 7+, Clang 5+)
-*   Gold linker (e.g. `binutils-gold` package on Ubuntu)
 
 ### Building
 
 Build and install the `runsc` binary:
 
 ```
-bazel build runsc
-sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin
-```
-
-If you don't want to install bazel on your system, you can build runsc in a
-Docker container:
-
-```
 make runsc
 sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin
 ```
 
 ### Testing
 
-The test suite can be run with Bazel:
-
-```
-bazel test //...
-```
-
-or in a Docker container:
+To run standard test suites, you can use:
 
 ```
 make unit-tests
 make tests
 ```
 
-### Using remote execution
-
-If you have a [Remote Build Execution][rbe] environment, you can use it to speed
-up build and test cycles.
-
-You must authenticate with the project first:
+To run specific tests, you can specify the target:
 
 ```
-gcloud auth application-default login --no-launch-browser
+make test TARGETS="//runsc:version_test"
 ```
 
-Then invoke bazel with the following flags:
-
-```
---config=remote
---project_id=$PROJECT
---remote_instance_name=projects/$PROJECT/instances/default_instance
-```
-
-You can also add those flags to your local ~/.bazelrc to avoid needing to
-specify them each time on the command line.
-
 ### Using `go get`
 
 This project uses [bazel][bazel] to build and manage dependencies. A synthetic
@@ -128,7 +98,7 @@ development on this branch is not supported. Development should occur on the
 
 ## Community & Governance
 
-The governance model is documented in our [community][community] repository.
+See [GOVERNANCE.md](GOVERNANCE.md) for project governance information.
 
 The [gvisor-users mailing list][gvisor-users-list] and
 [gvisor-dev mailing list][gvisor-dev-list] are good starting points for
@@ -145,12 +115,9 @@ See [Contributing.md](CONTRIBUTING.md).
 [bazel]: https://bazel.build
 [community]: https://gvisor.googlesource.com/community
 [docker]: https://www.docker.com
-[git]: https://git-scm.com
 [gvisor-users-list]: https://groups.google.com/forum/#!forum/gvisor-users
+[gvisor-dev]: https://gvisor.dev
 [gvisor-dev-list]: https://groups.google.com/forum/#!forum/gvisor-dev
 [oci]: https://www.opencontainers.org
 [old-linux]: https://gvisor.dev/docs/user_guide/networking/#gso
-[python]: https://python.org
-[rbe]: https://blog.bazel.build/2018/10/05/remote-build-execution.html
 [sandbox]: https://en.wikipedia.org/wiki/Sandbox_(computer_security)
-[gvisor-dev]: https://gvisor.dev
diff --git a/SECURITY.md b/SECURITY.md
index 154d68cb3..82cd0efb8 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -5,7 +5,7 @@ the [gvisor-security mailing list][gvisor-security-list]. You should receive a
 prompt response, typically within 48 hours.
 
 Policies for security list access, vulnerability embargo, and vulnerability
-disclosure are outlined in the [community][community] repository.
+disclosure are outlined in the [governance policy](GOVERNANCE.md).
 
 [community]: https://gvisor.googlesource.com/community
 [gvisor-security-list]: https://groups.google.com/forum/#!forum/gvisor-security
diff --git a/WORKSPACE b/WORKSPACE
index 2827c3a26..e1873e5c0 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,13 +1,35 @@
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
+# Bazel/starlark utilities.
+http_archive(
+    name = "bazel_skylib",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz",
+    ],
+    sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44",
+)
+
+load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
+
+bazel_skylib_workspace()
+
 # Load go bazel rules and gazelle.
+#
+# Note that this repository actually patches some other Go repositories as it
+# loads it, in order to limit visibility. We hack this process by patching the
+# patch used by the Go rules, turning the trick against itself.
 http_archive(
     name = "io_bazel_rules_go",
-    sha256 = "f99a9d76e972e0c8f935b2fe6d0d9d778f67c760c6d2400e23fc2e469016e2bd",
+    patch_args = ["-p1"],
+    patches = [
+        "//tools/nogo:io_bazel_rules_go-visibility.patch",
+    ],
+    sha256 = "db2b2d35293f405430f553bc7a865a8749a8ef60c30287e90d2b278c32771afe",
     urls = [
-        "https://storage.googleapis.com/bazel-mirror/github.com/bazelbuild/rules_go/releases/download/v0.21.2/rules_go-v0.21.2.tar.gz",
-        "https://github.com/bazelbuild/rules_go/releases/download/v0.21.2/rules_go-v0.21.2.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.22.3/rules_go-v0.22.3.tar.gz",
+        "https://github.com/bazelbuild/rules_go/releases/download/v0.22.3/rules_go-v0.22.3.tar.gz",
     ],
 )
 
@@ -20,19 +42,30 @@ http_archive(
     ],
 )
 
-load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_toolchains")
+load("@io_bazel_rules_go//go:deps.bzl", "go_register_toolchains", "go_rules_dependencies")
 
 go_rules_dependencies()
 
-go_register_toolchains(
-    go_version = "1.13.7",
-    nogo = "@//:nogo",
-)
+go_register_toolchains(go_version = "1.14.2")
 
 load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository")
 
 gazelle_dependencies()
 
+# TODO(gvisor.dev/issue/1876): Move the statement to "External repositories"
+# block below once 1876 is fixed.
+#
+# The com_google_protobuf repository below would trigger downloading a older
+# version of org_golang_x_sys. If putting this repository statment in a place
+# after that of the com_google_protobuf, this statement will not work as
+# expectd to download a new version of org_golang_x_sys.
+go_repository(
+    name = "org_golang_x_sys",
+    importpath = "golang.org/x/sys",
+    sum = "h1:uYVVQ9WP/Ds2ROhcaGPeIdVq0RIXVLwsHlnvJ+cT1So=",
+    version = "v0.0.0-20200302150141-5c8b2ff67527",
+)
+
 # Load C++ rules.
 http_archive(
     name = "rules_cc",
@@ -54,16 +87,19 @@ http_archive(
         "https://github.com/bazelbuild/rules_proto/archive/97d8af4dc474595af3900dd85cb3a29ad28cc313.tar.gz",
     ],
 )
+
 load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
+
 rules_proto_dependencies()
+
 rules_proto_toolchains()
 
 # Load python dependencies.
 git_repository(
     name = "rules_python",
-    commit = "94677401bc56ed5d756f50b441a6a5c7f735a6d4",
+    commit = "abc4869e02fe9b3866942e89f07b7341f830e805",
     remote = "https://github.com/bazelbuild/rules_python.git",
-    shallow_since = "1573842889 -0500",
+    shallow_since = "1583341286 -0500",
 )
 
 load("@rules_python//python:pip.bzl", "pip_import")
@@ -82,11 +118,11 @@ pip_install()
 # See releases at https://releases.bazel.build/bazel-toolchains.html
 http_archive(
     name = "bazel_toolchains",
-    sha256 = "a653c9d318e42b14c0ccd7ac50c4a2a276c0db1e39743ab88b5aa2f0bc9cf607",
-    strip_prefix = "bazel-toolchains-2.0.2",
+    sha256 = "239a1a673861eabf988e9804f45da3b94da28d1aff05c373b013193c315d9d9e",
+    strip_prefix = "bazel-toolchains-3.0.1",
     urls = [
-        "https://github.com/bazelbuild/bazel-toolchains/releases/download/2.0.2/bazel-toolchains-2.0.2.tar.gz",
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/2.0.2.tar.gz",
+        "https://github.com/bazelbuild/bazel-toolchains/releases/download/3.0.1/bazel-toolchains-3.0.1.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/releases/download/3.0.1/bazel-toolchains-3.0.1.tar.gz",
     ],
 )
 
@@ -105,44 +141,23 @@ load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
 
 rules_pkg_dependencies()
 
-# Container rules.
+# Load C++ grpc rules.
 http_archive(
-    name = "io_bazel_rules_docker",
-    sha256 = "14ac30773fdb393ddec90e158c9ec7ebb3f8a4fd533ec2abbfd8789ad81a284b",
-    strip_prefix = "rules_docker-0.12.1",
-    urls = ["https://github.com/bazelbuild/rules_docker/releases/download/v0.12.1/rules_docker-v0.12.1.tar.gz"],
-)
-
-load(
-    "@io_bazel_rules_docker//repositories:repositories.bzl",
-    container_repositories = "repositories",
+    name = "com_github_grpc_grpc",
+    sha256 = "2fcb7f1ab160d6fd3aaade64520be3e5446fc4c6fa7ba6581afdc4e26094bd81",
+    strip_prefix = "grpc-1.26.0",
+    urls = [
+        "https://github.com/grpc/grpc/archive/v1.26.0.tar.gz",
+    ],
 )
 
-container_repositories()
+load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
 
-load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
+grpc_deps()
 
-container_deps()
+load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
 
-load(
-    "@io_bazel_rules_docker//container:container.bzl",
-    "container_pull",
-)
-
-# This container is built from the Dockerfile in test/iptables/runner.
-container_pull(
-    name = "iptables-test",
-    registry = "gcr.io",
-    repository = "gvisor-presubmit/iptables-test",
-    digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65",
-)
-
-load(
-    "@io_bazel_rules_docker//go:image.bzl",
-    _go_image_repos = "repositories",
-)
-
-_go_image_repos()
+grpc_extra_deps()
 
 # External repositories, in sorted order.
 go_repository(
@@ -188,6 +203,13 @@ go_repository(
 )
 
 go_repository(
+    name = "com_github_kr_pretty",
+    importpath = "github.com/kr/pretty",
+    sum = "h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs=",
+    version = "v0.2.0",
+)
+
+go_repository(
     name = "com_github_kr_pty",
     importpath = "github.com/kr/pty",
     sum = "h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw=",
@@ -195,6 +217,19 @@ go_repository(
 )
 
 go_repository(
+    name = "com_github_kr_text",
+    importpath = "github.com/kr/text",
+    sum = "h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=",
+    version = "v0.1.0",
+)
+
+go_repository(
+    name = "com_github_mohae_deepcopy",
+    commit = "c48cc78d482608239f6c4c92a4abd87eb8761c90",
+    importpath = "github.com/mohae/deepcopy",
+)
+
+go_repository(
     name = "com_github_opencontainers_runtime-spec",
     importpath = "github.com/opencontainers/runtime-spec",
     sum = "h1:d9F+LNYwMyi3BDN4GzZdaSiq4otb8duVEWyZjeUtOQI=",
@@ -223,6 +258,21 @@ go_repository(
 )
 
 go_repository(
+    name = "org_golang_google_grpc",
+    build_file_proto_mode = "disable",
+    importpath = "google.golang.org/grpc",
+    sum = "h1:zvIju4sqAGvwKspUQOhwnpcqSbzi7/H6QomNNjTL4sk=",
+    version = "v1.27.1",
+)
+
+go_repository(
+    name = "in_gopkg_check_v1",
+    importpath = "gopkg.in/check.v1",
+    sum = "h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=",
+    version = "v1.0.0-20190902080502-41f04d3bba15",
+)
+
+go_repository(
     name = "org_golang_x_crypto",
     importpath = "golang.org/x/crypto",
     sum = "h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=",
@@ -230,23 +280,17 @@ go_repository(
 )
 
 go_repository(
-    name = "org_golang_x_net",
-    importpath = "golang.org/x/net",
-    sum = "h1:oWX7TPOiFAMXLq8o0ikBYfCJVlRHBcsciT5bXOrH628=",
-    version = "v0.0.0-20190311183353-d8887717615a",
-)
-
-go_repository(
-    name = "org_golang_x_text",
-    importpath = "golang.org/x/text",
-    sum = "h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=",
-    version = "v0.3.0",
+    name = "org_golang_x_mod",
+    importpath = "golang.org/x/mod",
+    sum = "h1:p1YOIz9H/mGN8k1XkaV5VFAq9+zhN9Obefv439UwRhI=",
+    version = "v0.2.1-0.20200224194123-e5e73c1b9c72",
 )
 
 go_repository(
-    name = "org_golang_x_tools",
-    commit = "36563e24a262",
-    importpath = "golang.org/x/tools",
+    name = "org_golang_x_net",
+    importpath = "golang.org/x/net",
+    sum = "h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=",
+    version = "v0.0.0-20190620200207-3b0461eec859",
 )
 
 go_repository(
@@ -257,22 +301,31 @@ go_repository(
 )
 
 go_repository(
-    name = "org_golang_x_sys",
-    importpath = "golang.org/x/sys",
-    sum = "h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=",
-    version = "v0.0.0-20190215142949-d0b11bdaac8a",
+    name = "org_golang_x_text",
+    importpath = "golang.org/x/text",
+    sum = "h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=",
+    version = "v0.3.0",
 )
 
 go_repository(
     name = "org_golang_x_time",
-    commit = "c4c64cad1fd0a1a8dab2523e04e61d35308e131e",
     importpath = "golang.org/x/time",
+    sum = "h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=",
+    version = "v0.0.0-20191024005414-555d28b269f0",
 )
 
 go_repository(
     name = "org_golang_x_tools",
-    commit = "aa82965741a9fecd12b026fbb3d3c6ed3231b8f8",
     importpath = "golang.org/x/tools",
+    sum = "h1:Uglradbb4KfUWaYasZhlsDsGRwHHvRsHoNAEONef0W8=",
+    version = "v0.0.0-20200131233409-575de47986ce",
+)
+
+go_repository(
+    name = "org_golang_x_xerrors",
+    importpath = "golang.org/x/xerrors",
+    sum = "h1:9zdDQZ7Thm29KFXgAX/+yaf3eVbP7djjWp/dXAppNCc=",
+    version = "v0.0.0-20190717185122-a985d3407aa7",
 )
 
 go_repository(
@@ -310,6 +363,63 @@ go_repository(
     version = "v1.0.0",
 )
 
+go_repository(
+    name = "com_google_cloud_go_bigquery",
+    importpath = "cloud.google.com/go/bigquery",
+    sum = "h1:K2NyuHRuv15ku6eUpe0DQk5ZykPMnSOnvuVf6IHcjaE=",
+    version = "v1.5.0",
+)
+
+go_repository(
+    name = "org_golang_google_api",
+    importpath = "google.golang.org/api",
+    sum = "h1:jz2KixHX7EcCPiQrySzPdnYT7DbINAypCqKZ1Z7GM40=",
+    version = "v0.20.0",
+)
+
+go_repository(
+    name = "org_uber_go_atomic",
+    importpath = "go.uber.org/atomic",
+    sum = "h1:Ezj3JGmsOnG1MoRWQkPBsKLe9DwWD9QeXzTRzzldNVk=",
+    version = "v1.6.0",
+)
+
+go_repository(
+    name = "org_uber_go_multierr",
+    importpath = "go.uber.org/multierr",
+    sum = "h1:KCa4XfM8CWFCpxXRGok+Q0SS/0XBhMDbHHGABQLvD2A=",
+    version = "v1.5.0",
+)
+
+# BigQuery Dependencies for Benchmarks
+go_repository(
+    name = "com_google_cloud_go",
+    importpath = "cloud.google.com/go",
+    sum = "h1:eoz/lYxKSL4CNAiaUJ0ZfD1J3bfMYbU5B3rwM1C1EIU=",
+    version = "v0.55.0",
+)
+
+go_repository(
+    name = "com_github_googleapis_gax_go_v2",
+    importpath = "github.com/googleapis/gax-go/v2",
+    sum = "h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM=",
+    version = "v2.0.5",
+)
+
+go_repository(
+    name = "io_opencensus_go",
+    importpath = "go.opencensus.io",
+    sum = "h1:8sGtKOrtQqkN1bp2AtX+misvLIlOmsEsNd+9NIcPEm8=",
+    version = "v0.22.3",
+)
+
+go_repository(
+    name = "com_github_golang_groupcache",
+    importpath = "github.com/golang/groupcache",
+    sum = "h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=",
+    version = "v0.0.0-20200121045136-8c9f03a8e57e",
+)
+
 # System Call test dependencies.
 http_archive(
     name = "com_google_absl",
@@ -330,3 +440,13 @@ http_archive(
         "https://github.com/google/googletest/archive/565f1b848215b77c3732bca345fe76a0431d8b34.tar.gz",
     ],
 )
+
+http_archive(
+    name = "com_google_benchmark",
+    sha256 = "3c6a165b6ecc948967a1ead710d4a181d7b0fbcaa183ef7ea84604994966221a",
+    strip_prefix = "benchmark-1.5.0",
+    urls = [
+        "https://mirror.bazel.build/github.com/google/benchmark/archive/v1.5.0.tar.gz",
+        "https://github.com/google/benchmark/archive/v1.5.0.tar.gz",
+    ],
+)
diff --git a/benchmarks/BUILD b/benchmarks/BUILD
index 43614cf5d..389351210 100644
--- a/benchmarks/BUILD
+++ b/benchmarks/BUILD
@@ -1,12 +1,23 @@
 package(licenses = ["notice"])
 
+config_setting(
+    name = "gcloud_rule",
+    values = {
+        "define": "gcloud=off",
+    },
+)
+
 py_binary(
     name = "benchmarks",
+    testonly = 1,
     srcs = ["run.py"],
-    data = [
-        "//tools/images:ubuntu1604",
-        "//tools/images:zone",
-    ],
+    data = select({
+        ":gcloud_rule": [],
+        "//conditions:default": [
+            "//tools/vm:ubuntu1604",
+            "//tools/vm:zone",
+        ],
+    }),
     main = "run.py",
     python_version = "PY3",
     srcs_version = "PY3",
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 975321c99..814bcb220 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -10,7 +10,7 @@ The scripts assume the following:
     (controller) and one or more machines on which docker containers will be run
     (environment).
 *   The controller machine must have bazel installed along with this source
-    code. You should be able to run a command like `bazel run :benchmarks --
+    code. You should be able to run a command like `bazel run //benchmarks --
     --list`
 *   Environment machines must have docker and the required runtimes installed.
     More specifically, you should be able to run a command like: `docker run
@@ -28,10 +28,12 @@ For configuring the environment manually, consult the
 
 ### Locally
 
-Run the following from the benchmarks directory:
+The tool is built to, by default, use Google Cloud Platform to run benchmarks,
+but it does support GCP workflows. To run locally, run the following from the
+benchmarks directory:
 
 ```bash
-bazel run :benchmarks -- run-local startup
+bazel run --define gcloud=off //benchmarks -- run-local startup
 
 ...
 method,metric,result
@@ -46,17 +48,20 @@ runtime, runc. Running on another installed runtime, like say runsc, is as
 simple as:
 
 ```bash
-bazel run :benchmarks -- run-local startup --runtime=runsc
+bazel run  --define gcloud=off //benchmarks -- run-local startup --runtime=runsc
 ```
 
-There is help: ``bash bash bazel run :benchmarks -- --help bazel
-run :benchmarks -- run-local --help` ``
+There is help:
+
+```bash
+bazel run --define gcloud=off //benchmarks -- --help
+bazel run --define gcloud=off //benchmarks -- run-local --help
+```
 
 To list available benchmarks, use the `list` commmand:
 
 ```bash
-bazel run :benchmarks -- list
-ls
+bazel --define gcloud=off  run //benchmarks -- list
 
 ...
 Benchmark: sysbench.cpu
@@ -69,7 +74,7 @@ Metrics: events_per_second
 You can choose benchmarks by name or regex like:
 
 ```bash
-bazel run :benchmarks -- run-local startup.node
+bazel run --define gcloud=off //benchmarks -- run-local startup.node
 ...
 metric,result
 startup_time_ms,1671.7178000000001
@@ -79,7 +84,7 @@ startup_time_ms,1671.7178000000001
 or
 
 ```bash
-bazel run :benchmarks -- run-local s
+bazel run --define gcloud=off //benchmarks -- run-local s
 ...
 method,metric,result
 startup.empty,startup_time_ms,1792.8292
@@ -97,13 +102,13 @@ You can run parameterized benchmarks, for example to run with different
 runtimes:
 
 ```bash
-bazel run :benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu
+bazel run --define gcloud=off //benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu
 ```
 
 Or with different parameters:
 
 ```bash
-bazel run :benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
+bazel run --define gcloud=off //benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
 ```
 
 ### On Google Compute Engine (GCE)
@@ -116,7 +121,7 @@ runtime is installed from the workspace. See the files in `tools/installers` for
 supported install targets.
 
 ```bash
-bazel run :benchmarks -- run-gcp --installers=head --runtime=runsc sysbench.cpu
+bazel run //benchmarks -- run-gcp --installers=head --runtime=runsc sysbench.cpu
 ```
 
 When running on GCE, the scripts generate a per run SSH key, which is added to
diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
index 3d32d3dda..5bdc4aa85 100644
--- a/benchmarks/harness/machine.py
+++ b/benchmarks/harness/machine.py
@@ -43,6 +43,8 @@ from benchmarks.harness import machine_mocks
 from benchmarks.harness import ssh_connection
 from benchmarks.harness import tunnel_dispatcher
 
+log = logging.getLogger(__name__)
+
 
 class Machine(object):
   """The machine object is the primary object for benchmarks.
@@ -236,9 +238,10 @@ class RemoteMachine(Machine):
           archive=archive, dir=harness.REMOTE_INSTALLERS_PATH))
       self._has_installers = True
 
-      # Execute the remote installer.
-      self.run("sudo {dir}/{file}".format(
-          dir=harness.REMOTE_INSTALLERS_PATH, file=installer))
+    # Execute the remote installer.
+    self.run("sudo {dir}/{file}".format(
+        dir=harness.REMOTE_INSTALLERS_PATH, file=installer))
+
     if results:
       results[index] = True
 
diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py
index 513d16e4f..44d72f575 100644
--- a/benchmarks/harness/machine_producers/gcloud_producer.py
+++ b/benchmarks/harness/machine_producers/gcloud_producer.py
@@ -53,6 +53,8 @@ class GCloudProducer(machine_producer.MachineProducer):
     ssh_key_file: path to a valid ssh private key. See README on vaild ssh keys.
     ssh_user: string of user name for ssh_key
     ssh_password: string of password for ssh key
+    internal: if true, use internal IPs of instances. Used if bm-tools is
+    running on a GCP vm when a firewall is set for external IPs.
     mock: a mock printer which will print mock data if required. Mock data is
       recorded output from subprocess calls (returncode, stdout, args).
     condition: mutex for this class around machine creation and deleteion.
@@ -66,6 +68,7 @@ class GCloudProducer(machine_producer.MachineProducer):
                ssh_key_file: str,
                ssh_user: str,
                ssh_password: str,
+               internal: bool,
                mock: gcloud_mock_recorder.MockPrinter = None):
     self.image = image
     self.zone = zone
@@ -74,6 +77,7 @@ class GCloudProducer(machine_producer.MachineProducer):
     self.ssh_key_file = ssh_key_file
     self.ssh_user = ssh_user
     self.ssh_password = ssh_password
+    self.internal = internal
     self.mock = mock
     self.condition = threading.Condition()
 
@@ -129,15 +133,13 @@ class GCloudProducer(machine_producer.MachineProducer):
     machines = []
     for instance in instances:
       name = instance["name"]
+      external = instance["networkInterfaces"][0]["accessConfigs"][0]["natIP"]
+      internal = instance["networkInterfaces"][0]["networkIP"]
       kwargs = {
-          "hostname":
-              instance["networkInterfaces"][0]["accessConfigs"][0]["natIP"],
-          "key_path":
-              self.ssh_key_file,
-          "username":
-              self.ssh_user,
-          "key_password":
-              self.ssh_password
+          "hostname": internal if self.internal else external,
+          "key_path": self.ssh_key_file,
+          "username": self.ssh_user,
+          "key_password": self.ssh_password
       }
       machines.append(machine.RemoteMachine(name=name, **kwargs))
     return machines
@@ -168,7 +170,9 @@ class GCloudProducer(machine_producer.MachineProducer):
     cmd.append("--zone=" + self.zone)
     cmd.append("--machine-type=" + self.machine_type)
     res = self._run_command(cmd)
-    return json.loads(res.stdout)
+    data = res.stdout
+    data = str(data, "utf-8") if isinstance(data, (bytes, bytearray)) else data
+    return json.loads(data)
 
   def _add_ssh_key_to_instances(self, names: List[str]) -> None:
     """Adds ssh key to instances by calling gcloud ssh command.
@@ -186,11 +190,13 @@ class GCloudProducer(machine_producer.MachineProducer):
       TimeoutError: when 3 unsuccessful tries to ssh into the host return 255.
     """
     for name in names:
-      cmd = "gcloud compute ssh {name}".format(name=name).split(" ")
+      cmd = "gcloud compute ssh {user}@{name}".format(
+          user=self.ssh_user, name=name).split(" ")
+      if self.internal:
+        cmd.append("--internal-ip")
       cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_file))
       cmd.append("--zone={zone}".format(zone=self.zone))
       cmd.append("--command=uname")
-      cmd.append("--ssh-key-expire-after=60m")
       timeout = datetime.timedelta(seconds=5 * 60)
       start = datetime.datetime.now()
       while datetime.datetime.now() <= timeout + start:
diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py
index a50e34293..b8c8e42d4 100644
--- a/benchmarks/harness/ssh_connection.py
+++ b/benchmarks/harness/ssh_connection.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """SSHConnection handles the details of SSH connections."""
 
-
+import logging
 import os
 import warnings
 
@@ -24,6 +24,8 @@ from benchmarks import harness
 # Get rid of paramiko Cryptography Warnings.
 warnings.filterwarnings(action="ignore", module=".*paramiko.*")
 
+log = logging.getLogger(__name__)
+
 
 def send_one_file(client: paramiko.SSHClient, path: str,
                   remote_dir: str) -> str:
@@ -94,10 +96,13 @@ class SSHConnection:
       The contents of stdout and stderr.
     """
     with self._client() as client:
+      log.info("running command: %s", cmd)
       _, stdout, stderr = client.exec_command(command=cmd)
-      stdout.channel.recv_exit_status()
+      log.info("returned status: %d", stdout.channel.recv_exit_status())
       stdout = stdout.read().decode("utf-8")
       stderr = stderr.read().decode("utf-8")
+      log.info("stdout: %s", stdout)
+      log.info("stderr: %s", stderr)
     return stdout, stderr
 
   def send_workload(self, name: str) -> str:
diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
index ba27dc69f..fc59cf505 100644
--- a/benchmarks/runner/__init__.py
+++ b/benchmarks/runner/__init__.py
@@ -19,6 +19,7 @@ import logging
 import pkgutil
 import pydoc
 import re
+import subprocess
 import sys
 import types
 from typing import List
@@ -120,14 +121,13 @@ def run_mock(ctx, **kwargs):
 
 @runner.command("run-gcp", commands.GCPCommand)
 @click.pass_context
-def run_gcp(ctx, image_file: str, zone_file: str, machine_type: str,
-            installers: List[str], **kwargs):
+def run_gcp(ctx, image_file: str, zone_file: str, internal: bool,
+            machine_type: str, installers: List[str], **kwargs):
   """Runs all benchmarks on GCP instances."""
 
   # Resolve all files.
-  image = open(image_file).read().rstrip()
-  zone = open(zone_file).read().rstrip()
-
+  image = subprocess.check_output([image_file]).rstrip()
+  zone = subprocess.check_output([zone_file]).rstrip()
   key_file = harness.make_key()
 
   producer = gcloud_producer.GCloudProducer(
@@ -137,7 +137,8 @@ def run_gcp(ctx, image_file: str, zone_file: str, machine_type: str,
       installers,
       ssh_key_file=key_file,
       ssh_user=harness.DEFAULT_USER,
-      ssh_password="")
+      ssh_password="",
+      internal=internal)
 
   try:
     run(ctx, producer, **kwargs)
diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py
index 0fccb2fad..9a391eb01 100644
--- a/benchmarks/runner/commands.py
+++ b/benchmarks/runner/commands.py
@@ -101,15 +101,20 @@ class GCPCommand(RunCommand):
 
     image_file = click.core.Option(
         ("--image_file",),
-        help="The file containing the image for VMs.",
+        help="The binary that emits the GCP image.",
         default=os.path.join(
-            os.path.dirname(__file__), "../../tools/images/ubuntu1604.txt"),
+            os.path.dirname(__file__), "../../tools/vm/ubuntu1604"),
     )
     zone_file = click.core.Option(
         ("--zone_file",),
-        help="The file containing the GCP zone.",
-        default=os.path.join(
-            os.path.dirname(__file__), "../../tools/images/zone.txt"),
+        help="The binary that emits the GCP zone.",
+        default=os.path.join(os.path.dirname(__file__), "../../tools/vm/zone"),
+    )
+    internal = click.core.Option(
+        ("--internal/--no-internal",),
+        help="""Use instance internal IPs. Used if bm-tools runner is running on
+        GCP instance with firewall rules blocking external IPs.""",
+        default=False,
     )
     installers = click.core.Option(
         ("--installers",),
@@ -124,6 +129,7 @@ class GCPCommand(RunCommand):
     self.params.extend([
         image_file,
         zone_file,
+        internal,
         machine_type,
         installers,
     ])
diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD
index d5e401acc..6dde7d9e6 100644
--- a/benchmarks/tcp/BUILD
+++ b/benchmarks/tcp/BUILD
@@ -10,6 +10,7 @@ go_binary(
         "//pkg/tcpip",
         "//pkg/tcpip/adapters/gonet",
         "//pkg/tcpip/link/fdbased",
+        "//pkg/tcpip/link/qdisc/fifo",
         "//pkg/tcpip/network/arp",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/stack",
diff --git a/benchmarks/tcp/tcp_benchmark.sh b/benchmarks/tcp/tcp_benchmark.sh
index e65801a7b..ef04b4ace 100755
--- a/benchmarks/tcp/tcp_benchmark.sh
+++ b/benchmarks/tcp/tcp_benchmark.sh
@@ -94,6 +94,9 @@ while [ $# -gt 0 ]; do
     --cubic)
       netstack_opts="${netstack_opts} -cubic"
       ;;
+    --moderate-recv-buf)
+      netstack_opts="${netstack_opts} -moderate_recv_buf"
+      ;;
     --duration)
       shift
       [ "$#" -le 0 ] && echo "no duration provided" && exit 1
@@ -147,8 +150,9 @@ while [ $# -gt 0 ]; do
       echo " --client              use netstack as the client"
       echo " --ideal               reset all network emulation"
       echo " --server              use netstack as the server"
-      echo " --mtu                 set the mtu (bytes)"
+      echo " --mtu                 set the mtu (bytes)"      
       echo " --sack                enable SACK support"
+      echo " --moderate-recv-buf   enable TCP receive buffer auto-tuning"
       echo " --cubic               enable CUBIC congestion control for Netstack"
       echo " --duration            set the test duration (s)"
       echo " --latency             set the latency (ms)"
diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index 73b7c4f5b..f5aa0b515 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -36,6 +36,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
 	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+	"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -55,6 +56,7 @@ var (
 	mask               = flag.Int("mask", 8, "mask size for address")
 	iface              = flag.String("iface", "", "network interface name to bind for netstack")
 	sack               = flag.Bool("sack", false, "enable SACK support for netstack")
+	moderateRecvBuf    = flag.Bool("moderate_recv_buf", false, "enable TCP Receive Buffer Auto-tuning")
 	cubic              = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack")
 	gso                = flag.Int("gso", 0, "GSO maximum size")
 	swgso              = flag.Bool("swgso", false, "software-level GSO")
@@ -203,7 +205,7 @@ func newNetstackImpl(mode string) (impl, error) {
 	if err != nil {
 		return nil, fmt.Errorf("failed to create FD endpoint: %v", err)
 	}
-	if err := s.CreateNIC(nicID, ep); err != nil {
+	if err := s.CreateNIC(nicID, fifo.New(ep, runtime.GOMAXPROCS(0), 1000)); err != nil {
 		return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err)
 	}
 	if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
@@ -230,6 +232,11 @@ func newNetstackImpl(mode string) (impl, error) {
 		return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %v", err)
 	}
 
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(*moderateRecvBuf)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	}
+
 	// Set Congestion Control to cubic if requested.
 	if *cubic {
 		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil {
diff --git a/benchmarks/workloads/absl/Dockerfile b/benchmarks/workloads/absl/Dockerfile
index e935c5ddc..f29cfa156 100644
--- a/benchmarks/workloads/absl/Dockerfile
+++ b/benchmarks/workloads/absl/Dockerfile
@@ -16,9 +16,10 @@ RUN wget https://github.com/bazelbuild/bazel/releases/download/0.27.0/bazel-0.27
 RUN chmod +x bazel-0.27.0-installer-linux-x86_64.sh
 RUN ./bazel-0.27.0-installer-linux-x86_64.sh
 
-RUN git clone https://github.com/abseil/abseil-cpp.git
+RUN mkdir abseil-cpp && cd abseil-cpp \
+    && git init && git remote add origin https://github.com/abseil/abseil-cpp.git \
+    && git fetch --depth 1 origin 43ef2148c0936ebf7cb4be6b19927a9d9d145b8f && git checkout FETCH_HEAD
 WORKDIR abseil-cpp
-RUN git checkout 43ef2148c0936ebf7cb4be6b19927a9d9d145b8f
 RUN bazel clean
 ENV path "absl/base/..."
 CMD bazel build ${path} 2>&1
diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock
index b44817bd3..ea9f0ea85 100644
--- a/benchmarks/workloads/ruby/Gemfile.lock
+++ b/benchmarks/workloads/ruby/Gemfile.lock
@@ -1,28 +1,41 @@
 GEM
   remote: https://rubygems.org/
   specs:
+    activemerchant (1.105.0)
+      activesupport (>= 4.2)
+      builder (>= 2.1.2, < 4.0.0)
+      i18n (>= 0.6.9)
+      nokogiri (~> 1.4)
     activesupport (5.2.3)
       concurrent-ruby (~> 1.0, >= 1.0.2)
       i18n (>= 0.7, < 2)
       minitest (~> 5.1)
       tzinfo (~> 1.1)
+    bcrypt (3.1.13)
+    builder (3.2.4)
     cassandra-driver (3.2.3)
       ione (~> 1.2)
     concurrent-ruby (1.1.5)
+    ffi (1.12.2)
     i18n (1.6.0)
       concurrent-ruby (~> 1.0)
     ione (1.2.4)
+    mini_portile2 (2.4.0)
     minitest (5.11.3)
     mustermann (1.0.3)
+    nokogiri (1.10.8)
+      mini_portile2 (~> 2.4.0)
     pdf-core (0.7.0)
     prawn (2.2.2)
       pdf-core (~> 0.7.0)
       ttfunk (~> 1.5)
-    puma (3.12.1)
-    rack (2.0.7)
+    puma (3.12.4)
+    rack (2.2.2)
     rack-protection (2.0.5)
       rack
-    rake (12.3.2)
+    rake (12.3.3)
+    rbnacl (7.1.1)
+      ffi
     redis (4.1.1)
     ruby-fann (1.2.6)
     sinatra (2.0.5)
@@ -43,9 +56,12 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  activemerchant
+  bcrypt
   cassandra-driver
   puma
   rake
+  rbnacl
   redis
   ruby-fann
   sinatra
diff --git a/benchmarks/workloads/ruby_template/Gemfile.lock b/benchmarks/workloads/ruby_template/Gemfile.lock
index dd8d56fb7..f637b6081 100644
--- a/benchmarks/workloads/ruby_template/Gemfile.lock
+++ b/benchmarks/workloads/ruby_template/Gemfile.lock
@@ -2,25 +2,25 @@ GEM
   remote: https://rubygems.org/
   specs:
     mustermann (1.0.3)
-    puma (3.12.0)
+    puma (3.12.4)
     rack (2.0.6)
     rack-protection (2.0.5)
       rack
+    redis (4.1.0)
     sinatra (2.0.5)
       mustermann (~> 1.0)
       rack (~> 2.0)
       rack-protection (= 2.0.5)
       tilt (~> 2.0)
     tilt (2.0.9)
-    redis (4.1.0)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
   puma
-  sinatra
   redis
+  sinatra
 
 BUNDLED WITH
    1.17.1
 \ No newline at end of file
diff --git a/benchmarks/workloads/tensorflow/Dockerfile b/benchmarks/workloads/tensorflow/Dockerfile
index 262643b98..b5763e8ae 100644
--- a/benchmarks/workloads/tensorflow/Dockerfile
+++ b/benchmarks/workloads/tensorflow/Dockerfile
@@ -2,7 +2,7 @@ FROM tensorflow/tensorflow:1.13.2
 
 RUN apt-get update \
     && apt-get install -y git
-RUN git clone https://github.com/aymericdamien/TensorFlow-Examples.git
+RUN git clone --depth 1 https://github.com/aymericdamien/TensorFlow-Examples.git
 RUN python -m pip install -U pip setuptools
 RUN python -m pip install matplotlib
 
diff --git a/g3doc/BUILD b/g3doc/BUILD
new file mode 100644
index 000000000..dbbf96204
--- /dev/null
+++ b/g3doc/BUILD
@@ -0,0 +1,35 @@
+load("//website:defs.bzl", "doc")
+
+package(
+    default_visibility = ["//website:__pkg__"],
+    licenses = ["notice"],
+)
+
+doc(
+    name = "index",
+    src = "README.md",
+    category = "Project",
+    data = glob([
+        "*.png",
+        "*.svg",
+    ]),
+    permalink = "/docs/",
+    weight = "0",
+)
+
+doc(
+    name = "roadmap",
+    src = "roadmap.md",
+    category = "Project",
+    permalink = "/roadmap/",
+    weight = "10",
+)
+
+doc(
+    name = "community",
+    src = "community.md",
+    category = "Project",
+    permalink = "/community/",
+    subcategory = "Community",
+    weight = "95",
+)
diff --git a/g3doc/Layers.png b/g3doc/Layers.png
new file mode 100644
index 000000000..308c6c451
--- /dev/null
+++ b/g3doc/Layers.png
diff --git a/g3doc/Layers.svg b/g3doc/Layers.svg
new file mode 100644
index 000000000..0a366f841
--- /dev/null
+++ b/g3doc/Layers.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 371.8346456692913 255.01574803149606" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l371.83466 0l0 255.01575l-371.83466 0l0 -255.01575z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l371.83466 0l0 255.01575l-371.83466 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m36.454067 6.6430445l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path stroke="#cc4125" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m36.454067 6.6430445l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m78.206116 37.98824l5.125 -13.359373l1.90625 0l5.46875 13.359373l-2.015625 0l-1.546875 -4.046875l-5.59375 0l-1.46875 4.046875l-1.875 0zm3.859375 -5.484375l4.53125 0l-1.40625 -3.703123q-0.625 -1.6875 -0.9375 -2.765625q-0.265625 1.28125 -0.71875 2.546875l-1.46875 3.921873zm9.849823 9.1875l0 -13.374998l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546873q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.8437481 -0.765625 -2.765623q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.843748zm8.891342 8.484375l0 -13.374998l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546873q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.8437481 -0.765625 -2.765623q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.843748zm8.844467 4.78125l0 -13.359373l1.640625 0l0 13.359373l-1.640625 0zm4.191696 -11.468748l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.468748l0 -9.671873l1.640625 0l0 9.671873l-1.640625 0zm10.457321 -3.546875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.5781231 0.515625 -2.749998q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.812498q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm9.328125 2.359375q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5624981q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.187498q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm7.781967 3.390625l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578123l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671873q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 -9.999998l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.468748l0 -9.671873l1.640625 0l0 9.671873l-1.640625 0zm3.5354462 -4.84375q0 -2.687498 1.484375 -3.968748q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609373q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.7968731 -0.8125 -2.718748q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765623zm9.297592 4.84375l0 -9.671873l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.5937481l0 5.953125l-1.640625 0l0 -5.890625q0 -0.9999981 -0.203125 -1.4843731q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515623l0 5.28125l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m4.454068 73.068245l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m4.454068 73.068245l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m172.45407 73.068245l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m172.45407 73.068245l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m73.43044 56.702377l100.88189 0l0 32.06299l-100.88189 0z" fill-rule="evenodd"/><path fill="#000000" d="m87.06437 74.471375l1.203125 -0.109375q0.078125 0.71875 0.390625 1.1875q0.3125 0.453125 0.953125 0.734375q0.65625 0.28125 1.46875 0.28125q0.71875 0 1.265625 -0.21875q0.5625 -0.21875 0.828125 -0.578125q0.265625 -0.375 0.265625 -0.828125q0 -0.453125 -0.265625 -0.78125q-0.25 -0.328125 -0.84375 -0.5625q-0.390625 -0.15625 -1.703125 -0.46875q-1.3125 -0.3125 -1.84375 -0.59375q-0.671875 -0.359375 -1.015625 -0.890625q-0.328125 -0.53125 -0.328125 -1.1875q0 -0.71875 0.40625 -1.34375q0.40625 -0.625 1.1875 -0.953125q0.796875 -0.328125 1.765625 -0.328125q1.046875 0 1.859375 0.34375q0.8125 0.34375 1.25 1.015625q0.4375 0.65625 0.46875 1.484375l-1.203125 0.09375q-0.109375 -0.90625 -0.671875 -1.359375q-0.5625 -0.46875 -1.65625 -0.46875q-1.140625 0 -1.671875 0.421875q-0.515625 0.421875 -0.515625 1.015625q0 0.515625 0.359375 0.84375q0.375 0.328125 1.90625 0.6875q1.546875 0.34375 2.109375 0.59375q0.84375 0.390625 1.234375 0.984375q0.390625 0.578125 0.390625 1.359375q0 0.75 -0.4375 1.4375q-0.421875 0.671875 -1.25 1.046875q-0.8125 0.359375 -1.828125 0.359375q-1.296875 0 -2.171875 -0.375q-0.875 -0.375 -1.375 -1.125q-0.5 -0.765625 -0.53125 -1.71875zm9.12413 5.71875l-0.125 -1.09375q0.375 0.109375 0.65625 0.109375q0.390625 0 0.625 -0.140625q0.234375 -0.125 0.390625 -0.359375q0.109375 -0.171875 0.359375 -0.875q0.03125 -0.09375 0.109375 -0.28125l-2.625 -6.921875l1.265625 0l1.4375 4.0q0.28125 0.765625 0.5 1.59375q0.203125 -0.796875 0.46875 -1.578125l1.484375 -4.015625l1.171875 0l-2.625 7.015625q-0.421875 1.140625 -0.65625 1.578125q-0.3125 0.578125 -0.71875 0.84375q-0.40625 0.28125 -0.96875 0.28125q-0.328125 0 -0.75 -0.15625zm6.2421875 -4.71875l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625zm9.6953125 1.015625l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.8748627 -1.171875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.0312424 0 1.5781174 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.7031174 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm19.321053 -2.53125l1.15625 0.15625q-0.1875 1.1875 -0.96875 1.859375q-0.78125 0.671875 -1.921875 0.671875q-1.4062653 0 -2.2812653 -0.921875q-0.859375 -0.9375 -0.859375 -2.65625q0 -1.125 0.375 -1.96875q0.375 -0.84375 1.125 -1.25q0.765625 -0.421875 1.6562653 -0.421875q1.125 0 1.84375 0.578125q0.71875 0.5625 0.921875 1.609375l-1.140625 0.171875q-0.171875 -0.703125 -0.59375 -1.046875q-0.40625 -0.359375 -0.984375 -0.359375q-0.890625 0 -1.4531403 0.640625q-0.546875 0.640625 -0.546875 2.0q0 1.40625 0.53125 2.03125q0.546875 0.625 1.4062653 0.625q0.6875 0 1.140625 -0.421875q0.46875 -0.421875 0.59375 -1.296875zm6.6640625 1.671875q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.96109 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.507965 -2.0625l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625z" fill-rule="nonzero"/><path fill="#d9d2e9" d="m36.454067 87.40656l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path stroke="#8e7cc3" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m36.454067 87.40656l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m98.35295 119.54864l1.59375 0.234375q0.109375 0.75 0.5625 1.078125q0.609375 0.453125 1.671875 0.453125q1.140625 0 1.75 -0.453125q0.625 -0.453125 0.84375 -1.265625q0.125 -0.5 0.109375 -2.109375q-1.0625 1.265625 -2.671875 1.265625q-2.0 0 -3.09375 -1.4375q-1.09375 -1.4375 -1.09375 -3.453125q0 -1.390625 0.5 -2.5625q0.515625 -1.171875 1.453125 -1.796875q0.953125 -0.640625 2.25 -0.640625q1.703125 0 2.8125 1.375l0 -1.15625l1.515625 0l0 8.359375q0 2.265625 -0.46875 3.203125q-0.453125 0.9375 -1.453125 1.484375q-0.984375 0.546875 -2.453125 0.546875q-1.71875 0 -2.796875 -0.78125q-1.0625 -0.765625 -1.03125 -2.34375zm1.359375 -5.8125q0 1.90625 0.75 2.78125q0.765625 0.875 1.90625 0.875q1.125 0 1.890625 -0.859375q0.765625 -0.875 0.765625 -2.734375q0 -1.78125 -0.796875 -2.671875q-0.78125 -0.90625 -1.890625 -0.90625q-1.09375 0 -1.859375 0.890625q-0.765625 0.875 -0.765625 2.625zm13.344467 5.015625l-5.171875 -13.359375l1.921875 0l3.46875 9.703125q0.421875 1.171875 0.703125 2.1875q0.3125 -1.09375 0.71875 -2.1875l3.609375 -9.703125l1.796875 0l-5.234375 13.359375l-1.8125 0zm8.427948 -11.46875l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm3.4885712 -2.890625l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm9.375 -1.953125q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm9.281967 4.84375l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m3.6351707 152.91733l48.850395 0" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m3.6351707 152.91733l48.850395 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m195.25722 152.91733l47.338577 0" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m195.25722 152.91733l47.338577 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m52.485565 136.88583l142.77165 0l0 32.06299l-142.77165 0z" fill-rule="evenodd"/><path fill="#000000" d="m65.21821 157.71732l0 -9.546875l1.265625 0l0 8.421875l4.703125 0l0 1.125l-5.96875 0zm7.3343506 -8.1875l0 -1.359375l1.171875 0l0 1.359375l-1.171875 0zm0 8.1875l0 -6.90625l1.171875 0l0 6.90625l-1.171875 0zm2.945465 0l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm11.118057 -8.1875l0 -1.359375l1.171875 0l0 1.359375l-1.171875 0zm0 8.1875l0 -6.90625l1.171875 0l0 6.90625l-1.171875 0zm5.507965 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.8748627 -1.171875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm11.006226 4.125l0 -0.875q-0.65625 1.03125 -1.9375 1.03125q-0.8125 0 -1.515625 -0.453125q-0.6875 -0.453125 -1.078125 -1.265625q-0.375 -0.828125 -0.375 -1.890625q0 -1.03125 0.34375 -1.875q0.34375 -0.84375 1.03125 -1.28125q0.703125 -0.453125 1.546875 -0.453125q0.625 0 1.109375 0.265625q0.5 0.25 0.796875 0.671875l0 -3.421875l1.171875 0l0 9.546875l-1.09375 0zm-3.703125 -3.453125q0 1.328125 0.5625 1.984375q0.5625 0.65625 1.328125 0.65625q0.765625 0 1.296875 -0.625q0.53125 -0.625 0.53125 -1.90625q0 -1.421875 -0.546875 -2.078125q-0.546875 -0.671875 -1.34375 -0.671875q-0.78125 0 -1.3125 0.640625q-0.515625 0.625 -0.515625 2.0zm9.865463 1.390625l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625zm7.0859375 4.71875l-0.125 -1.09375q0.375 0.109375 0.65625 0.109375q0.390625 0 0.625 -0.140625q0.234375 -0.125 0.390625 -0.359375q0.109375 -0.171875 0.359375 -0.875q0.03125 -0.09375 0.109375 -0.28125l-2.625 -6.921875l1.265625 0l1.4375 4.0q0.28125 0.765625 0.5 1.59375q0.203125 -0.796875 0.46875 -1.578125l1.484375 -4.015625l1.171875 0l-2.625 7.015625q-0.421875 1.140625 -0.65625 1.578125q-0.3125 0.578125 -0.71875 0.84375q-0.40625 0.28125 -0.96875 0.28125q-0.328125 0 -0.75 -0.15625zm6.2421875 -4.71875l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8124924 0 1.2031174 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1874924 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8124924 0 1.4218674 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0624924 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.2499924 0.328125 1.7343674 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.4531174 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625zm9.695305 1.015625l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.8748627 -1.171875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm19.321045 -2.53125l1.15625 0.15625q-0.1875 1.1875 -0.96875 1.859375q-0.78125 0.671875 -1.921875 0.671875q-1.40625 0 -2.28125 -0.921875q-0.859375 -0.9375 -0.859375 -2.65625q0 -1.125 0.375 -1.96875q0.375 -0.84375 1.125 -1.25q0.765625 -0.421875 1.65625 -0.421875q1.125 0 1.84375 0.578125q0.71875 0.5625 0.921875 1.609375l-1.140625 0.171875q-0.171875 -0.703125 -0.59375 -1.046875q-0.40625 -0.359375 -0.984375 -0.359375q-0.890625 0 -1.453125 0.640625q-0.546875 0.640625 -0.546875 2.0q0 1.40625 0.53125 2.03125q0.546875 0.625 1.40625 0.625q0.6875 0 1.140625 -0.421875q0.46875 -0.421875 0.59375 -1.296875zm6.6640625 1.671875q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.96109 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.507965 -2.0625l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m36.454067 167.00784l174.83464 0l0 48.850388l-174.83464 0z" fill-rule="evenodd"/><path stroke="#6d9eeb" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m36.454067 167.00784l174.83464 0l0 48.850388l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m76.63558 198.35303l0 -13.359375l1.765625 0l0 5.484375l6.9375 0l0 -5.484375l1.765625 0l0 13.359375l-1.765625 0l0 -6.296875l-6.9375 0l0 6.296875l-1.765625 0zm12.597946 -4.84375q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm8.641342 1.953125l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm13.5625 1.421875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm6.9134827 1.46875l0 -13.359375l1.78125 0l0 6.625l6.6249924 -6.625l2.390625 0l-5.5937424 5.421875l5.8437424 7.9375l-2.328125 0l-4.7656174 -6.765625l-2.171875 2.140625l0 4.625l-1.78125 0zm18.943565 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.228302 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm17.000732 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.7656403 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375153 0 3.1562653 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.2187653 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.5468903 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.3906403 -2.65625l5.4062653 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.0312653 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.094467 5.765625l0 -13.359375l1.640625 0l0 13.359375l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m4.454068 233.43303l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m4.454068 233.43303l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m172.45407 233.43303l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m172.45407 233.43303l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m73.43044 217.06717l100.88189 0l0 32.06299l-100.88189 0z" fill-rule="evenodd"/><path fill="#000000" d="m96.04542 237.89867l0 -9.546875l1.265625 0l0 3.921875l4.953125 0l0 -3.921875l1.265625 0l0 9.546875l-1.265625 0l0 -4.5l-4.953125 0l0 4.5l-1.265625 0zm13.953278 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm8.93837 0l0 -0.875q-0.65625 1.03125 -1.9375 1.03125q-0.8125 0 -1.515625 -0.453125q-0.6875 -0.453125 -1.078125 -1.265625q-0.375 -0.828125 -0.375 -1.890625q0 -1.03125 0.34375 -1.875q0.34375 -0.84375 1.03125 -1.28125q0.703125 -0.453125 1.546875 -0.453125q0.625 0 1.109375 0.265625q0.5 0.25 0.796875 0.671875l0 -3.421875l1.171875 0l0 9.546875l-1.09375 0zm-3.703125 -3.453125q0 1.328125 0.5625 1.984375q0.5625 0.65625 1.328125 0.65625q0.765625 0 1.296875 -0.625q0.53125 -0.625 0.53125 -1.90625q0 -1.421875 -0.546875 -2.078125q-0.546875 -0.671875 -1.34375 -0.671875q-0.78125 0 -1.3125 0.640625q-0.515625 0.625 -0.515625 2.0zm7.9124756 3.453125l-2.125 -6.90625l1.21875 0l1.09375 3.984375l0.421875 1.484375q0.015625 -0.109375 0.359375 -1.421875l1.0937424 -4.046875l1.203125 0l1.03125 4.0l0.34375 1.328125l0.40625 -1.34375l1.171875 -3.984375l1.140625 0l-2.15625 6.90625l-1.21875 0l-1.09375 -4.140625l-0.265625 -1.171875l-1.4062424 5.3125l-1.21875 0zm12.859535 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59376526 0.21875 -1.2812653 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.4218903 -0.171875 2.0937653 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.3437653 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.89064026 0 1.4375153 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.9218903 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.2031403 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm9.18837 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375z" fill-rule="nonzero"/><defs><linearGradient id="p.1" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.53514884533539 0.0 0.0 4.53514884533539 0.0 0.0)" spreadMethod="pad" x1="8.21347768339151" y1="37.02644733653771" x2="8.213461293294644" y2="41.56159618184348"><stop offset="0.0" stop-color="#ff0000"/><stop offset="0.51" stop-color="#dab7a6"/><stop offset="0.99999994" stop-color="#dab7a6" stop-opacity="0.0"/><stop offset="1.0" stop-color="#ffffff" stop-opacity="0.0"/></linearGradient></defs><path fill="url(#p.1)" d="m37.249344 167.92108l173.29134 0l0 20.566925l-173.29134 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m272.4455 182.06865l129.5748 -74.83464l20.629913 35.74803l-129.5748 74.83464z" fill-rule="evenodd"/><path fill="#000000" d="m287.51392 188.73558l1.1823425 -0.82717896q0.51071167 0.6974335 1.1166077 0.9970703q0.5980835 0.28611755 1.4464111 0.1931305q0.84054565 -0.10652161 1.6794434 -0.5910187q0.75772095 -0.4376068 1.2010193 -0.9823456q0.44906616 -0.5660858 0.50097656 -1.1013031q0.057678223 -0.55656433 -0.20785522 -1.0166931q-0.27334595 -0.47366333 -0.7392273 -0.6557007q-0.47366333 -0.1955719 -1.2366333 -0.079711914q-0.478302 0.07775879 -2.032318 0.54222107q-1.5618286 0.45092773 -2.2805786 0.48712158q-0.9222717 0.027420044 -1.5864563 -0.31072998q-0.6719971 -0.35168457 -1.0703125 -1.0418701q-0.4295349 -0.74432373 -0.38497925 -1.6361694q0.05029297 -0.9131775 0.6668701 -1.7203827q0.63012695 -0.81500244 1.6313782 -1.3932648q1.1095276 -0.64079285 2.1592712 -0.7598877q1.0419617 -0.1326294 1.8867493 0.29968262q0.8583679 0.4244995 1.3987732 1.2671814l-1.2036743 0.82147217q-0.64712524 -0.87127686 -1.5022583 -1.0089111q-0.8629761 -0.15116882 -2.013092 0.51304626q-1.1906738 0.68766785 -1.4819641 1.4333191q-0.2913208 0.745636 0.06793213 1.3681641q0.30459595 0.52778625 0.8865051 0.6608429q0.5740967 0.119522095 2.3815613 -0.43717957q1.8210144 -0.5645294 2.5725403 -0.6376953q1.0924377 -0.107666016 1.857605 0.28042603q0.77090454 0.366745 1.2316895 1.1652069q0.4529724 0.78492737 0.39904785 1.7543335q-0.040405273 0.9616089 -0.6663208 1.8463745q-0.62594604 0.8847656 -1.6813354 1.4942932q-1.3530579 0.78144836 -2.486084 0.9125519q-1.1408386 0.11756897 -2.1214905 -0.3625946q-0.96713257 -0.48797607 -1.5721436 -1.4738007zm13.40155 -4.9431458l0.8006897 0.98106384q-0.45169067 0.4052124 -0.857605 0.6396332q-0.6629944 0.3829193 -1.1454773 0.39089966q-0.4902649 -0.0055389404 -0.8343506 -0.25790405q-0.3518982 -0.26589966 -0.9844971 -1.3620911l-2.382019 -4.1276093l-0.8930054 0.5157471l-0.5466919 -0.9473114l0.8930054 -0.5157623l-1.0308838 -1.786377l0.79599 -1.434082l1.4526367 2.5171661l1.2177734 -0.7033081l0.5466919 0.94732666l-1.2177734 0.70329285l2.4210815 4.195282q0.30456543 0.5278015 0.4446106 0.645401q0.15356445 0.109802246 0.35705566 0.11857605q0.19570923 -0.004760742 0.4663086 -0.16105652q0.20297241 -0.11721802 0.49645996 -0.35888672zm1.8165283 0.41241455l-4.147064 -7.1861115l1.0959778 -0.6329651l0.6247864 1.0826569q-0.0178833 -1.0001068 0.19332886 -1.4468842q0.21121216 -0.44676208 0.64419556 -0.6968231q0.6088562 -0.35165405 1.471283 -0.3264618l0.22875977 1.3654938q-0.59487915 7.4768066E-4 -1.0413818 0.25862122q-0.39239502 0.22662354 -0.5765381 0.6577606q-0.17843628 0.40979004 -0.06384277 0.92100525q0.17193604 0.7667999 0.61709595 1.5381927l2.1711426 3.7622223l-1.2177429 0.70329285zm2.0899658 -6.006668q-1.1480408 -1.9893646 -0.5930481 -3.5910034q0.47280884 -1.3376465 1.7988281 -2.1034698q1.4883423 -0.8595886 2.984253 -0.4243927q1.501648 0.41384888 2.4934998 2.132553q0.79660034 1.3803864 0.83795166 2.4210815q0.04135132 1.0407104 -0.49923706 1.948349q-0.5348511 0.88630676 -1.4819946 1.4333191q-1.5018921 0.8674011 -2.9899902 0.44573975q-1.4959106 -0.43519592 -2.5502625 -2.2621765zm1.2583313 -0.72673035q0.79663086 1.3803711 1.7902527 1.7267303q1.0072021 0.33854675 1.9137268 -0.18502808q0.9065552 -0.5235596 1.1036072 -1.5575867q0.21057129 -1.0418396 -0.60946655 -2.4628143q-0.76538086 -1.3262482 -1.7725525 -1.6647949q-0.99365234 -0.34635925 -1.9002075 0.17720032q-0.90652466 0.5235748 -1.117096 1.5654144q-0.2048645 1.0204926 0.59173584 2.400879zm8.984772 -0.38945007l-4.1470337 -7.1861115l1.0959778 -0.6329651l0.5857239 1.0149841q0.10531616 -1.6306152 1.6072083 -2.4980164q0.6494751 -0.37509155 1.3234558 -0.45761108q0.6739807 -0.08250427 1.163269 0.14013672q0.48928833 0.22264099 0.9021301 0.687912q0.26287842 0.29925537 0.74710083 1.1383057l2.553833 4.425354l-1.2177429 0.70329285l-2.522583 -4.371216q-0.42956543 -0.74432373 -0.7892456 -1.0237579q-0.3540039 -0.30078125 -0.8442688 -0.3063202q-0.47677612 -0.01335144 -0.9638672 0.2679596q-0.7847595 0.45324707 -1.0640869 1.2821655q-0.2735901 0.80758667 0.52301025 2.187973l2.264862 3.9246216l-1.2177429 0.70329285zm7.819275 -3.7220154l1.2922058 -0.511734q0.3878479 0.51579285 0.8666992 0.5640259q0.65527344 0.072631836 1.4400635 -0.38059998q0.8388977 -0.48449707 1.1036682 -1.0885162q0.26480103 -0.60401917 0.07571411 -1.306778q-0.1161499 -0.42010498 -0.81121826 -1.6245575q-0.25161743 1.408371 -1.4423218 2.096054q-1.4883423 0.85957336 -2.9171448 0.25932312q-1.428833 -0.60025024 -2.2957153 -2.1024323q-0.5935669 -1.0285187 -0.72805786 -2.1056366q-0.12097168 -1.0849152 0.30926514 -1.9649353q0.4437561 -0.8878174 1.3908997 -1.4348297q1.2718811 -0.7345581 2.690796 -0.182724l-0.4998474 -0.8661194l1.1230469 -0.6485901l3.5847168 6.2117157q0.9684448 1.6781158 1.0362854 2.5771942q0.067840576 0.8990936 -0.4420166 1.7348785q-0.5098877 0.8357849 -1.5923462 1.4609375q-1.2854004 0.7423706 -2.4195251 0.62150574q-1.1206055 -0.12869263 -1.7651672 -1.3081818zm-1.4765625 -4.9031525q0.81222534 1.4074402 1.7418518 1.7366486q0.94314575 0.32138062 1.7820435 -0.16311646q0.8388977 -0.4844818 1.0401001 -1.4487457q0.19342041 -0.97779846 -0.60317993 -2.3581848q-0.76538086 -1.3262482 -1.7298889 -1.6533508q-0.97229004 -0.3406372 -1.7976685 0.1360321q-0.8118286 0.46887207 -0.9974365 1.4602051q-0.1855774 0.991333 0.56417847 2.290512z" fill-rule="nonzero"/><path fill="#000000" d="m294.23132 199.68793l-0.80441284 -1.3939209l1.2177429 -0.70329285l0.80441284 1.3939056l-1.2177429 0.7033081zm4.920227 8.525894l-4.147064 -7.1861115l1.2177734 -0.7033081l4.1470337 7.1861115l-1.2177429 0.7033081zm1.3493347 -3.6482391l1.0948792 -0.88494873q0.51641846 0.6760864 1.2029724 0.80285645q0.6922302 0.10542297 1.5176086 -0.3712616q0.8388672 -0.4844818 1.0495605 -1.057251q0.20285034 -0.58628845 -0.062683105 -1.0464172q-0.23431396 -0.4059906 -0.7266846 -0.4464264q-0.35079956 -0.013916016 -1.4790955 0.3129425q-1.5347595 0.43530273 -2.2030334 0.49645996q-0.66256714 0.03982544 -1.183075 -0.23695374q-0.5069885 -0.28459167 -0.8115845 -0.8123779q-0.28115845 -0.48719788 -0.2989807 -1.018219q-0.017791748 -0.5310211 0.2048645 -1.0204926q0.15917969 -0.3806305 0.56817627 -0.797287q0.4147339 -0.43800354 0.9694824 -0.75839233q0.852417 -0.49230957 1.6289368 -0.61598206q0.7765198 -0.123687744 1.3162842 0.123931885q0.5455017 0.22625732 1.0733948 0.85964966l-1.0969849 0.85006714q-0.4013672 -0.5079651 -0.9733887 -0.59262085q-0.5720215 -0.0846405 -1.2756042 0.3217163q-0.8388977 0.4844818 -1.0402222 0.9796753q-0.19558716 0.47383118 0.015289307 0.8392334q0.14056396 0.24359131 0.39874268 0.34710693q0.2581787 0.103500366 0.64749146 0.05909729q0.22845459 -0.041732788 1.2620544 -0.31388855q1.4806519 -0.40403748 2.127594 -0.470932q0.6390991 -0.08041382 1.1653442 0.17501831q0.5397949 0.24760437 0.89904785 0.87013245q0.35144043 0.60899353 0.29852295 1.3613129q-0.047210693 0.73095703 -0.5519409 1.4194183q-0.49118042 0.68063354 -1.3300476 1.1651306q-1.407196 0.81269836 -2.4736633 0.6527405q-1.0664673 -0.15994263 -1.933258 -1.193039zm6.1190186 -5.4646606q-1.1480408 -1.9893799 -0.5930481 -3.5910187q0.47283936 -1.3376465 1.7988281 -2.1034698q1.4883423 -0.85957336 2.984253 -0.4243927q1.501648 0.41384888 2.4934998 2.132553q0.79663086 1.3803864 0.83795166 2.4210968q0.04135132 1.0406952 -0.49923706 1.948349q-0.5348511 0.8862915 -1.4819946 1.4333038q-1.5018921 0.8674011 -2.9899902 0.44573975q-1.4959106 -0.43519592 -2.5502625 -2.2621613zm1.2583313 -0.7267456q0.79663086 1.3803864 1.7902832 1.7267303q1.0071716 0.33854675 1.9136963 -0.18501282q0.9065552 -0.5235748 1.1036072 -1.5576019q0.21057129 -1.0418396 -0.60946655 -2.4628143q-0.76538086 -1.3262482 -1.7725525 -1.6647949q-0.99365234 -0.34635925 -1.9002075 0.17721558q-0.90652466 0.5235596 -1.117096 1.5653992q-0.2048645 1.0204926 0.59173584 2.400879zm8.957733 -0.3738098l-5.7246704 -9.919815l1.2177734 -0.70329285l5.72464 9.9198l-1.2177429 0.7033081zm7.2713623 -5.390396q-0.34069824 0.972641 -0.8225403 1.5756989q-0.48962402 0.5895386 -1.2067566 1.0036926q-1.1906738 0.6876831 -2.162445 0.47302246q-0.9717407 -0.21464539 -1.4872131 -1.1078339q-0.30456543 -0.52778625 -0.3109436 -1.1015167q-0.014190674 -0.58724976 0.21627808 -1.0631866q0.23620605 -0.49728394 0.64520264 -0.9139252q0.31063843 -0.3057251 0.980896 -0.8010864q1.3733215 -1.02771 1.9363098 -1.6776581q-0.14837646 -0.25712585 -0.18740845 -0.32478333q-0.42956543 -0.74432373 -0.93963623 -0.84669495q-0.7156677 -0.14602661 -1.6357422 0.38536072q-0.8524475 0.49230957 -1.0922546 1.0458069q-0.23410034 0.5321655 0.013824463 1.3994293l-1.2843933 0.52526855q-0.2749939 -0.85162354 -0.18301392 -1.5362854q0.10549927 -0.6924591 0.66851807 -1.3424072q0.5552063 -0.66348267 1.4752808 -1.1948547q0.92007446 -0.5313873 1.6133118 -0.6430664q0.7067566 -0.11949158 1.1648254 0.04902649q0.45803833 0.16850281 0.8552551 0.60672q0.24728394 0.27218628 0.71588135 1.0841675l0.9371643 1.6239929q0.9840393 1.7051697 1.3094177 2.1126862q0.33892822 0.39971924 0.81103516 0.68640137l-1.2718506 0.7345581q-0.40811157 -0.26953125 -0.7590027 -0.75253296zm-1.6645203 -2.6654663q-0.5067749 0.65356445 -1.7234497 1.6088562q-0.69522095 0.5458679 -0.9283142 0.8609314q-0.23312378 0.31506348 -0.25280762 0.6873169q-0.013977051 0.3508911 0.16564941 0.66215515q0.28115845 0.48719788 0.83392334 0.6010132q0.5662842 0.10598755 1.269867 -0.30036926q0.70358276 -0.40634155 1.072998 -1.0166473q0.37512207 -0.63165283 0.3197937 -1.3214569q-0.031341553 -0.5232086 -0.49990845 -1.3352051l-0.25775146 -0.44659424zm7.223419 -0.8156891l0.8006897 0.98106384q-0.45169067 0.4052124 -0.857605 0.63964844q-0.6629944 0.38290405 -1.1454468 0.39089966q-0.4902954 -0.0055389404 -0.8343811 -0.25790405q-0.3518982 -0.26591492 -0.9844971 -1.3620911l-2.382019 -4.1276093l-0.8930054 0.5157471l-0.5466919 -0.94732666l0.8930054 -0.5157471l-1.0308838 -1.786377l0.7960205 -1.434082l1.4526367 2.5171661l1.2177429 -0.7033081l0.5466919 0.94732666l-1.2177429 0.70329285l2.421051 4.195282q0.30459595 0.52778625 0.4446106 0.645401q0.15356445 0.10978699 0.35708618 0.11856079q0.19567871 -0.004760742 0.46627808 -0.16104126q0.20297241 -0.11721802 0.49645996 -0.35890198zm-3.0901794 -8.121277l-0.80441284 -1.3939209l1.2177429 -0.70329285l0.80444336 1.3939209l-1.2177734 0.70329285zm4.920227 8.525894l-4.1470337 -7.1861115l1.2177429 -0.70329285l4.147064 7.186096l-1.2177734 0.7033081zm0.54074097 -5.111908q-1.1480408 -1.9893799 -0.5930481 -3.5910187q0.47280884 -1.3376465 1.7988281 -2.1034698q1.4883423 -0.8595886 2.984253 -0.4243927q1.501648 0.41384888 2.4934998 2.132553q0.79660034 1.3803864 0.83795166 2.4210968q0.0413208 1.0406952 -0.49923706 1.948349q-0.5348511 0.8862915 -1.4819946 1.4333038q-1.5018921 0.8674011 -2.9899902 0.44573975q-1.4959412 -0.43519592 -2.5502625 -2.2621613zm1.2583313 -0.7267456q0.79663086 1.3803864 1.7902527 1.7267303q1.0072021 0.33854675 1.9137268 -0.18501282q0.9065552 -0.5235748 1.1036072 -1.5576019q0.21057129 -1.0418396 -0.60946655 -2.4628143q-0.76538086 -1.3262482 -1.7725525 -1.6647949q-0.99365234 -0.34635925 -1.9002075 0.17721558q-0.9065552 0.5235596 -1.117096 1.5653992q-0.2048645 1.0204926 0.59173584 2.400879zm8.984772 -0.38945007l-4.1470337 -7.1861115l1.0959473 -0.6329651l0.5857544 1.0149841q0.10531616 -1.6306152 1.6072083 -2.4980164q0.6494446 -0.37509155 1.3234558 -0.45761108q0.6739807 -0.08250427 1.163269 0.14013672q0.48928833 0.22264099 0.9021301 0.687912q0.26287842 0.29925537 0.74710083 1.1383057l2.553833 4.425354l-1.2177429 0.70329285l-2.5226135 -4.371216q-0.4295349 -0.74432373 -0.7892456 -1.0237579q-0.3539734 -0.30078125 -0.8442383 -0.3063202q-0.47677612 -0.01335144 -0.9638672 0.2679596q-0.7847595 0.45324707 -1.0640869 1.2821808q-0.2735901 0.8075714 0.52301025 2.1879578l2.264862 3.9246216l-1.2177429 0.70329285z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m282.76047 199.94267c-17.003845 0 -26.795105 -5.566925 -34.007706 -11.133865c-7.2126007 -5.566925 -11.846542 -11.13385 -23.6931 -11.13385" fill-rule="evenodd"/><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.76047 199.94267c-17.003876 0 -26.795105 -5.5669403 -34.007706 -11.133865c-3.6062927 -2.7834625 -6.567932 -5.566925 -10.10881 -7.6545258c-0.4426117 -0.2609558 -0.8942871 -0.5110321 -1.3573761 -0.74887085c-0.11578369 -0.0594635 -0.23228455 -0.1181488 -0.34950256 -0.17605591l-0.13806152 -0.066833496" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="2.0" stroke-linecap="butt" d="m237.48381 176.93082l-9.563843 1.350235l8.194244 5.1131744z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m272.4455 118.06866l129.5748 -74.83465l20.629913 35.74803l-129.5748 74.83464z" fill-rule="evenodd"/><path fill="#000000" d="m290.03357 127.53869l-5.72464 -9.9198l1.3124695 -0.75800323l5.72464 9.919807l-1.3124695 0.7579956zm3.470581 -2.0044022l-4.1470337 -7.186104l1.0959778 -0.6329651l0.5857239 1.0149841q0.10531616 -1.6306229 1.6072083 -2.498024q0.6494751 -0.37509155 1.3234558 -0.45760345q0.6739807 -0.0825119 1.163269 0.14012146q0.48928833 0.22264099 0.9021301 0.6879196q0.26290894 0.29925537 0.74710083 1.1383133l2.553833 4.4253464l-1.2177429 0.70329285l-2.522583 -4.371208q-0.4295349 -0.74432373 -0.7892456 -1.0237579q-0.3540039 -0.30078888 -0.8442688 -0.30632782q-0.4767456 -0.01335144 -0.9638672 0.26796722q-0.7847595 0.4532318 -1.0640869 1.2821655q-0.2735901 0.80758667 0.52301025 2.187973l2.2648926 3.924614l-1.2177734 0.70329285zm12.360168 -7.1384735l-0.5232849 -0.906723q-0.059539795 1.4598389 -1.3855286 2.2256546q-0.8659668 0.5001221 -1.8564148 0.44062805q-0.9904785 -0.05949402 -1.8883972 -0.67765045q-0.88442993 -0.62597656 -1.5170288 -1.7221603q-0.6247864 -1.0826492 -0.77282715 -2.151947q-0.14230347 -1.0906448 0.30926514 -1.9649353q0.44378662 -0.887825 1.336792 -1.4035797q0.6494751 -0.37509155 1.3063049 -0.39356232q0.6703491 -0.026283264 1.2392883 0.2405777l-2.054016 -3.5592194l1.2177429 -0.7033005l5.7246704 9.919807l-1.1365662 0.6564102zm-5.9122925 -1.3669891q0.79660034 1.3803787 1.7767029 1.7345505q0.97232056 0.3406372 1.7570801 -0.112602234q0.7983093 -0.4610443 0.97817993 -1.4310303q0.18560791 -0.991333 -0.58758545 -2.3311157q-0.8512573 -1.4751129 -1.8178406 -1.8370972q-0.96658325 -0.36198425 -1.805481 0.12251282q-0.8118286 0.46886444 -0.97036743 1.4445648q-0.15853882 0.9757004 0.6693115 2.4102173zm12.53952 -5.5459747l1.3442383 -0.57787323q0.34274292 1.2816315 -0.11764526 2.3594894q-0.4468689 1.0700455 -1.6916809 1.788971q-1.5830688 0.9142914 -3.0654602 0.47128296q-1.4823914 -0.44300842 -2.4898376 -2.1887817q-1.0465393 -1.813446 -0.69085693 -3.3540955q0.35568237 -1.5406494 1.8440247 -2.4002304q1.4342346 -0.828331 2.9109192 -0.36397552q1.4823608 0.4430008 2.5054626 2.2158508q0.0625 0.10826111 0.18743896 0.32479095l-5.3580933 3.094513q0.75494385 1.1518478 1.7173462 1.4440689q0.96810913 0.27088165 1.861145 -0.24487305q0.6765137 -0.39072418 0.9470215 -1.0160904q0.2705078 -0.6253662 0.09597778 -1.5530472zm-5.131775 0.32941437l4.005066 -2.3130646q-0.60446167 -0.8598404 -1.2410278 -1.0876312q-0.98794556 -0.36769867 -1.90802 0.16368103q-0.8388977 0.48449707 -1.0926819 1.3889084q-0.2480774 0.88306427 0.23666382 1.8481064zm10.854523 3.1318436l-5.740265 -9.946869l1.1094971 -0.6407852l0.5388794 0.9337845q0.07220459 -0.78147125 0.4031067 -1.333458q0.3444214 -0.5597992 1.0480042 -0.9661484q0.92007446 -0.5313797 1.8970032 -0.46406555q0.9769592 0.06730652 1.8285828 0.7302551q0.8573303 0.64160156 1.4508972 1.6701202q0.6404114 1.1097183 0.74212646 2.2238083q0.11526489 1.1062698 -0.3691101 2.01754q-0.4708252 0.90345 -1.3097229 1.3879471q-0.6088867 0.35164642 -1.2443542 0.37583923q-0.62197876 0.01637268 -1.159668 -0.19635773l2.022766 3.5050888l-1.2177429 0.7033005zm-2.551239 -6.952957q0.80441284 1.3939209 1.7418518 1.7366486q0.95095825 0.3349228 1.7492676 -0.12612915q0.8118286 -0.46886444 0.9953308 -1.495079q0.18353271 -1.026207 -0.6443176 -2.4607239q-0.79663086 -1.3803787 -1.7554016 -1.728836q-0.96658325 -0.36198425 -1.7513428 0.09125519q-0.77124023 0.4454193 -0.958374 1.5278625q-0.1736145 1.0746231 0.62298584 2.4550018zm12.239746 -5.408928l1.3442688 -0.57788086q0.34274292 1.2816391 -0.11764526 2.359497q-0.4468689 1.0700455 -1.6916809 1.788971q-1.5830688 0.91428375 -3.0654602 0.47128296q-1.4823914 -0.44300842 -2.4898682 -2.1887894q-1.0465088 -1.8134384 -0.6908264 -3.3540878q0.35565186 -1.5406494 1.8440247 -2.400238q1.4342346 -0.828331 2.9108887 -0.36397552q1.4823914 0.44300842 2.5054932 2.2158508q0.062469482 0.10826874 0.18743896 0.32479858l-5.3580933 3.094513q0.75491333 1.1518402 1.7173462 1.4440689q0.96810913 0.27087402 1.861145 -0.24488068q0.6765137 -0.39071655 0.9470215 -1.0160828q0.2705078 -0.62537384 0.095947266 -1.5530472zm-5.1317444 0.32941437l4.0050354 -2.3130722q-0.60443115 -0.85983276 -1.2410278 -1.0876236q-0.98791504 -0.3677063 -1.9079895 0.1636734q-0.8388977 0.48449707 -1.0926819 1.3889084q-0.2480774 0.88306427 0.23666382 1.848114zm9.2612915 0.3710785l-4.1470337 -7.1861115l1.0959778 -0.6329651l0.5857239 1.0149841q0.10531616 -1.6306229 1.6072083 -2.498024q0.6494751 -0.37509155 1.3234558 -0.45760345q0.6739807 -0.0825119 1.163269 0.14012909q0.48928833 0.22263336 0.9021301 0.687912q0.26290894 0.29925537 0.74710083 1.1383133l2.553833 4.4253464l-1.2177429 0.70329285l-2.522583 -4.371208q-0.42956543 -0.74432373 -0.7892456 -1.0237579q-0.3540039 -0.30078888 -0.8442688 -0.30632782q-0.4767456 -0.01335144 -0.9638672 0.26796722q-0.7847595 0.4532318 -1.0640869 1.2821655q-0.2735901 0.80758667 0.52301025 2.187973l2.2648926 3.924614l-1.2177734 0.7033005zm12.360168 -7.138481l-0.5232849 -0.906723q-0.059539795 1.4598389 -1.3855286 2.2256546q-0.8659668 0.5001221 -1.8564148 0.44062805q-0.9904785 -0.05949402 -1.8883972 -0.67765045q-0.88442993 -0.62596893 -1.5170288 -1.7221603q-0.6247864 -1.0826492 -0.77282715 -2.151947q-0.14230347 -1.0906448 0.30926514 -1.9649353q0.44378662 -0.887825 1.336792 -1.4035797q0.6494751 -0.37509155 1.3063049 -0.39356232q0.6703491 -0.026283264 1.2392883 0.2405777l-2.054016 -3.5592194l1.2177429 -0.7033005l5.7246704 9.919807l-1.1365662 0.6564102zm-5.9122925 -1.3669891q0.79660034 1.3803787 1.7767029 1.7345505q0.97232056 0.3406372 1.7570801 -0.112602234q0.7983093 -0.4610443 0.97817993 -1.4310303q0.18560791 -0.991333 -0.58758545 -2.3311157q-0.8512573 -1.4751129 -1.8178406 -1.8370972q-0.96658325 -0.36198425 -1.805481 0.12251282q-0.8118286 0.46886444 -0.97036743 1.4445648q-0.15853882 0.9757004 0.6693115 2.4102173zm12.53952 -5.5459747l1.3442383 -0.57787323q0.34274292 1.2816391 -0.11764526 2.3594894q-0.4468689 1.0700455 -1.6916809 1.788971q-1.5830688 0.9142914 -3.0654602 0.47128296q-1.4823914 -0.4430008 -2.4898682 -2.1887817q-1.0465088 -1.813446 -0.6908264 -3.3540955q0.35568237 -1.5406494 1.8440247 -2.4002304q1.4342346 -0.828331 2.9109192 -0.36397552q1.4823608 0.4430008 2.5054626 2.2158508q0.0625 0.10826111 0.18743896 0.32479095l-5.3580933 3.094513q0.75494385 1.1518478 1.7173462 1.4440689q0.96810913 0.27088165 1.861145 -0.24487305q0.6765137 -0.39072418 0.9470215 -1.0160904q0.2705078 -0.6253662 0.09597778 -1.5530472zm-5.131775 0.32941437l4.005066 -2.3130646q-0.60446167 -0.8598404 -1.2410278 -1.0876312q-0.98794556 -0.36769867 -1.90802 0.16368103q-0.8388977 0.48449707 -1.0926819 1.3889084q-0.2480774 0.88306427 0.23666382 1.8481064zm9.261322 0.3710785l-4.147064 -7.186104l1.0959778 -0.6329727l0.5857544 1.0149918q0.105285645 -1.6306229 1.6071777 -2.498024q0.6494751 -0.37509155 1.3234558 -0.45761108q0.67401123 -0.0825119 1.163269 0.14012909q0.48928833 0.22264099 0.9021301 0.6879196q0.26290894 0.29925537 0.74710083 1.1383133l2.553833 4.4253387l-1.2177429 0.7033005l-2.522583 -4.371208q-0.4295349 -0.74432373 -0.7892456 -1.0237656q-0.3539734 -0.30078125 -0.8442688 -0.3063202q-0.4767456 -0.01335907 -0.96383667 0.2679596q-0.78479004 0.45323944 -1.0640869 1.2821732q-0.2736206 0.80758667 0.52301025 2.1879654l2.264862 3.924614l-1.2177429 0.7033005zm9.725006 -7.078125l0.8006897 0.98106384q-0.45169067 0.4052124 -0.857605 0.6396408q-0.6629944 0.38291168 -1.1454468 0.39089966q-0.4902954 -0.0055389404 -0.8343811 -0.25791168q-0.3518982 -0.26589966 -0.9844971 -1.3620834l-2.382019 -4.127617l-0.8930054 0.5157547l-0.5466919 -0.94731903l0.8930054 -0.5157547l-1.0308838 -1.786377l0.79599 -1.4340897l1.4526672 2.5171661l1.2177429 -0.7033005l0.5466919 0.94732666l-1.2177429 0.70329285l2.421051 4.195282q0.30459595 0.5277939 0.4446106 0.645401q0.15356445 0.10979462 0.35708618 0.11856842q0.19567871 -0.004760742 0.46627808 -0.16104889q0.20297241 -0.11721802 0.49645996 -0.35889435z" fill-rule="nonzero"/><path fill="#000000" d="m299.15155 144.21382l-5.72464 -9.919815l1.2177429 -0.70329285l3.2645264 5.6568604l1.1950684 -4.587631l1.5830688 -0.9142914l-1.2081604 4.252365l5.625824 2.7774506l-1.5018921 0.8674011l-4.4921265 -2.3134918l-0.38955688 1.3256378l1.6478882 2.8554993l-1.2177429 0.7033081zm10.503723 -9.151794l1.3442383 -0.57788086q0.34274292 1.2816315 -0.11764526 2.359497q-0.44683838 1.0700378 -1.6916504 1.788971q-1.5830688 0.9142761 -3.0654602 0.47128296q-1.4823914 -0.44300842 -2.4898682 -2.188797q-1.0465393 -1.8134308 -0.69085693 -3.3540802q0.35568237 -1.5406494 1.8440247 -2.400238q1.4342346 -0.8283386 2.9109192 -0.36398315q1.4823914 0.44300842 2.5054932 2.2158508q0.062469482 0.10827637 0.18743896 0.32479858l-5.3580933 3.094513q0.75491333 1.1518402 1.7173157 1.4440765q0.96813965 0.27087402 1.861145 -0.2448883q0.6765137 -0.39071655 0.947052 -1.0160828q0.2705078 -0.6253662 0.095947266 -1.5530396zm-5.1317444 0.32940674l4.0050354 -2.3130646q-0.60446167 -0.85983276 -1.2410278 -1.0876312q-0.98791504 -0.3677063 -1.90802 0.16368103q-0.8388672 0.48449707 -1.0926819 1.3889008q-0.2480774 0.8830719 0.23669434 1.848114zm9.247772 0.378891l-4.147064 -7.1861115l1.0959778 -0.6329651l0.6247864 1.0826569q-0.017913818 -1.0001068 0.19329834 -1.4468765q0.21124268 -0.4467697 0.64419556 -0.69683075q0.6088867 -0.35165405 1.4713135 -0.32646942l0.22875977 1.3655014q-0.59487915 7.4768066E-4 -1.0413818 0.25862122q-0.39239502 0.22662354 -0.5765686 0.6577606q-0.17840576 0.40979004 -0.063812256 0.92100525q0.17190552 0.7667999 0.61709595 1.5381927l2.1711426 3.7622223l-1.2177429 0.70329285zm4.6274414 -2.6725311l-4.147064 -7.186104l1.0959778 -0.6329727l0.5857544 1.0149841q0.105285645 -1.6306152 1.6071777 -2.4980164q0.6494751 -0.37509918 1.3234558 -0.45761108q0.67401123 -0.0825119 1.1632996 0.14012909q0.4892578 0.22264099 0.9020996 0.6879196q0.26290894 0.29925537 0.74710083 1.1383133l2.553833 4.4253464l-1.2177429 0.70329285l-2.522583 -4.371208q-0.4295349 -0.74432373 -0.7892456 -1.0237656q-0.3539734 -0.30078125 -0.8442688 -0.3063202q-0.4767456 -0.01335907 -0.96383667 0.2679596q-0.78479004 0.45323944 -1.0640869 1.2821732q-0.2736206 0.80758667 0.52301025 2.1879654l2.264862 3.9246216l-1.2177429 0.70329285zm11.281708 -9.60112l1.3442688 -0.57788086q0.34274292 1.2816391 -0.11764526 2.359497q-0.4468689 1.0700378 -1.6916809 1.788971q-1.5830688 0.91428375 -3.0654602 0.47127533q-1.4823914 -0.4430008 -2.4898682 -2.1887817q-1.0465088 -1.8134384 -0.6908264 -3.3540878q0.35565186 -1.5406494 1.8440247 -2.400238q1.4342346 -0.828331 2.9108887 -0.36397552q1.4823914 0.44300842 2.5054932 2.2158508q0.062469482 0.10826111 0.18743896 0.32479095l-5.3580933 3.094513q0.75491333 1.1518478 1.7173462 1.4440689q0.96810913 0.27088165 1.861145 -0.24487305q0.6765137 -0.39072418 0.9470215 -1.0160904q0.2705078 -0.6253662 0.095947266 -1.5530396zm-5.1317444 0.32941437l4.0050354 -2.3130722q-0.60443115 -0.85983276 -1.2410278 -1.0876236q-0.98791504 -0.3677063 -1.9079895 0.1636734q-0.8388977 0.48449707 -1.0926819 1.3889084q-0.2480774 0.88306427 0.23666382 1.848114zm9.234253 0.3867035l-5.7246704 -9.919807l1.2177734 -0.7033005l5.72464 9.919807l-1.2177429 0.7033005z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m282.76047 135.94267c-17.003845 0 -26.795105 -5.566925 -34.007706 -11.133858c-7.2126007 -5.5669327 -11.846542 -11.133858 -23.6931 -11.133858" fill-rule="evenodd"/><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.76047 135.94267c-17.003876 0 -26.795105 -5.5669403 -34.007706 -11.133865c-3.6062927 -2.7834625 -6.567932 -5.566925 -10.10881 -7.6545258c-0.4426117 -0.26094818 -0.8942871 -0.5110321 -1.3573761 -0.74887085c-0.11578369 -0.0594635 -0.23228455 -0.11816406 -0.34950256 -0.17607117l-0.13806152 -0.06682587" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="2.0" stroke-linecap="butt" d="m237.48381 112.93081l-9.563843 1.350235l8.194244 5.113182z" fill-rule="evenodd"/></g></svg>
+\ No newline at end of file
diff --git a/g3doc/Machine-Virtualization.png b/g3doc/Machine-Virtualization.png
new file mode 100644
index 000000000..1ba2ed6b2
--- /dev/null
+++ b/g3doc/Machine-Virtualization.png
diff --git a/g3doc/Machine-Virtualization.svg b/g3doc/Machine-Virtualization.svg
new file mode 100644
index 000000000..5352da07b
--- /dev/null
+++ b/g3doc/Machine-Virtualization.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 387.7034120734908 336.4225721784777" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l387.7034 0l0 336.42258l-387.7034 0l0 -336.42258z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l387.7034 0l0 336.42258l-387.7034 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m44.454067 14.643044l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path stroke="#cc4125" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m44.454067 14.643044l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m86.206116 45.98824l5.125 -13.359375l1.90625 0l5.46875 13.359375l-2.015625 0l-1.546875 -4.046875l-5.59375 0l-1.46875 4.046875l-1.875 0zm3.859375 -5.484375l4.53125 0l-1.40625 -3.703125q-0.625 -1.6875 -0.9375 -2.765625q-0.265625 1.28125 -0.71875 2.546875l-1.46875 3.921875zm9.849823 9.1875l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm8.891342 8.484375l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm8.844467 4.78125l0 -13.359375l1.640625 0l0 13.359375l-1.640625 0zm4.191696 -11.46875l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm10.457321 -3.546875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.578125 0.515625 -2.75q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.8125q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm9.328125 2.359375q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm7.781967 3.390625l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 -10.0l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm3.5354462 -4.84375q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm9.297592 4.84375l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m12.454068 81.068245l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m12.454068 81.068245l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m180.45407 81.068245l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m180.45407 81.068245l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m81.43044 64.70238l100.88189 0l0 32.06299l-100.88189 0z" fill-rule="evenodd"/><path fill="#000000" d="m95.06437 82.471375l1.203125 -0.109375q0.078125 0.71875 0.390625 1.1875q0.3125 0.453125 0.953125 0.734375q0.65625 0.28125 1.46875 0.28125q0.71875 0 1.265625 -0.21875q0.5625 -0.21875 0.828125 -0.578125q0.265625 -0.375 0.265625 -0.828125q0 -0.453125 -0.265625 -0.78125q-0.25 -0.328125 -0.84375 -0.5625q-0.390625 -0.15625 -1.703125 -0.46875q-1.3125 -0.3125 -1.84375 -0.59375q-0.671875 -0.359375 -1.015625 -0.890625q-0.328125 -0.53125 -0.328125 -1.1875q0 -0.71875 0.40625 -1.34375q0.40625 -0.625 1.1875 -0.953125q0.796875 -0.328125 1.765625 -0.328125q1.046875 0 1.859375 0.34375q0.8125 0.34375 1.25 1.015625q0.4375 0.65625 0.46875 1.484375l-1.203125 0.09375q-0.109375 -0.90625 -0.671875 -1.359375q-0.5625 -0.46875 -1.65625 -0.46875q-1.140625 0 -1.671875 0.421875q-0.515625 0.421875 -0.515625 1.015625q0 0.515625 0.359375 0.84375q0.375 0.328125 1.90625 0.6875q1.546875 0.34375 2.109375 0.59375q0.84375 0.390625 1.234375 0.984375q0.390625 0.578125 0.390625 1.359375q0 0.75 -0.4375 1.4375q-0.421875 0.671875 -1.25 1.046875q-0.8125 0.359375 -1.828125 0.359375q-1.296875 0 -2.171875 -0.375q-0.875 -0.375 -1.375 -1.125q-0.5 -0.765625 -0.53125 -1.71875zm9.12413 5.71875l-0.125 -1.09375q0.375 0.109375 0.65625 0.109375q0.390625 0 0.625 -0.140625q0.234375 -0.125 0.390625 -0.359375q0.109375 -0.171875 0.359375 -0.875q0.03125 -0.09375 0.109375 -0.28125l-2.625 -6.921875l1.265625 0l1.4375 4.0q0.28125 0.765625 0.5 1.59375q0.203125 -0.796875 0.46875 -1.578125l1.484375 -4.015625l1.171875 0l-2.625 7.015625q-0.421875 1.140625 -0.65625 1.578125q-0.3125 0.578125 -0.71875 0.84375q-0.40625 0.28125 -0.96875 0.28125q-0.328125 0 -0.75 -0.15625zm6.2421875 -4.71875l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625zm9.6953125 1.015625l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.8748627 -1.171875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.521843 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm19.32106 -2.53125l1.15625 0.15625q-0.1875 1.1875 -0.96875 1.859375q-0.78125 0.671875 -1.921875 0.671875q-1.4062653 0 -2.2812653 -0.921875q-0.859375 -0.9375 -0.859375 -2.65625q0 -1.125 0.375 -1.96875q0.375 -0.84375 1.125 -1.25q0.765625 -0.421875 1.6562653 -0.421875q1.125 0 1.84375 0.578125q0.71875 0.5625 0.921875 1.609375l-1.140625 0.171875q-0.171875 -0.703125 -0.59375 -1.046875q-0.40625 -0.359375 -0.984375 -0.359375q-0.890625 0 -1.4531403 0.640625q-0.546875 0.640625 -0.546875 2.0q0 1.40625 0.53125 2.03125q0.546875 0.625 1.4062653 0.625q0.6875 0 1.140625 -0.421875q0.46875 -0.421875 0.59375 -1.296875zm6.6640625 1.671875q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.96109 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.507965 -2.0625l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625z" fill-rule="nonzero"/><path fill="#fff2cc" d="m44.454067 95.40656l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path stroke="#f1c232" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m44.454067 95.40656l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m85.11598 121.51739l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0zm14.386429 5.234375l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm10.672592 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm8.485092 2.875l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm13.5625 1.421875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm6.9134827 1.46875l0 -13.359375l1.78125 0l0 6.625l6.625 -6.625l2.390625 0l-5.59375 5.421875l5.84375 7.9375l-2.328125 0l-4.765625 -6.765625l-2.171875 2.140625l0 4.625l-1.78125 0zm18.943573 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.228302 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm17.000717 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.094467 5.765625l0 -13.359375l1.640625 0l0 13.359375l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m12.454068 161.83176l57.574802 0" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m12.454068 161.83176l57.574802 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m193.71391 161.83176l60.7874 0" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m193.71391 161.83176l60.7874 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m70.02887 145.80026l123.68504 0l0 32.06299l-123.68504 0z" fill-rule="evenodd"/><path fill="#000000" d="m87.09864 166.63176l-3.6875 -9.546875l1.359375 0l2.484375 6.9375q0.296875 0.828125 0.5 1.5625q0.21875 -0.78125 0.515625 -1.5625l2.578125 -6.9375l1.28125 0l-3.734375 9.546875l-1.296875 0zm6.0303802 -8.1875l0 -1.359375l1.171875 0l0 1.359375l-1.171875 0zm0 8.1875l0 -6.90625l1.171875 0l0 6.90625l-1.171875 0zm2.92984 0l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm7.0164948 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.6717377 1.046875l0 -1.015625q-0.8125 1.171875 -2.1875 1.171875q-0.609375 0 -1.140625 -0.234375q-0.53125 -0.234375 -0.796875 -0.578125q-0.25 -0.359375 -0.359375 -0.875q-0.0625 -0.34375 -0.0625 -1.09375l0 -4.28125l1.171875 0l0 3.828125q0 0.921875 0.0625 1.234375q0.109375 0.46875 0.46875 0.734375q0.359375 0.25 0.890625 0.25q0.515625 0 0.984375 -0.265625q0.46875 -0.265625 0.65625 -0.734375q0.1875 -0.46875 0.1875 -1.34375l0 -3.703125l1.171875 0l0 6.90625l-1.046875 0zm7.3968506 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm6.6797028 0l0 -9.546875l1.171875 0l0 3.421875q0.828125 -0.9375 2.0781174 -0.9375q0.765625 0 1.328125 0.296875q0.5625 0.296875 0.8125 0.84375q0.25 0.53125 0.25 1.546875l0 4.375l-1.171875 0l0 -4.375q0 -0.890625 -0.390625 -1.28125q-0.375 -0.40625 -1.078125 -0.40625q-0.515625 0 -0.9843674 0.28125q-0.453125 0.265625 -0.65625 0.734375q-0.1875 0.453125 -0.1875 1.265625l0 3.78125l-1.171875 0zm11.928093 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm8.93837 0l0 -0.875q-0.65625 1.03125 -1.9375 1.03125q-0.8125 0 -1.515625 -0.453125q-0.6875 -0.453125 -1.078125 -1.265625q-0.375 -0.828125 -0.375 -1.890625q0 -1.03125 0.34375 -1.875q0.34375 -0.84375 1.03125 -1.28125q0.703125 -0.453125 1.546875 -0.453125q0.625 0 1.109375 0.265625q0.5 0.25 0.796875 0.671875l0 -3.421875l1.171875 0l0 9.546875l-1.09375 0zm-3.703125 -3.453125q0 1.328125 0.5625 1.984375q0.5625 0.65625 1.328125 0.65625q0.765625 0 1.296875 -0.625q0.53125 -0.625 0.53125 -1.90625q0 -1.421875 -0.546875 -2.078125q-0.546875 -0.671875 -1.34375 -0.671875q-0.78125 0 -1.3125 0.640625q-0.515625 0.625 -0.515625 2.0zm7.9124756 3.453125l-2.125 -6.90625l1.21875 0l1.09375 3.984375l0.421875 1.484375q0.015625 -0.109375 0.359375 -1.421875l1.09375 -4.046875l1.203125 0l1.03125 4.0l0.34375 1.328125l0.40625 -1.34375l1.171875 -3.984375l1.140625 0l-2.15625 6.90625l-1.21875 0l-1.09375 -4.140625l-0.265625 -1.171875l-1.40625 5.3125l-1.21875 0zm12.859528 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm9.18837 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375z" fill-rule="nonzero"/><path fill="#d9ead3" d="m44.454067 175.40657l174.83464 0l0 48.850388l-174.83464 0z" fill-rule="evenodd"/><path stroke="#93c47d" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m44.454067 175.40657l174.83464 0l0 48.850388l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m115.3588 206.75175l-5.171875 -13.359375l1.921875 0l3.46875 9.703125q0.421875 1.171875 0.703125 2.1875q0.3125 -1.09375 0.71875 -2.1875l3.609375 -9.703125l1.796875 0l-5.234375 13.359375l-1.8125 0zm8.584198 0l0 -13.359375l2.65625 0l3.1562424 9.453125q0.4375 1.328125 0.640625 1.984375q0.234375 -0.734375 0.703125 -2.140625l3.203125 -9.296875l2.375 0l0 13.359375l-1.703125 0l0 -11.171875l-3.875 11.171875l-1.59375 0l-3.8593674 -11.375l0 11.375l-1.703125 0zm15.540794 0l0 -13.359375l2.65625 0l3.15625 9.453125q0.4375 1.328125 0.640625 1.984375q0.234375 -0.734375 0.703125 -2.140625l3.203125 -9.296875l2.375 0l0 13.359375l-1.703125 0l0 -11.171875l-3.875 11.171875l-1.59375 0l-3.859375 -11.375l0 11.375l-1.703125 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m12.454068 239.1764l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m12.454068 239.1764l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m180.45407 239.1764l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m180.45407 239.1764l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m81.43044 222.81055l100.88189 0l0 32.06299l-100.88189 0z" fill-rule="evenodd"/><path fill="#000000" d="m95.06437 240.57954l1.203125 -0.109375q0.078125 0.71875 0.390625 1.1875q0.3125 0.453125 0.953125 0.734375q0.65625 0.28125 1.46875 0.28125q0.71875 0 1.265625 -0.21875q0.5625 -0.21875 0.828125 -0.578125q0.265625 -0.375 0.265625 -0.828125q0 -0.453125 -0.265625 -0.78125q-0.25 -0.328125 -0.84375 -0.5625q-0.390625 -0.15625 -1.703125 -0.46875q-1.3125 -0.3125 -1.84375 -0.59375q-0.671875 -0.359375 -1.015625 -0.890625q-0.328125 -0.53125 -0.328125 -1.1875q0 -0.71875 0.40625 -1.34375q0.40625 -0.625 1.1875 -0.953125q0.796875 -0.328125 1.765625 -0.328125q1.046875 0 1.859375 0.34375q0.8125 0.34375 1.25 1.015625q0.4375 0.65625 0.46875 1.484375l-1.203125 0.09375q-0.109375 -0.90625 -0.671875 -1.359375q-0.5625 -0.46875 -1.65625 -0.46875q-1.140625 0 -1.671875 0.421875q-0.515625 0.421875 -0.515625 1.015625q0 0.515625 0.359375 0.84375q0.375 0.328125 1.90625 0.6875q1.546875 0.34375 2.109375 0.59375q0.84375 0.390625 1.234375 0.984375q0.390625 0.578125 0.390625 1.359375q0 0.75 -0.4375 1.4375q-0.421875 0.671875 -1.25 1.046875q-0.8125 0.359375 -1.828125 0.359375q-1.296875 0 -2.171875 -0.375q-0.875 -0.375 -1.375 -1.125q-0.5 -0.765625 -0.53125 -1.71875zm9.12413 5.71875l-0.125 -1.09375q0.375 0.109375 0.65625 0.109375q0.390625 0 0.625 -0.140625q0.234375 -0.125 0.390625 -0.359375q0.109375 -0.171875 0.359375 -0.875q0.03125 -0.09375 0.109375 -0.28125l-2.625 -6.921875l1.265625 0l1.4375 4.0q0.28125 0.765625 0.5 1.59375q0.203125 -0.796875 0.46875 -1.578125l1.484375 -4.015625l1.171875 0l-2.625 7.015625q-0.421875 1.140625 -0.65625 1.578125q-0.3125 0.578125 -0.71875 0.84375q-0.40625 0.28125 -0.96875 0.28125q-0.328125 0 -0.75 -0.15625zm6.2421875 -4.71875l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625zm9.6953125 1.015625l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.8748627 -1.171875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.521843 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm19.32106 -2.53125l1.15625 0.15625q-0.1875 1.1875 -0.96875 1.859375q-0.78125 0.671875 -1.921875 0.671875q-1.4062653 0 -2.2812653 -0.921875q-0.859375 -0.9375 -0.859375 -2.65625q0 -1.125 0.375 -1.96875q0.375 -0.84375 1.125 -1.25q0.765625 -0.421875 1.6562653 -0.421875q1.125 0 1.84375 0.578125q0.71875 0.5625 0.921875 1.609375l-1.140625 0.171875q-0.171875 -0.703125 -0.59375 -1.046875q-0.40625 -0.359375 -0.984375 -0.359375q-0.890625 0 -1.4531403 0.640625q-0.546875 0.640625 -0.546875 2.0q0 1.40625 0.53125 2.03125q0.546875 0.625 1.4062653 0.625q0.6875 0 1.140625 -0.421875q0.46875 -0.421875 0.59375 -1.296875zm6.6640625 1.671875q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.96109 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.507965 -2.0625l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m44.454067 252.7512l174.83464 0l0 48.850388l-174.83464 0z" fill-rule="evenodd"/><path stroke="#6d9eeb" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m44.454067 252.7512l174.83464 0l0 48.850388l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m84.63558 284.0964l0 -13.359375l1.765625 0l0 5.484375l6.9375 0l0 -5.484375l1.765625 0l0 13.359375l-1.765625 0l0 -6.296875l-6.9375 0l0 6.296875l-1.765625 0zm12.597946 -4.84375q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm8.641342 1.953125l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm13.5625 1.421875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm6.913475 1.46875l0 -13.359375l1.78125 0l0 6.625l6.625 -6.625l2.390625 0l-5.59375 5.421875l5.84375 7.9375l-2.328125 0l-4.765625 -6.765625l-2.171875 2.140625l0 4.625l-1.78125 0zm18.943573 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.228302 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm17.000732 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.7656403 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375153 0 3.1562653 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.2187653 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.5468903 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.3906403 -2.65625l5.4062653 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.0312653 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.094467 5.765625l0 -13.359375l1.640625 0l0 13.359375l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m12.454068 319.17642l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m12.454068 319.17642l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m180.45407 319.17642l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m180.45407 319.17642l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m81.43044 302.81055l100.88189 0l0 32.06299l-100.88189 0z" fill-rule="evenodd"/><path fill="#000000" d="m104.04542 323.64203l0 -9.546875l1.265625 0l0 3.921875l4.953125 0l0 -3.921875l1.265625 0l0 9.546875l-1.265625 0l0 -4.5l-4.953125 0l0 4.5l-1.265625 0zm13.953278 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm8.938362 0l0 -0.875q-0.65625 1.03125 -1.9374924 1.03125q-0.8125 0 -1.515625 -0.453125q-0.6875 -0.453125 -1.078125 -1.265625q-0.375 -0.828125 -0.375 -1.890625q0 -1.03125 0.34375 -1.875q0.34375 -0.84375 1.03125 -1.28125q0.703125 -0.453125 1.546875 -0.453125q0.6249924 0 1.1093674 0.265625q0.5 0.25 0.796875 0.671875l0 -3.421875l1.171875 0l0 9.546875l-1.09375 0zm-3.7031174 -3.453125q0 1.328125 0.5625 1.984375q0.5625 0.65625 1.3281174 0.65625q0.765625 0 1.296875 -0.625q0.53125 -0.625 0.53125 -1.90625q0 -1.421875 -0.546875 -2.078125q-0.546875 -0.671875 -1.3437424 -0.671875q-0.78125 0 -1.3125 0.640625q-0.515625 0.625 -0.515625 2.0zm7.912468 3.453125l-2.125 -6.90625l1.21875 0l1.09375 3.984375l0.421875 1.484375q0.015625 -0.109375 0.359375 -1.421875l1.09375 -4.046875l1.203125 0l1.03125 4.0l0.34375 1.328125l0.40625 -1.34375l1.171875 -3.984375l1.140625 0l-2.15625 6.90625l-1.21875 0l-1.09375 -4.140625l-0.265625 -1.171875l-1.40625 5.3125l-1.21875 0zm12.859543 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59376526 0.21875 -1.2812653 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.4218903 -0.171875 2.0937653 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.3437653 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.89064026 0 1.4375153 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.9218903 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.2031403 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm9.18837 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375z" fill-rule="nonzero"/><defs><linearGradient id="p.1" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.545553100086654 0.0 0.0 4.545553100086654 0.0 0.0)" spreadMethod="pad" x1="9.954639806354566" y1="38.70166210013951" x2="9.95462288989064" y2="43.24721520019468"><stop offset="0.0" stop-color="#ff0000"/><stop offset="0.51" stop-color="#dab7a6"/><stop offset="0.99999994" stop-color="#dab7a6" stop-opacity="0.0"/><stop offset="1.0" stop-color="#ffffff" stop-opacity="0.0"/></linearGradient></defs><path fill="url(#p.1)" d="m45.249344 175.92108l173.29134 0l0 20.661423l-173.29134 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m280.4455 190.06865l129.5748 -74.83464l20.629913 35.74803l-129.5748 74.83464z" fill-rule="evenodd"/><path fill="#000000" d="m295.51392 196.73558l1.1823425 -0.82717896q0.51071167 0.6974335 1.1166077 0.9970703q0.5980835 0.28611755 1.4464111 0.1931305q0.84054565 -0.10652161 1.6794434 -0.5910187q0.75772095 -0.4376068 1.2010193 -0.9823456q0.44906616 -0.5660858 0.50097656 -1.1013031q0.057678223 -0.55656433 -0.20785522 -1.0166931q-0.27334595 -0.47366333 -0.7392273 -0.6557007q-0.47366333 -0.1955719 -1.2366333 -0.079711914q-0.478302 0.07775879 -2.032318 0.54222107q-1.5618286 0.45092773 -2.2805786 0.48712158q-0.9222717 0.027420044 -1.5864563 -0.31072998q-0.6719971 -0.35168457 -1.0703125 -1.0418701q-0.4295349 -0.74432373 -0.38497925 -1.6361694q0.05029297 -0.9131775 0.6668701 -1.7203827q0.63012695 -0.81500244 1.6313782 -1.3932648q1.1095276 -0.64079285 2.1592712 -0.7598877q1.0419617 -0.1326294 1.8867493 0.29968262q0.8583679 0.4244995 1.3987732 1.2671814l-1.2036743 0.82147217q-0.64712524 -0.87127686 -1.5022583 -1.0089111q-0.8629761 -0.15116882 -2.013092 0.51304626q-1.1906738 0.68766785 -1.4819641 1.4333191q-0.2913208 0.745636 0.06793213 1.3681641q0.30459595 0.52778625 0.8865051 0.6608429q0.5740967 0.119522095 2.3815613 -0.43717957q1.8210144 -0.5645294 2.5725403 -0.6376953q1.0924377 -0.107666016 1.857605 0.28042603q0.77090454 0.366745 1.2316895 1.1652069q0.4529724 0.78492737 0.39904785 1.7543335q-0.040405273 0.9616089 -0.6663208 1.8463745q-0.62594604 0.8847656 -1.6813354 1.4942932q-1.3530579 0.78144836 -2.486084 0.9125519q-1.1408386 0.11756897 -2.1214905 -0.3625946q-0.96713257 -0.48797607 -1.5721436 -1.4738007zm13.40155 -4.9431458l0.8006897 0.98106384q-0.45169067 0.4052124 -0.857605 0.6396332q-0.6629944 0.3829193 -1.1454773 0.39089966q-0.4902649 -0.0055389404 -0.8343506 -0.25790405q-0.3518982 -0.26589966 -0.9844971 -1.3620911l-2.382019 -4.1276093l-0.8930054 0.5157471l-0.5466919 -0.9473114l0.8930054 -0.5157623l-1.0308838 -1.786377l0.79599 -1.434082l1.4526367 2.5171661l1.2177734 -0.7033081l0.5466919 0.94732666l-1.2177734 0.70329285l2.4210815 4.195282q0.30456543 0.5278015 0.4446106 0.645401q0.15356445 0.109802246 0.35705566 0.11857605q0.19570923 -0.004760742 0.4663086 -0.16105652q0.20297241 -0.11721802 0.49645996 -0.35888672zm1.8165283 0.41241455l-4.147064 -7.1861115l1.0959778 -0.6329651l0.6247864 1.0826569q-0.0178833 -1.0001068 0.19332886 -1.4468842q0.21121216 -0.44676208 0.64419556 -0.6968231q0.6088562 -0.35165405 1.471283 -0.3264618l0.22875977 1.3654938q-0.59487915 7.4768066E-4 -1.0413818 0.25862122q-0.39239502 0.22662354 -0.5765381 0.6577606q-0.17843628 0.40979004 -0.06384277 0.92100525q0.17193604 0.7667999 0.61709595 1.5381927l2.1711426 3.7622223l-1.2177429 0.70329285zm2.0899658 -6.006668q-1.1480408 -1.9893646 -0.5930481 -3.5910034q0.47280884 -1.3376465 1.7988281 -2.1034698q1.4883423 -0.8595886 2.984253 -0.4243927q1.501648 0.41384888 2.4934998 2.132553q0.79660034 1.3803864 0.83795166 2.4210815q0.04135132 1.0407104 -0.49923706 1.948349q-0.5348511 0.88630676 -1.4819946 1.4333191q-1.5018921 0.8674011 -2.9899902 0.44573975q-1.4959106 -0.43519592 -2.5502625 -2.2621765zm1.2583313 -0.72673035q0.79663086 1.3803711 1.7902527 1.7267303q1.0072021 0.33854675 1.9137268 -0.18502808q0.9065552 -0.5235596 1.1036072 -1.5575867q0.21057129 -1.0418396 -0.60946655 -2.4628143q-0.76538086 -1.3262482 -1.7725525 -1.6647949q-0.99365234 -0.34635925 -1.9002075 0.17720032q-0.90652466 0.5235748 -1.117096 1.5654144q-0.2048645 1.0204926 0.59173584 2.400879zm8.984772 -0.38945007l-4.1470337 -7.1861115l1.0959778 -0.6329651l0.5857239 1.0149841q0.10531616 -1.6306152 1.6072083 -2.4980164q0.6494751 -0.37509155 1.3234558 -0.45761108q0.6739807 -0.08250427 1.163269 0.14013672q0.48928833 0.22264099 0.9021301 0.687912q0.26287842 0.29925537 0.74710083 1.1383057l2.553833 4.425354l-1.2177429 0.70329285l-2.522583 -4.371216q-0.42956543 -0.74432373 -0.7892456 -1.0237579q-0.3540039 -0.30078125 -0.8442688 -0.3063202q-0.47677612 -0.01335144 -0.9638672 0.2679596q-0.7847595 0.45324707 -1.0640869 1.2821655q-0.2735901 0.80758667 0.52301025 2.187973l2.264862 3.9246216l-1.2177429 0.70329285zm7.819275 -3.7220154l1.2922058 -0.511734q0.3878479 0.51579285 0.8666992 0.5640259q0.65527344 0.072631836 1.4400635 -0.38059998q0.8388977 -0.48449707 1.1036682 -1.0885162q0.26480103 -0.60401917 0.07571411 -1.306778q-0.1161499 -0.42010498 -0.81121826 -1.6245575q-0.25161743 1.408371 -1.4423218 2.096054q-1.4883423 0.85957336 -2.9171448 0.25932312q-1.428833 -0.60025024 -2.2957153 -2.1024323q-0.5935669 -1.0285187 -0.72805786 -2.1056366q-0.12097168 -1.0849152 0.30926514 -1.9649353q0.4437561 -0.8878174 1.3908997 -1.4348297q1.2718811 -0.7345581 2.690796 -0.182724l-0.4998474 -0.8661194l1.1230469 -0.6485901l3.5847168 6.2117157q0.9684448 1.6781158 1.0362854 2.5771942q0.067840576 0.8990936 -0.4420166 1.7348785q-0.5098877 0.8357849 -1.5923462 1.4609375q-1.2854004 0.7423706 -2.4195251 0.62150574q-1.1206055 -0.12869263 -1.7651672 -1.3081818zm-1.4765625 -4.9031525q0.81222534 1.4074402 1.7418518 1.7366486q0.94314575 0.32138062 1.7820435 -0.16311646q0.8388977 -0.4844818 1.0401001 -1.4487457q0.19342041 -0.97779846 -0.60317993 -2.3581848q-0.76538086 -1.3262482 -1.7298889 -1.6533508q-0.97229004 -0.3406372 -1.7976685 0.1360321q-0.8118286 0.46887207 -0.9974365 1.4602051q-0.1855774 0.991333 0.56417847 2.290512z" fill-rule="nonzero"/><path fill="#000000" d="m302.23132 207.68793l-0.80441284 -1.3939209l1.2177429 -0.70329285l0.80441284 1.3939056l-1.2177429 0.7033081zm4.920227 8.525894l-4.147064 -7.1861115l1.2177734 -0.7033081l4.1470337 7.1861115l-1.2177429 0.7033081zm1.3493347 -3.6482391l1.0948792 -0.88494873q0.51641846 0.6760864 1.2029724 0.80285645q0.6922302 0.10542297 1.5176086 -0.3712616q0.8388672 -0.4844818 1.0495605 -1.057251q0.20285034 -0.58628845 -0.062683105 -1.0464172q-0.23431396 -0.4059906 -0.7266846 -0.4464264q-0.35079956 -0.013916016 -1.4790955 0.3129425q-1.5347595 0.43530273 -2.2030334 0.49645996q-0.66256714 0.03982544 -1.183075 -0.23695374q-0.5069885 -0.28459167 -0.8115845 -0.8123779q-0.28115845 -0.48719788 -0.2989807 -1.018219q-0.017791748 -0.5310211 0.2048645 -1.0204926q0.15917969 -0.3806305 0.56817627 -0.797287q0.4147339 -0.43800354 0.9694824 -0.75839233q0.852417 -0.49230957 1.6289368 -0.61598206q0.7765198 -0.123687744 1.3162842 0.123931885q0.5455017 0.22625732 1.0733948 0.85964966l-1.0969849 0.85006714q-0.4013672 -0.5079651 -0.9733887 -0.59262085q-0.5720215 -0.0846405 -1.2756042 0.3217163q-0.8388977 0.4844818 -1.0402222 0.9796753q-0.19558716 0.47383118 0.015289307 0.8392334q0.14056396 0.24359131 0.39874268 0.34710693q0.2581787 0.103500366 0.64749146 0.05909729q0.22845459 -0.041732788 1.2620544 -0.31388855q1.4806519 -0.40403748 2.127594 -0.470932q0.6390991 -0.08041382 1.1653442 0.17501831q0.5397949 0.24760437 0.89904785 0.87013245q0.35144043 0.60899353 0.29852295 1.3613129q-0.047210693 0.73095703 -0.5519409 1.4194183q-0.49118042 0.68063354 -1.3300476 1.1651306q-1.407196 0.81269836 -2.4736633 0.6527405q-1.0664673 -0.15994263 -1.933258 -1.193039zm6.1190186 -5.4646606q-1.1480408 -1.9893799 -0.5930481 -3.5910187q0.47283936 -1.3376465 1.7988281 -2.1034698q1.4883423 -0.85957336 2.984253 -0.4243927q1.501648 0.41384888 2.4934998 2.132553q0.79663086 1.3803864 0.83795166 2.4210968q0.04135132 1.0406952 -0.49923706 1.948349q-0.5348511 0.8862915 -1.4819946 1.4333038q-1.5018921 0.8674011 -2.9899902 0.44573975q-1.4959106 -0.43519592 -2.5502625 -2.2621613zm1.2583313 -0.7267456q0.79663086 1.3803864 1.7902832 1.7267303q1.0071716 0.33854675 1.9136963 -0.18501282q0.9065552 -0.5235748 1.1036072 -1.5576019q0.21057129 -1.0418396 -0.60946655 -2.4628143q-0.76538086 -1.3262482 -1.7725525 -1.6647949q-0.99365234 -0.34635925 -1.9002075 0.17721558q-0.90652466 0.5235596 -1.117096 1.5653992q-0.2048645 1.0204926 0.59173584 2.400879zm8.957733 -0.3738098l-5.7246704 -9.919815l1.2177734 -0.70329285l5.72464 9.9198l-1.2177429 0.7033081zm7.2713623 -5.390396q-0.34069824 0.972641 -0.8225403 1.5756989q-0.48962402 0.5895386 -1.2067566 1.0036926q-1.1906738 0.6876831 -2.162445 0.47302246q-0.9717407 -0.21464539 -1.4872131 -1.1078339q-0.30456543 -0.52778625 -0.3109436 -1.1015167q-0.014190674 -0.58724976 0.21627808 -1.0631866q0.23620605 -0.49728394 0.64520264 -0.9139252q0.31063843 -0.3057251 0.980896 -0.8010864q1.3733215 -1.02771 1.9363098 -1.6776581q-0.14837646 -0.25712585 -0.18740845 -0.32478333q-0.42956543 -0.74432373 -0.93963623 -0.84669495q-0.7156677 -0.14602661 -1.6357422 0.38536072q-0.8524475 0.49230957 -1.0922546 1.0458069q-0.23410034 0.5321655 0.013824463 1.3994293l-1.2843933 0.52526855q-0.2749939 -0.85162354 -0.18301392 -1.5362854q0.10549927 -0.6924591 0.66851807 -1.3424072q0.5552063 -0.66348267 1.4752808 -1.1948547q0.92007446 -0.5313873 1.6133118 -0.6430664q0.7067566 -0.11949158 1.1648254 0.04902649q0.45803833 0.16850281 0.8552551 0.60672q0.24728394 0.27218628 0.71588135 1.0841675l0.9371643 1.6239929q0.9840393 1.7051697 1.3094177 2.1126862q0.33892822 0.39971924 0.81103516 0.68640137l-1.2718506 0.7345581q-0.40811157 -0.26953125 -0.7590027 -0.75253296zm-1.6645203 -2.6654663q-0.5067749 0.65356445 -1.7234497 1.6088562q-0.69522095 0.5458679 -0.9283142 0.8609314q-0.23312378 0.31506348 -0.25280762 0.6873169q-0.013977051 0.3508911 0.16564941 0.66215515q0.28115845 0.48719788 0.83392334 0.6010132q0.5662842 0.10598755 1.269867 -0.30036926q0.70358276 -0.40634155 1.072998 -1.0166473q0.37512207 -0.63165283 0.3197937 -1.3214569q-0.031341553 -0.5232086 -0.49990845 -1.3352051l-0.25775146 -0.44659424zm7.223419 -0.8156891l0.8006897 0.98106384q-0.45169067 0.4052124 -0.857605 0.63964844q-0.6629944 0.38290405 -1.1454468 0.39089966q-0.4902954 -0.0055389404 -0.8343811 -0.25790405q-0.3518982 -0.26591492 -0.9844971 -1.3620911l-2.382019 -4.1276093l-0.8930054 0.5157471l-0.5466919 -0.94732666l0.8930054 -0.5157471l-1.0308838 -1.786377l0.7960205 -1.434082l1.4526367 2.5171661l1.2177429 -0.7033081l0.5466919 0.94732666l-1.2177429 0.70329285l2.421051 4.195282q0.30459595 0.52778625 0.4446106 0.645401q0.15356445 0.10978699 0.35708618 0.11856079q0.19567871 -0.004760742 0.46627808 -0.16104126q0.20297241 -0.11721802 0.49645996 -0.35890198zm-3.0901794 -8.121277l-0.80441284 -1.3939209l1.2177429 -0.70329285l0.80444336 1.3939209l-1.2177734 0.70329285zm4.920227 8.525894l-4.1470337 -7.1861115l1.2177429 -0.70329285l4.147064 7.186096l-1.2177734 0.7033081zm0.54074097 -5.111908q-1.1480408 -1.9893799 -0.5930481 -3.5910187q0.47280884 -1.3376465 1.7988281 -2.1034698q1.4883423 -0.8595886 2.984253 -0.4243927q1.501648 0.41384888 2.4934998 2.132553q0.79660034 1.3803864 0.83795166 2.4210968q0.0413208 1.0406952 -0.49923706 1.948349q-0.5348511 0.8862915 -1.4819946 1.4333038q-1.5018921 0.8674011 -2.9899902 0.44573975q-1.4959412 -0.43519592 -2.5502625 -2.2621613zm1.2583313 -0.7267456q0.79663086 1.3803864 1.7902527 1.7267303q1.0072021 0.33854675 1.9137268 -0.18501282q0.9065552 -0.5235748 1.1036072 -1.5576019q0.21057129 -1.0418396 -0.60946655 -2.4628143q-0.76538086 -1.3262482 -1.7725525 -1.6647949q-0.99365234 -0.34635925 -1.9002075 0.17721558q-0.9065552 0.5235596 -1.117096 1.5653992q-0.2048645 1.0204926 0.59173584 2.400879zm8.984772 -0.38945007l-4.1470337 -7.1861115l1.0959473 -0.6329651l0.5857544 1.0149841q0.10531616 -1.6306152 1.6072083 -2.4980164q0.6494446 -0.37509155 1.3234558 -0.45761108q0.6739807 -0.08250427 1.163269 0.14013672q0.48928833 0.22264099 0.9021301 0.687912q0.26287842 0.29925537 0.74710083 1.1383057l2.553833 4.425354l-1.2177429 0.70329285l-2.5226135 -4.371216q-0.4295349 -0.74432373 -0.7892456 -1.0237579q-0.3539734 -0.30078125 -0.8442383 -0.3063202q-0.47677612 -0.01335144 -0.9638672 0.2679596q-0.7847595 0.45324707 -1.0640869 1.2821808q-0.2735901 0.8075714 0.52301025 2.1879578l2.264862 3.9246216l-1.2177429 0.70329285z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m290.76047 207.94267c-17.003845 0 -26.795105 -5.566925 -34.00769 -11.133865c-7.212616 -5.566925 -11.846558 -11.13385 -23.693115 -11.13385" fill-rule="evenodd"/><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m290.76047 207.94267c-17.003876 0 -26.795105 -5.5669403 -34.00769 -11.133865c-3.606308 -2.7834625 -6.5679474 -5.566925 -10.108826 -7.6545258c-0.4426117 -0.2609558 -0.8942871 -0.5110321 -1.3573761 -0.74887085c-0.11578369 -0.0594635 -0.23228455 -0.1181488 -0.34950256 -0.17605591l-0.13806152 -0.066833496" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="2.0" stroke-linecap="butt" d="m245.48381 184.93082l-9.563843 1.350235l8.194244 5.1131744z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m280.4455 126.06866l129.5748 -74.83465l20.629913 35.74803l-129.5748 74.83464z" fill-rule="evenodd"/><path fill="#000000" d="m298.03357 135.5387l-5.72464 -9.919807l1.3124695 -0.75800323l5.72464 9.9198l-1.3124695 0.75801086zm3.470581 -2.0044098l-4.1470337 -7.186104l1.0959778 -0.6329651l0.5857239 1.0149841q0.10531616 -1.6306229 1.6072083 -2.498024q0.6494751 -0.37509155 1.3234558 -0.45760345q0.6739807 -0.0825119 1.163269 0.14012146q0.48928833 0.22264099 0.9021301 0.6879196q0.26290894 0.29925537 0.74710083 1.1383133l2.553833 4.4253464l-1.2177429 0.70329285l-2.522583 -4.371208q-0.4295349 -0.74432373 -0.7892456 -1.0237579q-0.3540039 -0.30078888 -0.8442688 -0.30632782q-0.4767456 -0.01335144 -0.9638672 0.26796722q-0.7847595 0.4532318 -1.0640869 1.2821655q-0.2735901 0.80758667 0.52301025 2.1879654l2.2648926 3.9246216l-1.2177734 0.70329285zm12.360168 -7.1384735l-0.5232849 -0.906723q-0.059539795 1.4598389 -1.3855286 2.2256546q-0.8659668 0.5001297 -1.8564148 0.44062042q-0.9904785 -0.05949402 -1.8883972 -0.6776428q-0.88442993 -0.62597656 -1.5170288 -1.7221603q-0.6247864 -1.0826492 -0.77282715 -2.151947q-0.14230347 -1.0906448 0.30926514 -1.9649353q0.44378662 -0.887825 1.336792 -1.4035797q0.6494751 -0.37509155 1.3063049 -0.39356232q0.6703491 -0.026283264 1.2392883 0.2405777l-2.054016 -3.5592194l1.2177429 -0.7033005l5.7246704 9.919807l-1.1365662 0.6564102zm-5.9122925 -1.3669891q0.79660034 1.3803787 1.7767029 1.7345505q0.97232056 0.3406372 1.7570801 -0.112602234q0.7983093 -0.4610443 0.97817993 -1.4310303q0.18560791 -0.991333 -0.58758545 -2.3311157q-0.8512573 -1.4751129 -1.8178406 -1.8370972q-0.96658325 -0.36198425 -1.805481 0.12251282q-0.8118286 0.46886444 -0.97036743 1.4445648q-0.15853882 0.9757004 0.6693115 2.4102173zm12.53952 -5.5459747l1.3442383 -0.57787323q0.34274292 1.2816315 -0.11764526 2.3594894q-0.4468689 1.0700455 -1.6916809 1.788971q-1.5830688 0.9142914 -3.0654602 0.47128296q-1.4823914 -0.44300842 -2.4898376 -2.1887817q-1.0465393 -1.813446 -0.69085693 -3.3540955q0.35568237 -1.5406494 1.8440247 -2.4002304q1.4342346 -0.828331 2.9109192 -0.36397552q1.4823608 0.4430008 2.5054626 2.2158508q0.0625 0.10826111 0.18743896 0.32479095l-5.3580933 3.094513q0.75494385 1.1518478 1.7173462 1.4440689q0.96810913 0.27088165 1.861145 -0.24487305q0.6765137 -0.39072418 0.9470215 -1.0160904q0.2705078 -0.6253662 0.09597778 -1.5530472zm-5.131775 0.32941437l4.005066 -2.3130646q-0.60446167 -0.8598404 -1.2410278 -1.0876312q-0.98794556 -0.36769867 -1.90802 0.16368103q-0.8388977 0.48449707 -1.0926819 1.3889084q-0.2480774 0.88306427 0.23666382 1.8481064zm10.854523 3.1318436l-5.740265 -9.946869l1.1094971 -0.6407852l0.5388794 0.9337845q0.07220459 -0.78147125 0.4031067 -1.333458q0.3444214 -0.5597992 1.0480042 -0.9661484q0.92007446 -0.5313797 1.8970032 -0.46406555q0.9769592 0.06730652 1.8285828 0.7302551q0.8573303 0.64160156 1.4508972 1.6701202q0.6404114 1.1097183 0.74212646 2.2238083q0.11526489 1.1062698 -0.3691101 2.01754q-0.4708252 0.90345 -1.3097229 1.3879471q-0.6088867 0.35164642 -1.2443542 0.37583923q-0.62197876 0.01637268 -1.159668 -0.19635773l2.022766 3.5050888l-1.2177429 0.7033005zm-2.551239 -6.952957q0.80441284 1.3939209 1.7418518 1.7366486q0.95095825 0.3349228 1.7492676 -0.12612915q0.8118286 -0.46886444 0.9953308 -1.495079q0.18353271 -1.026207 -0.6443176 -2.4607239q-0.79663086 -1.3803787 -1.7554016 -1.728836q-0.96658325 -0.36198425 -1.7513428 0.09125519q-0.77124023 0.4454193 -0.958374 1.5278625q-0.1736145 1.0746231 0.62298584 2.4550018zm12.239746 -5.408928l1.3442688 -0.57788086q0.34274292 1.2816391 -0.11764526 2.359497q-0.4468689 1.0700455 -1.6916809 1.788971q-1.5830688 0.91428375 -3.0654602 0.47128296q-1.4823914 -0.44300842 -2.4898682 -2.1887894q-1.0465088 -1.8134384 -0.6908264 -3.3540878q0.35565186 -1.5406494 1.8440247 -2.400238q1.4342346 -0.828331 2.9108887 -0.36397552q1.4823914 0.44300842 2.5054932 2.2158508q0.062469482 0.10826874 0.18743896 0.32479858l-5.3580933 3.094513q0.75491333 1.1518402 1.7173462 1.4440689q0.96810913 0.27087402 1.861145 -0.24488068q0.6765137 -0.39071655 0.9470215 -1.0160828q0.2705078 -0.62537384 0.095947266 -1.5530472zm-5.1317444 0.32941437l4.0050354 -2.3130722q-0.60443115 -0.85983276 -1.2410278 -1.0876236q-0.98791504 -0.3677063 -1.9079895 0.1636734q-0.8388977 0.48449707 -1.0926819 1.3889084q-0.2480774 0.88306427 0.23666382 1.848114zm9.2612915 0.3710785l-4.1470337 -7.1861115l1.0959778 -0.6329651l0.5857239 1.0149841q0.10531616 -1.6306229 1.6072083 -2.498024q0.6494751 -0.37509155 1.3234558 -0.45760345q0.6739807 -0.0825119 1.163269 0.14012909q0.48928833 0.22263336 0.9021301 0.687912q0.26290894 0.29925537 0.74710083 1.1383133l2.553833 4.4253464l-1.2177429 0.70329285l-2.522583 -4.371208q-0.42956543 -0.74432373 -0.7892456 -1.0237579q-0.3540039 -0.30078888 -0.8442688 -0.30632782q-0.4767456 -0.01335144 -0.9638672 0.26796722q-0.7847595 0.4532318 -1.0640869 1.2821655q-0.2735901 0.80758667 0.52301025 2.187973l2.2648926 3.924614l-1.2177734 0.7033005zm12.360168 -7.138481l-0.5232849 -0.906723q-0.059539795 1.4598389 -1.3855286 2.2256546q-0.8659668 0.5001221 -1.8564148 0.44062805q-0.9904785 -0.05949402 -1.8883972 -0.67765045q-0.88442993 -0.62596893 -1.5170288 -1.7221603q-0.6247864 -1.0826492 -0.77282715 -2.151947q-0.14230347 -1.0906448 0.30926514 -1.9649353q0.44378662 -0.887825 1.336792 -1.4035797q0.6494751 -0.37509155 1.3063049 -0.39356232q0.6703491 -0.026283264 1.2392883 0.2405777l-2.054016 -3.5592194l1.2177429 -0.7033005l5.7246704 9.919807l-1.1365662 0.6564102zm-5.9122925 -1.3669891q0.79660034 1.3803787 1.7767029 1.7345505q0.97232056 0.3406372 1.7570801 -0.112602234q0.7983093 -0.4610443 0.97817993 -1.4310303q0.18560791 -0.991333 -0.58758545 -2.3311157q-0.8512573 -1.4751129 -1.8178406 -1.8370972q-0.96658325 -0.36198425 -1.805481 0.12251282q-0.8118286 0.46886444 -0.97036743 1.4445648q-0.15853882 0.9757004 0.6693115 2.4102173zm12.53952 -5.5459747l1.3442383 -0.57787323q0.34274292 1.2816391 -0.11764526 2.3594894q-0.4468689 1.0700455 -1.6916809 1.788971q-1.5830688 0.9142914 -3.0654602 0.47128296q-1.4823914 -0.4430008 -2.4898682 -2.1887817q-1.0465088 -1.813446 -0.6908264 -3.3540955q0.35568237 -1.5406494 1.8440247 -2.4002304q1.4342346 -0.828331 2.9109192 -0.36397552q1.4823608 0.4430008 2.5054626 2.2158508q0.0625 0.10826111 0.18743896 0.32479095l-5.3580933 3.094513q0.75494385 1.1518478 1.7173462 1.4440689q0.96810913 0.27088165 1.861145 -0.24487305q0.6765137 -0.39072418 0.9470215 -1.0160904q0.2705078 -0.6253662 0.09597778 -1.5530472zm-5.131775 0.32941437l4.005066 -2.3130646q-0.60446167 -0.8598404 -1.2410278 -1.0876312q-0.98794556 -0.36769867 -1.90802 0.16368103q-0.8388977 0.48449707 -1.0926819 1.3889084q-0.2480774 0.88306427 0.23666382 1.8481064zm9.261322 0.3710785l-4.147064 -7.186104l1.0959778 -0.6329727l0.5857544 1.0149918q0.105285645 -1.6306229 1.6071777 -2.498024q0.6494751 -0.37509155 1.3234558 -0.45761108q0.67401123 -0.0825119 1.163269 0.14012909q0.48928833 0.22264099 0.9021301 0.6879196q0.26290894 0.29925537 0.74710083 1.1383133l2.553833 4.4253387l-1.2177429 0.7033005l-2.522583 -4.371208q-0.4295349 -0.74432373 -0.7892456 -1.0237656q-0.3539734 -0.30078125 -0.8442688 -0.3063202q-0.4767456 -0.01335907 -0.96383667 0.2679596q-0.78479004 0.45323944 -1.0640869 1.2821732q-0.2736206 0.80758667 0.52301025 2.1879654l2.264862 3.924614l-1.2177429 0.7033005zm9.725006 -7.078125l0.8006897 0.98106384q-0.45169067 0.4052124 -0.857605 0.6396408q-0.6629944 0.38291168 -1.1454468 0.39089966q-0.4902954 -0.0055389404 -0.8343811 -0.25791168q-0.3518982 -0.26589966 -0.9844971 -1.3620834l-2.382019 -4.127617l-0.8930054 0.5157547l-0.5466919 -0.94731903l0.8930054 -0.5157547l-1.0308838 -1.786377l0.79599 -1.4340897l1.4526672 2.5171661l1.2177429 -0.7033005l0.5466919 0.94732666l-1.2177429 0.70329285l2.421051 4.195282q0.30459595 0.5277939 0.4446106 0.645401q0.15356445 0.10979462 0.35708618 0.11856842q0.19567871 -0.004760742 0.46627808 -0.16104889q0.20297241 -0.11721802 0.49645996 -0.35889435z" fill-rule="nonzero"/><path fill="#000000" d="m307.15155 152.21382l-5.72464 -9.919815l1.2177429 -0.70329285l3.2645264 5.6568604l1.1950684 -4.587631l1.5830688 -0.9142914l-1.2081604 4.252365l5.625824 2.7774506l-1.5018921 0.8674011l-4.4921265 -2.3134918l-0.38955688 1.3256378l1.6478882 2.8554993l-1.2177429 0.7033081zm10.503723 -9.151794l1.3442383 -0.57788086q0.34274292 1.2816315 -0.11764526 2.359497q-0.44683838 1.0700378 -1.6916504 1.788971q-1.5830688 0.9142761 -3.0654602 0.47128296q-1.4823914 -0.44300842 -2.4898682 -2.188797q-1.0465393 -1.8134308 -0.69085693 -3.3540802q0.35568237 -1.5406494 1.8440247 -2.400238q1.4342346 -0.8283386 2.9109192 -0.36398315q1.4823914 0.44300842 2.5054932 2.2158508q0.062469482 0.10827637 0.18743896 0.32479858l-5.3580933 3.094513q0.75491333 1.1518402 1.7173157 1.4440765q0.96813965 0.27087402 1.861145 -0.2448883q0.6765137 -0.39071655 0.947052 -1.0160828q0.2705078 -0.6253662 0.095947266 -1.5530396zm-5.1317444 0.32940674l4.0050354 -2.3130646q-0.60446167 -0.85983276 -1.2410278 -1.0876312q-0.98791504 -0.3677063 -1.90802 0.16368103q-0.8388672 0.48449707 -1.0926819 1.3889008q-0.2480774 0.8830719 0.23669434 1.848114zm9.247772 0.378891l-4.147064 -7.1861115l1.0959778 -0.6329651l0.6247864 1.0826569q-0.017913818 -1.0001068 0.19329834 -1.4468842q0.21124268 -0.44676208 0.64419556 -0.6968231q0.6088867 -0.35165405 1.4713135 -0.3264618l0.22875977 1.3654938q-0.59487915 7.4768066E-4 -1.0413818 0.25862122q-0.39239502 0.22662354 -0.5765686 0.6577606q-0.17840576 0.40979004 -0.063812256 0.92100525q0.17190552 0.7667999 0.61709595 1.5381927l2.1711426 3.7622223l-1.2177429 0.70329285zm4.6274414 -2.6725311l-4.147064 -7.1861115l1.0959778 -0.6329651l0.5857544 1.0149841q0.105285645 -1.6306152 1.6071777 -2.4980164q0.6494751 -0.37509155 1.3234558 -0.45761108q0.67401123 -0.08250427 1.1632996 0.14012146q0.4892578 0.22264099 0.9020996 0.68792725q0.26290894 0.29925537 0.74710083 1.1383057l2.553833 4.425354l-1.2177429 0.70329285l-2.522583 -4.371216q-0.4295349 -0.74432373 -0.7892456 -1.0237579q-0.3539734 -0.30078125 -0.8442688 -0.3063202q-0.4767456 -0.01335144 -0.96383667 0.2679596q-0.78479004 0.4532318 -1.0640869 1.2821655q-0.2736206 0.80758667 0.52301025 2.187973l2.264862 3.9246216l-1.2177429 0.70329285zm11.281708 -9.60112l1.3442688 -0.57788086q0.34274292 1.2816315 -0.11764526 2.359497q-0.4468689 1.0700378 -1.6916809 1.788971q-1.5830688 0.9142914 -3.0654602 0.47128296q-1.4823914 -0.44300842 -2.4898682 -2.1887817q-1.0465088 -1.813446 -0.6908264 -3.3540955q0.35565186 -1.5406494 1.8440247 -2.400238q1.4342346 -0.828331 2.9108887 -0.36397552q1.4823914 0.44300842 2.5054932 2.2158432q0.062469482 0.10827637 0.18743896 0.32479858l-5.3580933 3.094513q0.75491333 1.1518555 1.7173462 1.4440765q0.96810913 0.27087402 1.861145 -0.24487305q0.6765137 -0.3907318 0.9470215 -1.016098q0.2705078 -0.6253662 0.095947266 -1.5530396zm-5.1317444 0.32940674l4.0050354 -2.3130646q-0.60443115 -0.85983276 -1.2410278 -1.0876312q-0.98791504 -0.3677063 -1.9079895 0.16368103q-0.8388977 0.48449707 -1.0926819 1.388916q-0.2480774 0.88305664 0.23666382 1.8480988zm9.234253 0.3867035l-5.7246704 -9.9198l1.2177734 -0.7033005l5.72464 9.919807l-1.2177429 0.70329285z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m290.76047 143.94267c-17.003845 0 -26.795105 -5.566925 -34.00769 -11.133865c-7.212616 -5.566925 -11.846558 -11.13385 -23.693115 -11.13385" fill-rule="evenodd"/><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m290.76047 143.94267c-17.003876 0 -26.795105 -5.5669403 -34.00769 -11.133865c-3.606308 -2.7834625 -6.5679474 -5.566925 -10.108826 -7.6545258c-0.4426117 -0.26094818 -0.8942871 -0.5110321 -1.3573761 -0.74887085c-0.11578369 -0.0594635 -0.23228455 -0.11816406 -0.34950256 -0.17607117l-0.13806152 -0.06682587" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="2.0" stroke-linecap="butt" d="m245.48381 120.93081l-9.563843 1.350235l8.194244 5.113182z" fill-rule="evenodd"/></g></svg>
+\ No newline at end of file
diff --git a/g3doc/README.md b/g3doc/README.md
index 49d58cdae..304a91493 100644
--- a/g3doc/README.md
+++ b/g3doc/README.md
@@ -1,2 +1,168 @@
-The gVisor logo files are licensed under CC BY-SA 4.0 (Creative Commons
-Attribution-ShareAlike 4.0 International).
+# What is gVisor?
+
+gVisor is an application kernel, written in Go, that implements a substantial
+portion of the [Linux system call interface][linux]. It provides an additional
+layer of isolation between running applications and the host operating system.
+
+gVisor includes an [Open Container Initiative (OCI)][oci] runtime called `runsc`
+that makes it easy to work with existing container tooling. The `runsc` runtime
+integrates with Docker and Kubernetes, making it simple to run sandboxed
+containers.
+
+gVisor can be used with Docker, Kubernetes, or directly using `runsc`. Use the
+links below to see detailed instructions for each of them:
+
+*   [Docker](./user_guide/quick_start/docker.md): The quickest and easiest way
+    to get started.
+*   [Kubernetes](./user_guide/quick_start/kubernetes.md): Isolate Pods in your
+    K8s cluster with gVisor.
+*   [OCI Quick Start](./user_guide/quick_start/oci.md): Expert mode. Customize
+    gVisor for your environment.
+
+## What does gVisor do?
+
+gVisor provides a virtualized environment in order to sandbox containers. The
+system interfaces normally implemented by the host kernel are moved into a
+distinct, per-sandbox application kernel in order to minimize the risk of an
+container escape exploit. gVisor does not introduce large fixed overheads
+however, and still retains a process-like model with respect to resource
+utilization.
+
+## How is this different?
+
+Two other approaches are commonly taken to provide stronger isolation than
+native containers.
+
+**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes
+virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This
+virtualized hardware is generally enlightened (paravirtualized) and additional
+mechanisms can be used to improve the visibility between the guest and host
+(e.g. balloon drivers, paravirtualized spinlocks). Running containers in
+distinct virtual machines can provide great isolation, compatibility and
+performance (though nested virtualization may bring challenges in this area),
+but for containers it often requires additional proxies and agents, and may
+require a larger resource footprint and slower start-up times.
+
+![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization")
+
+**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and
+[AppArmor][apparmor], allows the specification of a fine-grained security policy
+for an application or container. These schemes typically rely on hooks
+implemented inside the host kernel to enforce the rules. If the surface can be
+made small enough, then this is an excellent way to sandbox applications and
+maintain native performance. However, in practice it can be extremely difficult
+(if not impossible) to reliably define a policy for arbitrary, previously
+unknown applications, making this approach challenging to apply universally.
+
+![Rule-based execution](Rule-Based-Execution.png "Rule-based execution")
+
+Rule-based execution is often combined with additional layers for
+defense-in-depth.
+
+**gVisor** provides a third isolation mechanism, distinct from those above.
+
+gVisor intercepts application system calls and acts as the guest kernel, without
+the need for translation through virtualized hardware. gVisor may be thought of
+as either a merged guest kernel and VMM, or as seccomp on steroids. This
+architecture allows it to provide a flexible resource footprint (i.e. one based
+on threads and memory mappings, not fixed guest physical resources) while also
+lowering the fixed costs of virtualization. However, this comes at the price of
+reduced application compatibility and higher per-system call overhead.
+
+![gVisor](Layers.png "gVisor")
+
+On top of this, gVisor employs rule-based execution to provide defense-in-depth
+(details below).
+
+gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML
+virtualizes hardware internally and thus provides a fixed resource footprint.
+
+Each of the above approaches may excel in distinct scenarios. For example,
+machine-level virtualization will face challenges achieving high density, while
+gVisor may provide poor performance for system call heavy workloads.
+
+## Why Go?
+
+gVisor is written in [Go][golang] in order to avoid security pitfalls that can
+plague kernels. With Go, there are strong types, built-in bounds checks, no
+uninitialized variables, no use-after-free, no stack overflow, and a built-in
+race detector. However, the use of Go has its challenges, and the runtime often
+introduces performance overhead.
+
+## What are the different components?
+
+A gVisor sandbox consists of multiple processes. These processes collectively
+comprise an environment in which one or more containers can be run.
+
+Each sandbox has its own isolated instance of:
+
+*   The **Sentry**, which is a kernel that runs the containers and intercepts
+    and responds to system calls made by the application.
+
+Each container running in the sandbox has its own isolated instance of:
+
+*   A **Gofer** which provides file system access to the containers.
+
+![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram")
+
+## What is runsc?
+
+The entrypoint to running a sandboxed container is the `runsc` executable.
+`runsc` implements the [Open Container Initiative (OCI)][oci] runtime
+specification, which is used by Docker and Kubernetes. This means that OCI
+compatible _filesystem bundles_ can be run by `runsc`. Filesystem bundles are
+comprised of a `config.json` file containing container configuration, and a root
+filesystem for the container. Please see the [OCI runtime spec][runtime-spec]
+for more information on filesystem bundles. `runsc` implements multiple commands
+that perform various functions such as starting, stopping, listing, and querying
+the status of containers.
+
+### Sentry
+
+<a name="sentry"></a> <!-- For deep linking. -->
+
+The Sentry is the largest component of gVisor. It can be thought of as a
+application kernel. The Sentry implements all the kernel functionality needed by
+the application, including: system calls, signal delivery, memory management and
+page faulting logic, the threading model, and more.
+
+When the application makes a system call, the
+[Platform](./architecture_guide/platforms.md) redirects the call to the Sentry,
+which will do the necessary work to service it. It is important to note that the
+Sentry does not pass system calls through to the host kernel. As a userspace
+application, the Sentry will make some host system calls to support its
+operation, but it does not allow the application to directly control the system
+calls it makes. For example, the Sentry is not able to open files directly; file
+system operations that extend beyond the sandbox (not internal `/proc` files,
+pipes, etc) are sent to the Gofer, described below.
+
+### Gofer
+
+<a name="gofer"></a> <!-- For deep linking. -->
+
+The Gofer is a standard host process which is started with each container and
+communicates with the Sentry via the [9P protocol][9p] over a socket or shared
+memory channel. The Sentry process is started in a restricted seccomp container
+without access to file system resources. The Gofer mediates all access to the
+these resources, providing an additional level of isolation.
+
+### Application
+
+The application is a normal Linux binary provided to gVisor in an OCI runtime
+bundle. gVisor aims to provide an environment equivalent to Linux v4.4, so
+applications should be able to run unmodified. However, gVisor does not
+presently implement every system call, `/proc` file, or `/sys` file so some
+incompatibilities may occur. See [Commpatibility](./user_guide/compatibility.md)
+for more information.
+
+[9p]: https://en.wikipedia.org/wiki/9P_(protocol)
+[apparmor]: https://wiki.ubuntu.com/AppArmor
+[golang]: https://golang.org
+[kvm]: https://www.linux-kvm.org
+[linux]: https://en.wikipedia.org/wiki/Linux_kernel_interfaces
+[oci]: https://www.opencontainers.org
+[runtime-spec]: https://github.com/opencontainers/runtime-spec
+[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt
+[selinux]: https://selinuxproject.org
+[uml]: http://user-mode-linux.sourceforge.net/
+[xen]: https://www.xenproject.org
diff --git a/g3doc/Rule-Based-Execution.png b/g3doc/Rule-Based-Execution.png
new file mode 100644
index 000000000..b42654a90
--- /dev/null
+++ b/g3doc/Rule-Based-Execution.png
diff --git a/g3doc/Rule-Based-Execution.svg b/g3doc/Rule-Based-Execution.svg
new file mode 100644
index 000000000..bd6717043
--- /dev/null
+++ b/g3doc/Rule-Based-Execution.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 355.03674540682414 172.5564304461942" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l355.03674 0l0 172.55643l-355.03674 0l0 -172.55643z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l355.03674 0l0 172.55643l-355.03674 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m36.454067 6.6430445l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path stroke="#cc4125" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m36.454067 6.6430445l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m78.206116 37.98824l5.125 -13.359373l1.90625 0l5.46875 13.359373l-2.015625 0l-1.546875 -4.046875l-5.59375 0l-1.46875 4.046875l-1.875 0zm3.859375 -5.484375l4.53125 0l-1.40625 -3.703123q-0.625 -1.6875 -0.9375 -2.765625q-0.265625 1.28125 -0.71875 2.546875l-1.46875 3.921873zm9.849823 9.1875l0 -13.374998l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546873q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.8437481 -0.765625 -2.765623q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.843748zm8.891342 8.484375l0 -13.374998l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546873q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.8437481 -0.765625 -2.765623q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.843748zm8.844467 4.78125l0 -13.359373l1.640625 0l0 13.359373l-1.640625 0zm4.191696 -11.468748l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.468748l0 -9.671873l1.640625 0l0 9.671873l-1.640625 0zm10.457321 -3.546875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.5781231 0.515625 -2.749998q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.812498q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm9.328125 2.359375q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5624981q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.187498q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm7.781967 3.390625l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578123l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671873q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 -9.999998l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.468748l0 -9.671873l1.640625 0l0 9.671873l-1.640625 0zm3.5354462 -4.84375q0 -2.687498 1.484375 -3.968748q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609373q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.7968731 -0.8125 -2.718748q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765623zm9.297592 4.84375l0 -9.671873l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.5937481l0 5.953125l-1.640625 0l0 -5.890625q0 -0.9999981 -0.203125 -1.4843731q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515623l0 5.28125l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m3.6351707 71.39028l48.850395 0" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m3.6351707 71.39028l48.850395 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m195.25722 71.39028l47.338577 0" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m195.25722 71.39028l47.338577 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m52.485565 55.358784l142.77165 0l0 32.062992l-142.77165 0z" fill-rule="evenodd"/><path fill="#000000" d="m65.21821 76.19028l0 -9.546875l1.265625 0l0 8.421875l4.703125 0l0 1.125l-5.96875 0zm7.3343506 -8.1875l0 -1.359375l1.171875 0l0 1.359375l-1.171875 0zm0 8.1875l0 -6.90625l1.171875 0l0 6.90625l-1.171875 0zm2.945465 0l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm11.118057 -8.1875l0 -1.359375l1.171875 0l0 1.359375l-1.171875 0zm0 8.1875l0 -6.90625l1.171875 0l0 6.90625l-1.171875 0zm5.507965 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.8748627 -1.171875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm11.006226 4.125l0 -0.875q-0.65625 1.03125 -1.9375 1.03125q-0.8125 0 -1.515625 -0.453125q-0.6875 -0.453125 -1.078125 -1.265625q-0.375 -0.828125 -0.375 -1.890625q0 -1.03125 0.34375 -1.875q0.34375 -0.84375 1.03125 -1.28125q0.703125 -0.453125 1.546875 -0.453125q0.625 0 1.109375 0.265625q0.5 0.25 0.796875 0.671875l0 -3.421875l1.171875 0l0 9.546875l-1.09375 0zm-3.703125 -3.453125q0 1.328125 0.5625 1.984375q0.5625 0.65625 1.328125 0.65625q0.765625 0 1.296875 -0.625q0.53125 -0.625 0.53125 -1.90625q0 -1.421875 -0.546875 -2.078125q-0.546875 -0.671875 -1.34375 -0.671875q-0.78125 0 -1.3125 0.640625q-0.515625 0.625 -0.515625 2.0zm9.865463 1.390625l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625zm7.0859375 4.71875l-0.125 -1.09375q0.375 0.109375 0.65625 0.109375q0.390625 0 0.625 -0.140625q0.234375 -0.125 0.390625 -0.359375q0.109375 -0.171875 0.359375 -0.875q0.03125 -0.09375 0.109375 -0.28125l-2.625 -6.921875l1.265625 0l1.4375 4.0q0.28125 0.765625 0.5 1.59375q0.203125 -0.796875 0.46875 -1.578125l1.484375 -4.015625l1.171875 0l-2.625 7.015625q-0.421875 1.140625 -0.65625 1.578125q-0.3125 0.578125 -0.71875 0.84375q-0.40625 0.28125 -0.96875 0.28125q-0.328125 0 -0.75 -0.15625zm6.2421875 -4.71875l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8124924 0 1.2031174 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1874924 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8124924 0 1.4218674 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0624924 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.2499924 0.328125 1.7343674 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.4531174 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625zm9.695305 1.015625l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.8748627 -1.171875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm19.321045 -2.53125l1.15625 0.15625q-0.1875 1.1875 -0.96875 1.859375q-0.78125 0.671875 -1.921875 0.671875q-1.40625 0 -2.28125 -0.921875q-0.859375 -0.9375 -0.859375 -2.65625q0 -1.125 0.375 -1.96875q0.375 -0.84375 1.125 -1.25q0.765625 -0.421875 1.65625 -0.421875q1.125 0 1.84375 0.578125q0.71875 0.5625 0.921875 1.609375l-1.140625 0.171875q-0.171875 -0.703125 -0.59375 -1.046875q-0.40625 -0.359375 -0.984375 -0.359375q-0.890625 0 -1.453125 0.640625q-0.546875 0.640625 -0.546875 2.0q0 1.40625 0.53125 2.03125q0.546875 0.625 1.40625 0.625q0.6875 0 1.140625 -0.421875q0.46875 -0.421875 0.59375 -1.296875zm6.6640625 1.671875q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.96109 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.507965 -2.0625l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m36.454067 85.4808l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path stroke="#6d9eeb" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m36.454067 85.4808l174.83464 0l0 48.850395l-174.83464 0z" fill-rule="evenodd"/><path fill="#000000" d="m76.63558 116.82599l0 -13.359375l1.765625 0l0 5.484375l6.9375 0l0 -5.484375l1.765625 0l0 13.359375l-1.765625 0l0 -6.296875l-6.9375 0l0 6.296875l-1.765625 0zm12.597946 -4.84375q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm8.641342 1.953125l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm13.5625 1.421875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm6.9134827 1.46875l0 -13.359375l1.78125 0l0 6.625l6.6249924 -6.625l2.390625 0l-5.5937424 5.421875l5.8437424 7.9375l-2.328125 0l-4.7656174 -6.765625l-2.171875 2.140625l0 4.625l-1.78125 0zm18.943565 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.228302 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm17.000732 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.7656403 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375153 0 3.1562653 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.2187653 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.5468903 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.3906403 -2.65625l5.4062653 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.0312653 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.094467 5.765625l0 -13.359375l1.640625 0l0 13.359375l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m4.454068 151.90599l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m4.454068 151.90599l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m172.45407 151.90599l74.04724 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m172.45407 151.90599l74.04724 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m73.43044 135.54013l100.88189 0l0 32.06299l-100.88189 0z" fill-rule="evenodd"/><path fill="#000000" d="m96.04542 156.37163l0 -9.546875l1.265625 0l0 3.921875l4.953125 0l0 -3.921875l1.265625 0l0 9.546875l-1.265625 0l0 -4.5l-4.953125 0l0 4.5l-1.265625 0zm13.953278 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm8.93837 0l0 -0.875q-0.65625 1.03125 -1.9375 1.03125q-0.8125 0 -1.515625 -0.453125q-0.6875 -0.453125 -1.078125 -1.265625q-0.375 -0.828125 -0.375 -1.890625q0 -1.03125 0.34375 -1.875q0.34375 -0.84375 1.03125 -1.28125q0.703125 -0.453125 1.546875 -0.453125q0.625 0 1.109375 0.265625q0.5 0.25 0.796875 0.671875l0 -3.421875l1.171875 0l0 9.546875l-1.09375 0zm-3.703125 -3.453125q0 1.328125 0.5625 1.984375q0.5625 0.65625 1.328125 0.65625q0.765625 0 1.296875 -0.625q0.53125 -0.625 0.53125 -1.90625q0 -1.421875 -0.546875 -2.078125q-0.546875 -0.671875 -1.34375 -0.671875q-0.78125 0 -1.3125 0.640625q-0.515625 0.625 -0.515625 2.0zm7.9124756 3.453125l-2.125 -6.90625l1.21875 0l1.09375 3.984375l0.421875 1.484375q0.015625 -0.109375 0.359375 -1.421875l1.0937424 -4.046875l1.203125 0l1.03125 4.0l0.34375 1.328125l0.40625 -1.34375l1.171875 -3.984375l1.140625 0l-2.15625 6.90625l-1.21875 0l-1.09375 -4.140625l-0.265625 -1.171875l-1.4062424 5.3125l-1.21875 0zm12.859535 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59376526 0.21875 -1.2812653 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.4218903 -0.171875 2.0937653 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.3437653 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.89064026 0 1.4375153 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.9218903 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.2031403 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm9.18837 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375z" fill-rule="nonzero"/><defs><linearGradient id="p.1" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.54555197232122 0.0 0.0 4.54555197232122 0.0 0.0)" spreadMethod="pad" x1="8.189483259998303" y1="18.80511284496466" x2="8.189466907412452" y2="23.35066481725647"><stop offset="0.0" stop-color="#ff0000"/><stop offset="0.51" stop-color="#dab7a6"/><stop offset="0.99999994" stop-color="#dab7a6" stop-opacity="0.0"/><stop offset="1.0" stop-color="#ffffff" stop-opacity="0.0"/></linearGradient></defs><path fill="url(#p.1)" d="m37.225723 85.48025l173.29134 0l0 20.661415l-173.29134 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m272.4455 100.54161l129.5748 -74.83464l20.629913 35.74803l-129.5748 74.83464z" fill-rule="evenodd"/><path fill="#000000" d="m287.51392 107.20854l1.1823425 -0.8271866q0.51071167 0.6974335 1.1166077 0.9970856q0.5980835 0.28610992 1.4464111 0.19311523q0.84054565 -0.10652161 1.6794434 -0.5910187q0.75772095 -0.4376068 1.2010193 -0.98233795q0.44906616 -0.5660858 0.50097656 -1.1013031q0.057678223 -0.55656433 -0.20785522 -1.0166931q-0.27334595 -0.47366333 -0.7392273 -0.6557007q-0.47366333 -0.1955719 -1.2366333 -0.079704285q-0.478302 0.07775116 -2.032318 0.54221344q-1.5618286 0.45092773 -2.2805786 0.48712158q-0.9222717 0.027420044 -1.5864563 -0.31072998q-0.6719971 -0.35168457 -1.0703125 -1.0418777q-0.4295349 -0.74432373 -0.38497925 -1.6361618q0.05029297 -0.9131851 0.6668701 -1.7203751q0.63012695 -0.8150101 1.6313782 -1.39328q1.1095276 -0.6407852 2.1592712 -0.75988007q1.0419617 -0.13263702 1.8867493 0.29968262q0.8583679 0.4244995 1.3987732 1.2671738l-1.2036743 0.8214798q-0.64712524 -0.87127686 -1.5022583 -1.0089188q-0.8629761 -0.15116882 -2.013092 0.5130539q-1.1906738 0.6876755 -1.4819641 1.4333115q-0.2913208 0.745636 0.06793213 1.3681641q0.30459595 0.5277939 0.8865051 0.6608505q0.5740967 0.119522095 2.3815613 -0.43717957q1.8210144 -0.5645218 2.5725403 -0.6376953q1.0924377 -0.107666016 1.857605 0.28042603q0.77090454 0.366745 1.2316895 1.1652069q0.4529724 0.78491974 0.39904785 1.7543335q-0.040405273 0.96160126 -0.6663208 1.8463745q-0.62594604 0.8847656 -1.6813354 1.4942932q-1.3530579 0.78144073 -2.486084 0.9125519q-1.1408386 0.11756897 -2.1214905 -0.3625946q-0.96713257 -0.48797607 -1.5721436 -1.4738007zm13.40155 -4.9431534l0.8006897 0.98106384q-0.45169067 0.4052124 -0.857605 0.63964844q-0.6629944 0.38290405 -1.1454773 0.39089966q-0.4902649 -0.00554657 -0.8343506 -0.25791168q-0.3518982 -0.2659073 -0.9844971 -1.3620911l-2.382019 -4.127617l-0.8930054 0.5157547l-0.5466919 -0.94731903l0.8930054 -0.5157547l-1.0308838 -1.786377l0.79599 -1.4340897l1.4526367 2.5171661l1.2177734 -0.70329285l0.5466919 0.94731903l-1.2177734 0.7033005l2.4210815 4.1952744q0.30456543 0.5277939 0.4446106 0.645401q0.15356445 0.10979462 0.35705566 0.11856842q0.19570923 -0.004760742 0.4663086 -0.16104889q0.20297241 -0.11721802 0.49645996 -0.35889435zm1.8165283 0.41242218l-4.147064 -7.186104l1.0959778 -0.6329727l0.6247864 1.0826492q-0.0178833 -1.0000992 0.19332886 -1.4468689q0.21121216 -0.44677734 0.64419556 -0.6968384q0.6088562 -0.35164642 1.471283 -0.3264618l0.22875977 1.3654938q-0.59487915 7.4768066E-4 -1.0413818 0.25862885q-0.39239502 0.2266159 -0.5765381 0.6577606q-0.17843628 0.40979004 -0.06384277 0.9209976q0.17193604 0.76680756 0.61709595 1.5382004l2.1711426 3.7622147l-1.2177429 0.7033005zm2.0899658 -6.0066605q-1.1480408 -1.9893799 -0.5930481 -3.5910187q0.47280884 -1.3376465 1.7988281 -2.1034622q1.4883423 -0.8595886 2.984253 -0.4243927q1.501648 0.41384125 2.4934998 2.132553q0.79660034 1.3803787 0.83795166 2.4210892q0.04135132 1.0407028 -0.49923706 1.948349q-0.5348511 0.8862915 -1.4819946 1.4333038q-1.5018921 0.8674011 -2.9899902 0.44573975q-1.4959106 -0.4351883 -2.5502625 -2.2621613zm1.2583313 -0.7267456q0.79663086 1.3803787 1.7902527 1.726738q1.0072021 0.33853912 1.9137268 -0.18502045q0.9065552 -0.5235672 1.1036072 -1.5575943q0.21057129 -1.0418396 -0.60946655 -2.462822q-0.76538086 -1.3262482 -1.7725525 -1.6647949q-0.99365234 -0.34635162 -1.9002075 0.17721558q-0.90652466 0.5235672 -1.117096 1.5654068q-0.2048645 1.0204926 0.59173584 2.4008713zm8.984772 -0.38944244l-4.1470337 -7.1861115l1.0959778 -0.6329651l0.5857239 1.0149841q0.10531616 -1.6306229 1.6072083 -2.498024q0.6494751 -0.37509155 1.3234558 -0.45760345q0.6739807 -0.0825119 1.163269 0.14012909q0.48928833 0.22264099 0.9021301 0.6879196q0.26287842 0.29925537 0.74710083 1.1383057l2.553833 4.4253464l-1.2177429 0.7033005l-2.522583 -4.371216q-0.42956543 -0.74432373 -0.7892456 -1.0237579q-0.3540039 -0.30078125 -0.8442688 -0.30632782q-0.47677612 -0.01335144 -0.9638672 0.26796722q-0.7847595 0.45323944 -1.0640869 1.2821732q-0.2735901 0.80757904 0.52301025 2.1879654l2.264862 3.924614l-1.2177429 0.7033005zm7.819275 -3.7220154l1.2922058 -0.511734q0.3878479 0.5157852 0.8666992 0.56401825q0.65527344 0.072639465 1.4400635 -0.38059235q0.8388977 -0.48449707 1.1036682 -1.0885162q0.26480103 -0.60401917 0.07571411 -1.3067856q-0.1161499 -0.42009735 -0.81121826 -1.6245499q-0.25161743 1.408371 -1.4423218 2.0960464q-1.4883423 0.859581 -2.9171448 0.25933075q-1.428833 -0.60025024 -2.2957153 -2.1024323q-0.5935669 -1.0285187 -0.72805786 -2.1056366q-0.12097168 -1.0849228 0.30926514 -1.9649353q0.4437561 -0.887825 1.3908997 -1.4348297q1.2718811 -0.7345581 2.690796 -0.18271637l-0.4998474 -0.866127l1.1230469 -0.6485977l3.5847168 6.2117233q0.9684448 1.6781082 1.0362854 2.5771942q0.067840576 0.899086 -0.4420166 1.7348709q-0.5098877 0.83579254 -1.5923462 1.4609451q-1.2854004 0.7423706 -2.4195251 0.6214981q-1.1206055 -0.12869263 -1.7651672 -1.3081741zm-1.4765625 -4.90316q0.81222534 1.4074478 1.7418518 1.7366486q0.94314575 0.32138824 1.7820435 -0.16310883q0.8388977 -0.48448944 1.0401001 -1.4487534q0.19342041 -0.97779846 -0.60317993 -2.3581848q-0.76538086 -1.3262482 -1.7298889 -1.6533508q-0.97229004 -0.3406372 -1.7976685 0.13603973q-0.8118286 0.46886444 -0.9974365 1.4601974q-0.1855774 0.991333 0.56417847 2.290512z" fill-rule="nonzero"/><path fill="#000000" d="m294.23132 118.16088l-0.80441284 -1.3939133l1.2177429 -0.7033005l0.80441284 1.3939209l-1.2177429 0.70329285zm4.920227 8.525894l-4.147064 -7.1861115l1.2177734 -0.70329285l4.1470337 7.186104l-1.2177429 0.7033005zm1.3493347 -3.6482391l1.0948792 -0.88494873q0.51641846 0.67609406 1.2029724 0.8028641q0.6922302 0.10542297 1.5176086 -0.37125397q0.8388672 -0.48449707 1.0495605 -1.0572586q0.20285034 -0.5862961 -0.062683105 -1.0464249q-0.23431396 -0.4059906 -0.7266846 -0.44641113q-0.35079956 -0.013923645 -1.4790955 0.31292725q-1.5347595 0.43530273 -2.2030334 0.4964676q-0.66256714 0.03981781 -1.183075 -0.23695374q-0.5069885 -0.28459167 -0.8115845 -0.8123779q-0.28115845 -0.48719788 -0.2989807 -1.0182266q-0.017791748 -0.5310211 0.2048645 -1.0204926q0.15917969 -0.3806305 0.56817627 -0.79727936q0.4147339 -0.43800354 0.9694824 -0.75839233q0.852417 -0.49230957 1.6289368 -0.6159897q0.7765198 -0.123680115 1.3162842 0.123931885q0.5455017 0.22625732 1.0733948 0.8596573l-1.0969849 0.85006714q-0.4013672 -0.5079727 -0.9733887 -0.59262085q-0.5720215 -0.0846405 -1.2756042 0.32170868q-0.8388977 0.48449707 -1.0402222 0.9796829q-0.19558716 0.47383118 0.015289307 0.8392334q0.14056396 0.24359131 0.39874268 0.3470993q0.2581787 0.103507996 0.64749146 0.05910492q0.22845459 -0.041732788 1.2620544 -0.31388855q1.4806519 -0.4040451 2.127594 -0.470932q0.6390991 -0.08041382 1.1653442 0.17501068q0.5397949 0.247612 0.89904785 0.87013245q0.35144043 0.60899353 0.29852295 1.3613129q-0.047210693 0.73096466 -0.5519409 1.4194183q-0.49118042 0.68063354 -1.3300476 1.1651306q-1.407196 0.81269836 -2.4736633 0.65275574q-1.0664673 -0.15995026 -1.933258 -1.1930542zm6.1190186 -5.4646606q-1.1480408 -1.9893723 -0.5930481 -3.591011q0.47283936 -1.3376541 1.7988281 -2.1034698q1.4883423 -0.859581 2.984253 -0.4243927q1.501648 0.41384888 2.4934998 2.1325607q0.79663086 1.3803787 0.83795166 2.4210815q0.04135132 1.0407028 -0.49923706 1.948349q-0.5348511 0.88629913 -1.4819946 1.4333115q-1.5018921 0.8674011 -2.9899902 0.44573975q-1.4959106 -0.43519592 -2.5502625 -2.262169zm1.2583313 -0.7267456q0.79663086 1.3803864 1.7902832 1.7267456q1.0071716 0.33853912 1.9136963 -0.18502808q0.9065552 -0.5235672 1.1036072 -1.5575943q0.21057129 -1.0418396 -0.60946655 -2.462822q-0.76538086 -1.3262482 -1.7725525 -1.6647873q-0.99365234 -0.34635925 -1.9002075 0.17720795q-0.90652466 0.5235672 -1.117096 1.5654068q-0.2048645 1.0204926 0.59173584 2.4008713zm8.957733 -0.3738098l-5.7246704 -9.919807l1.2177734 -0.7033005l5.72464 9.919807l-1.2177429 0.7033005zm7.2713623 -5.390396q-0.34069824 0.9726486 -0.8225403 1.5757141q-0.48962402 0.5895233 -1.2067566 1.003685q-1.1906738 0.6876755 -2.162445 0.47302246q-0.9717407 -0.21464539 -1.4872131 -1.1078339q-0.30456543 -0.5277939 -0.3109436 -1.1015167q-0.014190674 -0.58724976 0.21627808 -1.0631866q0.23620605 -0.4972763 0.64520264 -0.9139328q0.31063843 -0.30571747 0.980896 -0.8010864q1.3733215 -1.0277023 1.9363098 -1.6776505q-0.14837646 -0.25712585 -0.18740845 -0.32479095q-0.42956543 -0.74432373 -0.93963623 -0.84669495q-0.7156677 -0.14601898 -1.6357422 0.38536072q-0.8524475 0.49230957 -1.0922546 1.0458145q-0.23410034 0.5321655 0.013824463 1.3994217l-1.2843933 0.5252762q-0.2749939 -0.85163116 -0.18301392 -1.5362854q0.10549927 -0.6924591 0.66851807 -1.3424072q0.5552063 -0.66348267 1.4752808 -1.1948624q0.92007446 -0.5313797 1.6133118 -0.6430588q0.7067566 -0.11948395 1.1648254 0.04901886q0.45803833 0.16851044 0.8552551 0.60672q0.24728394 0.27218628 0.71588135 1.0841827l0.9371643 1.6239777q0.9840393 1.7051773 1.3094177 2.1127014q0.33892822 0.39970398 0.81103516 0.6863861l-1.2718506 0.7345581q-0.40811157 -0.26953125 -0.7590027 -0.75253296zm-1.6645203 -2.6654587q-0.5067749 0.65356445 -1.7234497 1.6088486q-0.69522095 0.5458679 -0.9283142 0.8609314q-0.23312378 0.31506348 -0.25280762 0.6873169q-0.013977051 0.35090637 0.16564941 0.6621628q0.28115845 0.48719788 0.83392334 0.60100555q0.5662842 0.10598755 1.269867 -0.30036163q0.70358276 -0.40634918 1.072998 -1.016655q0.37512207 -0.63165283 0.3197937 -1.3214569q-0.031341553 -0.5232086 -0.49990845 -1.3351974l-0.25775146 -0.44659424zm7.223419 -0.8156891l0.8006897 0.98106384q-0.45169067 0.40522003 -0.857605 0.63964844q-0.6629944 0.38290405 -1.1454468 0.39089966q-0.4902954 -0.0055389404 -0.8343811 -0.25791168q-0.3518982 -0.26589966 -0.9844971 -1.3620911l-2.382019 -4.1276093l-0.8930054 0.5157471l-0.5466919 -0.94731903l0.8930054 -0.5157471l-1.0308838 -1.786377l0.7960205 -1.4340897l1.4526367 2.5171661l1.2177429 -0.7033005l0.5466919 0.94731903l-1.2177429 0.7033005l2.421051 4.195282q0.30459595 0.5277939 0.4446106 0.645401q0.15356445 0.10978699 0.35708618 0.11856079q0.19567871 -0.004760742 0.46627808 -0.16104889q0.20297241 -0.11721039 0.49645996 -0.35889435zm-3.0901794 -8.121277l-0.80441284 -1.3939209l1.2177429 -0.70329285l0.80444336 1.3939133l-1.2177734 0.7033005zm4.920227 8.525887l-4.1470337 -7.186104l1.2177429 -0.7033005l4.147064 7.186104l-1.2177734 0.7033005zm0.54074097 -5.111908q-1.1480408 -1.9893799 -0.5930481 -3.591011q0.47280884 -1.3376541 1.7988281 -2.1034698q1.4883423 -0.8595886 2.984253 -0.4243927q1.501648 0.41384125 2.4934998 2.132553q0.79660034 1.3803864 0.83795166 2.4210892q0.0413208 1.0407028 -0.49923706 1.948349q-0.5348511 0.88629913 -1.4819946 1.4333038q-1.5018921 0.8674011 -2.9899902 0.44574738q-1.4959412 -0.43519592 -2.5502625 -2.262169zm1.2583313 -0.7267456q0.79663086 1.3803864 1.7902527 1.726738q1.0072021 0.33854675 1.9137268 -0.18502045q0.9065552 -0.5235672 1.1036072 -1.5575943q0.21057129 -1.0418396 -0.60946655 -2.462822q-0.76538086 -1.3262482 -1.7725525 -1.6647949q-0.99365234 -0.34635162 -1.9002075 0.17721558q-0.9065552 0.5235672 -1.117096 1.5654068q-0.2048645 1.0204926 0.59173584 2.4008713zm8.984772 -0.38944244l-4.1470337 -7.1861115l1.0959473 -0.6329651l0.5857544 1.0149841q0.10531616 -1.6306152 1.6072083 -2.4980164q0.6494446 -0.37509918 1.3234558 -0.45761108q0.6739807 -0.0825119 1.163269 0.14012909q0.48928833 0.22264099 0.9021301 0.6879196q0.26287842 0.29925537 0.74710083 1.1383133l2.553833 4.4253387l-1.2177429 0.7033005l-2.5226135 -4.371208q-0.4295349 -0.74432373 -0.7892456 -1.0237656q-0.3539734 -0.30078125 -0.8442383 -0.3063202q-0.47677612 -0.01335907 -0.9638672 0.2679596q-0.7847595 0.45323944 -1.0640869 1.2821732q-0.2735901 0.80757904 0.52301025 2.1879654l2.264862 3.924614l-1.2177429 0.7033005z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m282.76047 118.41563c-17.003845 0 -26.795105 -5.566925 -34.007706 -11.133858c-7.2126007 -5.566925 -11.846542 -11.133858 -23.6931 -11.133858" fill-rule="evenodd"/><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.76047 118.415634c-17.003876 0 -26.795105 -5.5669327 -34.007706 -11.133858c-3.6062927 -2.7834702 -6.567932 -5.5669327 -10.10881 -7.6545334c-0.4426117 -0.26094818 -0.8942871 -0.51101685 -1.3573761 -0.74887085c-0.11578369 -0.0594635 -0.23228455 -0.11816406 -0.34950256 -0.17607117l-0.13806152 -0.06682587" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="2.0" stroke-linecap="butt" d="m237.48381 95.40378l-9.563843 1.350235l8.194244 5.1131744z" fill-rule="evenodd"/></g></svg>
+\ No newline at end of file
diff --git a/g3doc/Sentry-Gofer.png b/g3doc/Sentry-Gofer.png
new file mode 100644
index 000000000..ca2c27ef7
--- /dev/null
+++ b/g3doc/Sentry-Gofer.png
diff --git a/g3doc/Sentry-Gofer.svg b/g3doc/Sentry-Gofer.svg
new file mode 100644
index 000000000..5c10750d2
--- /dev/null
+++ b/g3doc/Sentry-Gofer.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 358.8556430446194 249.67191601049868" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l358.85565 0l0 249.67192l-358.85565 0l0 -249.67192z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l358.85565 0l0 249.67192l-358.85565 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m36.454067 6.6430445l114.4252 0l0 48.850395l-114.4252 0z" fill-rule="evenodd"/><path stroke="#cc4125" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m36.454067 6.6430445l114.4252 0l0 48.850395l-114.4252 0z" fill-rule="evenodd"/><path fill="#000000" d="m48.00139 37.98824l5.125 -13.359373l1.90625 0l5.46875 13.359373l-2.015625 0l-1.546875 -4.046875l-5.59375 0l-1.46875 4.046875l-1.875 0zm3.859375 -5.484375l4.53125 0l-1.40625 -3.703123q-0.625 -1.6875 -0.9375 -2.765625q-0.265625 1.28125 -0.71875 2.546875l-1.46875 3.921873zm9.849823 9.1875l0 -13.374998l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546873q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.8437481 -0.765625 -2.765623q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.843748zm8.891342 8.484375l0 -13.374998l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546873q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.8437481 -0.765625 -2.765623q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.843748zm8.844467 4.78125l0 -13.359373l1.640625 0l0 13.359373l-1.640625 0zm4.191696 -11.468748l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.468748l0 -9.671873l1.640625 0l0 9.671873l-1.640625 0zm10.457321 -3.546875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.5781231 0.515625 -2.749998q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.812498q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm9.328125 2.359375q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5624981q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.187498q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm7.781967 3.390625l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578123l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671873q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 -9.999998l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.468748l0 -9.671873l1.640625 0l0 9.671873l-1.640625 0zm3.5354462 -4.84375q0 -2.687498 1.484375 -3.968748q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609373q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.7968731 -0.8125 -2.718748q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765623zm9.297592 4.84375l0 -9.671873l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.5937481l0 5.953125l-1.640625 0l0 -5.890625q0 -0.9999981 -0.203125 -1.4843731q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515623l0 5.28125l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m4.4540663 73.055115l40.47244 0.25196838" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m4.4540663 73.055115l40.47244 0.25196838" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m144.0 74.0l35.27559 -0.94488525" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m144.0 74.0l35.27559 -0.94488525" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m44.926506 57.27559l97.48032 0l0 32.062996l-97.48032 0z" fill-rule="evenodd"/><path fill="#000000" d="m56.859642 75.044586l1.203125 -0.109375q0.078125 0.71875 0.390625 1.1875q0.3125 0.453125 0.953125 0.734375q0.65625 0.28125 1.46875 0.28125q0.71875 0 1.265625 -0.21875q0.5625 -0.21875 0.828125 -0.578125q0.265625 -0.375 0.265625 -0.828125q0 -0.453125 -0.265625 -0.78125q-0.25 -0.328125 -0.84375 -0.5625q-0.390625 -0.15625 -1.703125 -0.46875q-1.3125 -0.3125 -1.84375 -0.59375q-0.671875 -0.359375 -1.015625 -0.890625q-0.328125 -0.53125 -0.328125 -1.1875q0 -0.71875 0.40625 -1.34375q0.40625 -0.625 1.1875 -0.953125q0.796875 -0.328125 1.765625 -0.328125q1.046875 0 1.859375 0.34375q0.8125 0.34375 1.25 1.015625q0.4375 0.65625 0.46875 1.484375l-1.203125 0.09375q-0.109375 -0.90625 -0.671875 -1.359375q-0.5625 -0.46875 -1.65625 -0.46875q-1.140625 0 -1.671875 0.421875q-0.515625 0.421875 -0.515625 1.015625q0 0.515625 0.359375 0.84375q0.375 0.328125 1.90625 0.6875q1.546875 0.34375 2.109375 0.59375q0.84375 0.390625 1.234375 0.984375q0.390625 0.578125 0.390625 1.359375q0 0.75 -0.4375 1.4375q-0.421875 0.671875 -1.25 1.046875q-0.8125 0.359375 -1.828125 0.359375q-1.296875 0 -2.171875 -0.375q-0.875 -0.375 -1.375 -1.125q-0.5 -0.765625 -0.53125 -1.71875zm9.12413 5.71875l-0.125 -1.09375q0.375 0.109375 0.65625 0.109375q0.390625 0 0.625 -0.140625q0.234375 -0.125 0.390625 -0.359375q0.109375 -0.171875 0.359375 -0.875q0.03125 -0.09375 0.109375 -0.28125l-2.625 -6.921875l1.265625 0l1.4375 4.0q0.28125 0.765625 0.5 1.59375q0.203125 -0.796875 0.46875 -1.578125l1.484375 -4.015625l1.171875 0l-2.625 7.015625q-0.421875 1.140625 -0.65625 1.578125q-0.3125 0.578125 -0.71875 0.84375q-0.40625 0.28125 -0.96875 0.28125q-0.328125 0 -0.75 -0.15625zm6.2421875 -4.71875l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125 0 1.203125 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125 0 1.421875 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.25 0.328125 1.734375 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.453125 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625zm9.6953125 1.015625l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.8748627 -1.171875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm19.321045 -2.53125l1.15625 0.15625q-0.1875 1.1875 -0.96875 1.859375q-0.78125 0.671875 -1.921875 0.671875q-1.40625 0 -2.28125 -0.921875q-0.859375 -0.9375 -0.859375 -2.65625q0 -1.125 0.375 -1.96875q0.375 -0.84375 1.125 -1.25q0.765625 -0.421875 1.65625 -0.421875q1.125 0 1.84375 0.578125q0.71875 0.5625 0.921875 1.609375l-1.140625 0.171875q-0.171875 -0.703125 -0.59375 -1.046875q-0.40625 -0.359375 -0.984375 -0.359375q-0.890625 0 -1.453125 0.640625q-0.546875 0.640625 -0.546875 2.0q0 1.40625 0.53125 2.03125q0.546875 0.625 1.40625 0.625q0.6875 0 1.140625 -0.421875q0.46875 -0.421875 0.59375 -1.296875zm6.6640625 1.671875q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.96109 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm2.507965 -2.0625l1.15625 -0.1875q0.109375 0.703125 0.546875 1.078125q0.453125 0.359375 1.25 0.359375q0.8125076 0 1.2031326 -0.328125q0.390625 -0.328125 0.390625 -0.765625q0 -0.390625 -0.359375 -0.625q-0.234375 -0.15625 -1.1875076 -0.390625q-1.296875 -0.328125 -1.796875 -0.5625q-0.484375 -0.25 -0.75 -0.65625q-0.25 -0.421875 -0.25 -0.9375q0 -0.453125 0.203125 -0.84375q0.21875 -0.40625 0.578125 -0.671875q0.28125 -0.1875 0.75 -0.328125q0.46875 -0.140625 1.015625 -0.140625q0.8125076 0 1.4218826 0.234375q0.609375 0.234375 0.90625 0.640625q0.296875 0.390625 0.40625 1.0625l-1.140625 0.15625q-0.078125 -0.53125 -0.453125 -0.828125q-0.375 -0.3125 -1.0625076 -0.3125q-0.8125 0 -1.15625 0.265625q-0.34375 0.265625 -0.34375 0.625q0 0.234375 0.140625 0.421875q0.15625 0.1875 0.453125 0.3125q0.171875 0.0625 1.03125 0.296875q1.2500076 0.328125 1.7343826 0.546875q0.5 0.203125 0.78125 0.609375q0.28125 0.40625 0.28125 1.0q0 0.59375 -0.34375 1.109375q-0.34375 0.515625 -1.0 0.796875q-0.640625 0.28125 -1.4531326 0.28125q-1.34375 0 -2.046875 -0.5625q-0.703125 -0.5625 -0.90625 -1.65625z" fill-rule="nonzero"/><path fill="#d9d2e9" d="m36.454067 87.40682l114.4252 0l0 48.850395l-114.4252 0z" fill-rule="evenodd"/><path stroke="#8e7cc3" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m36.454067 87.40682l114.4252 0l0 48.850395l-114.4252 0z" fill-rule="evenodd"/><path fill="#000000" d="m67.55086 114.45515l1.65625 -0.140625q0.125 1.0 0.546875 1.640625q0.4375 0.640625 1.34375 1.046875q0.921875 0.390625 2.0625 0.390625q1.0 0 1.78125 -0.296875q0.78125 -0.296875 1.15625 -0.8125q0.375 -0.53125 0.375 -1.15625q0 -0.625 -0.375 -1.09375q-0.359375 -0.46875 -1.1875 -0.796875q-0.546875 -0.203125 -2.390625 -0.640625q-1.828125 -0.453125 -2.5625 -0.84375q-0.96875 -0.5 -1.4375 -1.234375q-0.46875 -0.75 -0.46875 -1.671875q0 -1.0 0.578125 -1.875q0.578125 -0.890625 1.671875 -1.34375q1.109375 -0.453125 2.453125 -0.453125q1.484375 0 2.609375 0.484375q1.140625 0.46875 1.75 1.40625q0.609375 0.921875 0.65625 2.09375l-1.6875 0.125q-0.140625 -1.265625 -0.9375 -1.90625q-0.78125 -0.65625 -2.3125 -0.65625q-1.609375 0 -2.34375 0.59375q-0.734375 0.59375 -0.734375 1.421875q0 0.71875 0.53125 1.171875q0.5 0.46875 2.65625 0.96875q2.15625 0.484375 2.953125 0.84375q1.171875 0.53125 1.71875 1.359375q0.5625 0.828125 0.5625 1.90625q0 1.0625 -0.609375 2.015625q-0.609375 0.9375 -1.75 1.46875q-1.140625 0.515625 -2.578125 0.515625q-1.8125 0 -3.046875 -0.53125q-1.21875 -0.53125 -1.921875 -1.59375q-0.6875 -1.0625 -0.71875 -2.40625zm19.459198 1.1875l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.141342 5.765625l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm13.953842 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.5895538 1.46875l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.150177 3.71875l-0.1875 -1.53125q0.546875 0.140625 0.9375 0.140625q0.546875 0 0.875 -0.1875q0.328125 -0.171875 0.546875 -0.5q0.15625 -0.25 0.5 -1.21875q0.046875 -0.140625 0.140625 -0.40625l-3.671875 -9.6875l1.765625 0l2.015625 5.59375q0.390625 1.078125 0.703125 2.25q0.28125 -1.125 0.671875 -2.203125l2.078125 -5.640625l1.640625 0l-3.6875 9.828125q-0.59375 1.609375 -0.921875 2.203125q-0.4375 0.8125 -1.0 1.1875q-0.5625 0.375 -1.34375 0.375q-0.484375 0 -1.0625 -0.203125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m6.8477693 152.91733l48.85039 0" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m6.8477693 152.91733l48.85039 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m145.08398 152.91733l33.95276 0" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m145.08398 152.91733l33.95276 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m55.698162 136.88583l89.38582 0l0 32.06299l-89.38582 0z" fill-rule="evenodd"/><path fill="#000000" d="m81.92428 150.43732l0 -8.59375l1.140625 0l0 7.578125l4.234375 0l0 1.015625l-5.375 0zm6.595703 -7.375l0 -1.21875l1.0625 0l0 1.21875l-1.0625 0zm0 7.375l0 -6.21875l1.0625 0l0 6.21875l-1.0625 0zm2.6660156 0l0 -6.21875l0.9375 0l0 0.875q0.296875 -0.46875 0.78125 -0.734375q0.484375 -0.28125 1.109375 -0.28125q0.6875 0 1.125 0.28125q0.453125 0.28125 0.625 0.796875q0.75 -1.078125 1.921875 -1.078125q0.9375 0 1.421875 0.515625q0.5 0.5 0.5 1.578125l0 4.265625l-1.046875 0l0 -3.921875q0 -0.625 -0.109375 -0.90625q-0.09375 -0.28125 -0.359375 -0.453125q-0.265625 -0.171875 -0.640625 -0.171875q-0.65625 0 -1.09375 0.4375q-0.421875 0.4375 -0.421875 1.40625l0 3.609375l-1.0625 0l0 -4.046875q0 -0.703125 -0.265625 -1.046875q-0.25 -0.359375 -0.828125 -0.359375q-0.453125 0 -0.828125 0.234375q-0.375 0.234375 -0.546875 0.6875q-0.171875 0.453125 -0.171875 1.296875l0 3.234375l-1.046875 0zm9.996094 -7.375l0 -1.21875l1.0625 0l0 1.21875l-1.0625 0zm0 7.375l0 -6.21875l1.0625 0l0 6.21875l-1.0625 0zm4.9628906 -0.9375l0.15625 0.921875q-0.453125 0.09375 -0.796875 0.09375q-0.578125 0 -0.890625 -0.171875q-0.3125 -0.1875 -0.453125 -0.484375q-0.125 -0.296875 -0.125 -1.25l0 -3.578125l-0.765625 0l0 -0.8125l0.765625 0l0 -1.546875l1.046875 -0.625l0 2.171875l1.0625 0l0 0.8125l-1.0625 0l0 3.640625q0 0.453125 0.046875 0.578125q0.0625 0.125 0.1875 0.203125q0.125 0.078125 0.359375 0.078125q0.1875 0 0.46875 -0.03125zm5.2871094 -1.0625l1.09375 0.125q-0.25 0.953125 -0.953125 1.484375q-0.703125 0.53125 -1.78125 0.53125q-1.359375 0 -2.171875 -0.84375q-0.796875 -0.84375 -0.796875 -2.359375q0 -1.5625 0.8125 -2.421875q0.8125 -0.875 2.09375 -0.875q1.25 0 2.03125 0.84375q0.796875 0.84375 0.796875 2.390625q0 0.09375 0 0.28125l-4.640625 0q0.0625 1.03125 0.578125 1.578125q0.515625 0.53125 1.296875 0.53125q0.578125 0 0.984375 -0.296875q0.421875 -0.3125 0.65625 -0.96875zm-3.453125 -1.703125l3.46875 0q-0.0625 -0.796875 -0.390625 -1.1875q-0.515625 -0.609375 -1.3125 -0.609375q-0.734375 0 -1.234375 0.484375q-0.484375 0.484375 -0.53125 1.3125zm9.908203 3.703125l0 -0.78125q-0.59375 0.921875 -1.734375 0.921875q-0.75 0 -1.375 -0.40625q-0.625 -0.421875 -0.96875 -1.15625q-0.34375 -0.734375 -0.34375 -1.6875q0 -0.921875 0.3125 -1.6875q0.3125 -0.765625 0.9375 -1.15625q0.625 -0.40625 1.390625 -0.40625q0.5625 0 1.0 0.234375q0.4375 0.234375 0.71875 0.609375l0 -3.078125l1.046875 0l0 8.59375l-0.984375 0zm-3.328125 -3.109375q0 1.203125 0.5 1.796875q0.5 0.578125 1.1875 0.578125q0.6875 0 1.171875 -0.5625q0.484375 -0.5625 0.484375 -1.71875q0 -1.28125 -0.5 -1.875q-0.484375 -0.59375 -1.203125 -0.59375q-0.703125 0 -1.171875 0.578125q-0.46875 0.5625 -0.46875 1.796875z" fill-rule="nonzero"/><path fill="#000000" d="m68.0942 162.57794l1.03125 -0.15625q0.09375 0.625 0.484375 0.953125q0.40625 0.328125 1.140625 0.328125q0.71875 0 1.0625 -0.28125q0.359375 -0.296875 0.359375 -0.703125q0 -0.359375 -0.3125 -0.5625q-0.21875 -0.140625 -1.078125 -0.359375q-1.15625 -0.296875 -1.609375 -0.5q-0.4375 -0.21875 -0.671875 -0.59375q-0.234375 -0.375 -0.234375 -0.84375q0 -0.40625 0.1875 -0.765625q0.1875 -0.359375 0.515625 -0.59375q0.25 -0.171875 0.671875 -0.296875q0.421875 -0.125 0.921875 -0.125q0.71875 0 1.265625 0.21875q0.5625 0.203125 0.828125 0.5625q0.265625 0.359375 0.359375 0.953125l-1.03125 0.140625q-0.0625 -0.46875 -0.40625 -0.734375q-0.328125 -0.28125 -0.953125 -0.28125q-0.71875 0 -1.03125 0.25q-0.3125 0.234375 -0.3125 0.5625q0 0.203125 0.125 0.359375q0.140625 0.171875 0.40625 0.28125q0.15625 0.0625 0.9375 0.265625q1.125 0.3125 1.5625 0.5q0.4375 0.1875 0.6875 0.546875q0.25 0.359375 0.25 0.90625q0 0.53125 -0.3125 1.0q-0.296875 0.453125 -0.875 0.71875q-0.578125 0.25 -1.3125 0.25q-1.21875 0 -1.859375 -0.5q-0.625 -0.515625 -0.796875 -1.5zm6.375 4.25l-0.125 -0.984375q0.34375 0.09375 0.609375 0.09375q0.34375 0 0.546875 -0.125q0.21875 -0.109375 0.359375 -0.3125q0.09375 -0.171875 0.328125 -0.796875q0.015625 -0.078125 0.09375 -0.25l-2.375 -6.234375l1.140625 0l1.296875 3.59375q0.25 0.6875 0.453125 1.453125q0.1875 -0.734375 0.4375 -1.421875l1.328125 -3.625l1.046875 0l-2.359375 6.328125q-0.390625 1.015625 -0.59375 1.40625q-0.28125 0.53125 -0.65625 0.765625q-0.359375 0.25 -0.859375 0.25q-0.296875 0 -0.671875 -0.140625zm5.625 -4.25l1.03125 -0.15625q0.09375 0.625 0.484375 0.953125q0.40625 0.328125 1.140625 0.328125q0.71875 0 1.0625 -0.28125q0.359375 -0.296875 0.359375 -0.703125q0 -0.359375 -0.3125 -0.5625q-0.21875 -0.140625 -1.078125 -0.359375q-1.15625 -0.296875 -1.609375 -0.5q-0.4375 -0.21875 -0.671875 -0.59375q-0.234375 -0.375 -0.234375 -0.84375q0 -0.40625 0.1875 -0.765625q0.1875 -0.359375 0.515625 -0.59375q0.25 -0.171875 0.671875 -0.296875q0.421875 -0.125 0.921875 -0.125q0.71875 0 1.265625 0.21875q0.5625 0.203125 0.828125 0.5625q0.265625 0.359375 0.359375 0.953125l-1.03125 0.140625q-0.0625 -0.46875 -0.40625 -0.734375q-0.328125 -0.28125 -0.953125 -0.28125q-0.71875 0 -1.03125 0.25q-0.3125 0.234375 -0.3125 0.5625q0 0.203125 0.125 0.359375q0.140625 0.171875 0.40625 0.28125q0.15625 0.0625 0.9375 0.265625q1.125 0.3125 1.5625 0.5q0.4375 0.1875 0.6875 0.546875q0.25 0.359375 0.25 0.90625q0 0.53125 -0.3125 1.0q-0.296875 0.453125 -0.875 0.71875q-0.578125 0.25 -1.3125 0.25q-1.21875 0 -1.859375 -0.5q-0.625 -0.515625 -0.796875 -1.5zm8.71875 0.921875l0.15625 0.921875q-0.453125 0.09375 -0.796875 0.09375q-0.578125 0 -0.890625 -0.171875q-0.3125 -0.1875 -0.453125 -0.484375q-0.125 -0.296875 -0.125 -1.25l0 -3.578125l-0.765625 0l0 -0.8125l0.765625 0l0 -1.546875l1.046875 -0.625l0 2.171875l1.0625 0l0 0.8125l-1.0625 0l0 3.640625q0 0.453125 0.046875 0.578125q0.0625 0.125 0.1875 0.203125q0.125 0.078125 0.359375 0.078125q0.1875 0 0.46875 -0.03125zm5.2871094 -1.0625l1.09375 0.125q-0.25 0.953125 -0.953125 1.484375q-0.703125 0.53125 -1.78125 0.53125q-1.359375 0 -2.171875 -0.84375q-0.796875 -0.84375 -0.796875 -2.359375q0 -1.5625 0.8125 -2.421875q0.8125 -0.875 2.09375 -0.875q1.25 0 2.03125 0.84375q0.796875 0.84375 0.796875 2.390625q0 0.09375 0 0.28125l-4.640625 0q0.0625 1.03125 0.578125 1.578125q0.515625 0.53125 1.296875 0.53125q0.578125 0 0.984375 -0.296875q0.421875 -0.3125 0.65625 -0.96875zm-3.453125 -1.703125l3.46875 0q-0.0625 -0.796875 -0.390625 -1.1875q-0.515625 -0.609375 -1.3125 -0.609375q-0.734375 0 -1.234375 0.484375q-0.484375 0.484375 -0.53125 1.3125zm5.876953 3.703125l0 -6.21875l0.9375 0l0 0.875q0.296875 -0.46875 0.78125 -0.734375q0.484375 -0.28125 1.109375 -0.28125q0.6875 0 1.125 0.28125q0.453125 0.28125 0.625 0.796875q0.75 -1.078125 1.921875 -1.078125q0.9375 0 1.421875 0.515625q0.5 0.5 0.5 1.578125l0 4.265625l-1.046875 0l0 -3.921875q0 -0.625 -0.109375 -0.90625q-0.09375 -0.28125 -0.359375 -0.453125q-0.265625 -0.171875 -0.640625 -0.171875q-0.65625 0 -1.09375 0.4375q-0.421875 0.4375 -0.421875 1.40625l0 3.609375l-1.0625 0l0 -4.046875q0 -0.703125 -0.265625 -1.046875q-0.25 -0.359375 -0.828125 -0.359375q-0.453125 0 -0.828125 0.234375q-0.375 0.234375 -0.546875 0.6875q-0.171875 0.453125 -0.171875 1.296875l0 3.234375l-1.046875 0zm17.392578 -2.28125l1.03125 0.140625q-0.171875 1.0625 -0.875 1.671875q-0.703125 0.609375 -1.71875 0.609375q-1.28125 0 -2.0625 -0.828125q-0.765625 -0.84375 -0.765625 -2.40625q0 -1.0 0.328125 -1.75q0.34375 -0.765625 1.015625 -1.140625q0.6875 -0.375 1.5 -0.375q1.0 0 1.640625 0.515625q0.65625 0.5 0.84375 1.453125l-1.03125 0.15625q-0.140625 -0.625 -0.515625 -0.9375q-0.375 -0.328125 -0.90625 -0.328125q-0.796875 0 -1.296875 0.578125q-0.5 0.5625 -0.5 1.796875q0 1.265625 0.484375 1.828125q0.484375 0.5625 1.25 0.5625q0.625 0 1.03125 -0.375q0.421875 -0.375 0.546875 -1.171875zm6.0000076 1.515625q-0.5937576 0.5 -1.1406326 0.703125q-0.53125 0.203125 -1.15625 0.203125q-1.03125 0 -1.578125 -0.5q-0.546875 -0.5 -0.546875 -1.28125q0 -0.453125 0.203125 -0.828125q0.203125 -0.390625 0.546875 -0.609375q0.34375 -0.234375 0.765625 -0.34375q0.296875 -0.09375 0.9375 -0.171875q1.265625 -0.140625 1.8750076 -0.359375q0 -0.21875 0 -0.265625q0 -0.65625 -0.29688263 -0.921875q-0.40625 -0.34375 -1.203125 -0.34375q-0.734375 0 -1.09375 0.265625q-0.359375 0.25 -0.53125 0.90625l-1.03125 -0.140625q0.140625 -0.65625 0.46875 -1.0625q0.328125 -0.40625 0.9375 -0.625q0.609375 -0.21875 1.40625 -0.21875q0.796875 0 1.2968826 0.1875q0.5 0.1875 0.734375 0.46875q0.234375 0.28125 0.328125 0.71875q0.046875 0.265625 0.046875 0.96875l0 1.40625q0 1.46875 0.0625 1.859375q0.078125 0.390625 0.28125 0.75l-1.109375 0q-0.15625 -0.328125 -0.203125 -0.765625zm-0.09375 -2.359375q-0.5781326 0.234375 -1.7187576 0.40625q-0.65625 0.09375 -0.921875 0.21875q-0.265625 0.109375 -0.421875 0.328125q-0.140625 0.21875 -0.140625 0.5q0 0.421875 0.3125 0.703125q0.328125 0.28125 0.9375 0.28125q0.609375 0 1.078125 -0.265625q0.484375 -0.265625 0.703125 -0.734375q0.17188263 -0.359375 0.17188263 -1.046875l0 -0.390625zm2.6738281 3.125l0 -8.59375l1.0625 0l0 8.59375l-1.0625 0zm2.6660156 0l0 -8.59375l1.0625 0l0 8.59375l-1.0625 0zm2.2753906 -1.859375l1.03125 -0.15625q0.09375 0.625 0.484375 0.953125q0.40625 0.328125 1.140625 0.328125q0.71875 0 1.0625 -0.28125q0.359375 -0.296875 0.359375 -0.703125q0 -0.359375 -0.3125 -0.5625q-0.21875 -0.140625 -1.078125 -0.359375q-1.15625 -0.296875 -1.609375 -0.5q-0.4375 -0.21875 -0.671875 -0.59375q-0.234375 -0.375 -0.234375 -0.84375q0 -0.40625 0.1875 -0.765625q0.1875 -0.359375 0.515625 -0.59375q0.25 -0.171875 0.671875 -0.296875q0.421875 -0.125 0.921875 -0.125q0.71875 0 1.265625 0.21875q0.5625 0.203125 0.828125 0.5625q0.265625 0.359375 0.359375 0.953125l-1.03125 0.140625q-0.0625 -0.46875 -0.40625 -0.734375q-0.328125 -0.28125 -0.953125 -0.28125q-0.71875 0 -1.03125 0.25q-0.3125 0.234375 -0.3125 0.5625q0 0.203125 0.125 0.359375q0.140625 0.171875 0.40625 0.28125q0.15625 0.0625 0.9375 0.265625q1.125 0.3125 1.5625 0.5q0.4375 0.1875 0.6875 0.546875q0.25 0.359375 0.25 0.90625q0 0.53125 -0.3125 1.0q-0.296875 0.453125 -0.875 0.71875q-0.578125 0.25 -1.3125 0.25q-1.21875 0 -1.859375 -0.5q-0.625 -0.515625 -0.796875 -1.5z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m89.902885 171.04015l174.83463 0l0 48.850388l-174.83463 0z" fill-rule="evenodd"/><path stroke="#6d9eeb" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m89.902885 171.04015l174.83463 0l0 48.850388l-174.83463 0z" fill-rule="evenodd"/><path fill="#000000" d="m130.0844 202.38535l0 -13.359375l1.765625 0l0 5.484375l6.9375 0l0 -5.484375l1.765625 0l0 13.359375l-1.765625 0l0 -6.296875l-6.9375 0l0 6.296875l-1.765625 0zm12.597946 -4.84375q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm8.641342 1.953125l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm13.5625 1.421875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm6.9134827 1.46875l0 -13.359375l1.78125 0l0 6.625l6.625 -6.625l2.390625 0l-5.59375 5.421875l5.84375 7.9375l-2.328125 0l-4.765625 -6.765625l-2.171875 2.140625l0 4.625l-1.78125 0zm18.943573 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm6.228302 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm17.000732 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.7656403 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375153 0 3.1562653 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.2187653 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.5468903 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.3906403 -2.65625l5.4062653 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.0312653 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.094467 5.765625l0 -13.359375l1.640625 0l0 13.359375l-1.640625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m9.1482525 232.87665l117.79527 -0.34646606" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m9.1482525 232.87665l117.79527 -0.34646606" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m233.43172 232.53018l120.34645 0.34646606" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.43172 232.53018l120.34645 0.34646606" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m126.94353 216.49869l106.48819 0l0 32.06299l-106.48819 0z" fill-rule="evenodd"/><path fill="#000000" d="m152.36165 237.33018l0 -9.546875l1.265625 0l0 3.921875l4.953125 0l0 -3.921875l1.265625 0l0 9.546875l-1.265625 0l0 -4.5l-4.953125 0l0 4.5l-1.265625 0zm13.953278 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm8.93837 0l0 -0.875q-0.65625 1.03125 -1.9375 1.03125q-0.8125 0 -1.515625 -0.453125q-0.6875 -0.453125 -1.078125 -1.265625q-0.375 -0.828125 -0.375 -1.890625q0 -1.03125 0.34375 -1.875q0.34375 -0.84375 1.03125 -1.28125q0.703125 -0.453125 1.546875 -0.453125q0.625 0 1.109375 0.265625q0.5 0.25 0.796875 0.671875l0 -3.421875l1.171875 0l0 9.546875l-1.09375 0zm-3.703125 -3.453125q0 1.328125 0.5625 1.984375q0.5625 0.65625 1.328125 0.65625q0.765625 0 1.296875 -0.625q0.53125 -0.625 0.53125 -1.90625q0 -1.421875 -0.546875 -2.078125q-0.546875 -0.671875 -1.34375 -0.671875q-0.78125 0 -1.3125 0.640625q-0.515625 0.625 -0.515625 2.0zm7.9124756 3.453125l-2.125 -6.90625l1.21875 0l1.09375 3.984375l0.421875 1.484375q0.015625 -0.109375 0.359375 -1.421875l1.09375 -4.046875l1.203125 0l1.03125 4.0l0.34375 1.328125l0.40625 -1.34375l1.171875 -3.984375l1.140625 0l-2.15625 6.90625l-1.21875 0l-1.09375 -4.140625l-0.265625 -1.171875l-1.40625 5.3125l-1.21875 0zm12.859528 -0.859375q-0.65625 0.5625 -1.265625 0.796875q-0.59375 0.21875 -1.28125 0.21875q-1.140625 0 -1.75 -0.546875q-0.609375 -0.5625 -0.609375 -1.4375q0 -0.5 0.21875 -0.921875q0.234375 -0.421875 0.609375 -0.671875q0.375 -0.25 0.84375 -0.390625q0.34375 -0.078125 1.046875 -0.171875q1.421875 -0.171875 2.09375 -0.40625q0 -0.234375 0 -0.296875q0 -0.71875 -0.328125 -1.015625q-0.453125 -0.390625 -1.34375 -0.390625q-0.8125 0 -1.21875 0.296875q-0.390625 0.28125 -0.578125 1.015625l-1.140625 -0.15625q0.15625 -0.734375 0.515625 -1.1875q0.359375 -0.453125 1.03125 -0.6875q0.671875 -0.25 1.5625 -0.25q0.890625 0 1.4375 0.203125q0.5625 0.203125 0.8125 0.53125q0.265625 0.3125 0.375 0.796875q0.046875 0.296875 0.046875 1.078125l0 1.5625q0 1.625 0.078125 2.0625q0.078125 0.4375 0.296875 0.828125l-1.21875 0q-0.1875 -0.359375 -0.234375 -0.859375zm-0.09375 -2.609375q-0.640625 0.265625 -1.921875 0.4375q-0.71875 0.109375 -1.015625 0.25q-0.296875 0.125 -0.46875 0.375q-0.15625 0.25 -0.15625 0.546875q0 0.46875 0.34375 0.78125q0.359375 0.3125 1.046875 0.3125q0.671875 0 1.203125 -0.296875q0.53125 -0.296875 0.78125 -0.8125q0.1875 -0.390625 0.1875 -1.171875l0 -0.421875zm2.9749756 3.46875l0 -6.90625l1.0625 0l0 1.046875q0.40625 -0.734375 0.734375 -0.96875q0.34375 -0.234375 0.765625 -0.234375q0.59375 0 1.203125 0.375l-0.40625 1.078125q-0.4375 -0.25 -0.859375 -0.25q-0.390625 0 -0.703125 0.234375q-0.296875 0.234375 -0.421875 0.640625q-0.203125 0.625 -0.203125 1.359375l0 3.625l-1.171875 0zm9.18837 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375z" fill-rule="nonzero"/><defs><linearGradient id="p.1" gradientUnits="userSpaceOnUse" gradientTransform="matrix(4.500288203456436 0.0 0.0 4.500288203456436 0.0 0.0)" spreadMethod="pad" x1="20.153856515233464" y1="38.20913288577608" x2="20.153840567727556" y2="42.70942108920426"><stop offset="0.0" stop-color="#ff0000"/><stop offset="0.51" stop-color="#dab7a6"/><stop offset="0.99999994" stop-color="#dab7a6" stop-opacity="0.0"/><stop offset="1.0" stop-color="#ffffff" stop-opacity="0.0"/></linearGradient></defs><path fill="url(#p.1)" d="m90.698166 171.95273l173.29134 0l0 20.251968l-173.29134 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m203.76447 87.804726l114.4252 0l0 48.850395l-114.4252 0z" fill-rule="evenodd"/><path stroke="#8e7cc3" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m203.76447 87.804726l114.4252 0l0 48.850395l-114.4252 0z" fill-rule="evenodd"/><path fill="#000000" d="m245.33514 113.91555l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0zm7.448929 0.390625q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.0468597 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.0312347 0 -3.2812347 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.81248474 0.921875 2.0468597 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.0468597 0.921875q-0.796875 0.90625 -0.796875 2.765625zm9.688217 4.84375l0 -8.40625l-1.453125 0l0 -1.265625l1.453125 0l0 -1.03125q0 -0.96875 0.171875 -1.453125q0.234375 -0.640625 0.828125 -1.03125q0.59375 -0.390625 1.671875 -0.390625q0.6875 0 1.53125 0.15625l-0.25 1.4375q-0.5 -0.09375 -0.953125 -0.09375q-0.75 0 -1.0625 0.328125q-0.3125 0.3125 -0.3125 1.1875l0 0.890625l1.890625 0l0 1.265625l-1.890625 0l0 8.40625l-1.625 0zm11.417664 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125732 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m179.05511 152.91733l37.984253 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m179.05511 152.91733l37.984253 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m306.4252 152.91733l47.338593 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m306.4252 152.91733l47.338593 0" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m217.03937 136.88583l89.38583 0l0 32.06299l-89.38583 0z" fill-rule="evenodd"/><path fill="#000000" d="m226.58624 154.67169l1.0625 -0.09375q0.078125 0.65625 0.359375 1.0625q0.28125 0.40625 0.859375 0.671875q0.59375 0.25 1.328125 0.25q0.640625 0 1.140625 -0.1875q0.5 -0.203125 0.734375 -0.53125q0.25 -0.34375 0.25 -0.734375q0 -0.40625 -0.234375 -0.703125q-0.234375 -0.3125 -0.765625 -0.515625q-0.359375 -0.140625 -1.546875 -0.421875q-1.171875 -0.28125 -1.640625 -0.53125q-0.625 -0.328125 -0.921875 -0.796875q-0.296875 -0.484375 -0.296875 -1.078125q0 -0.640625 0.359375 -1.203125q0.375 -0.578125 1.078125 -0.859375q0.71875 -0.296875 1.578125 -0.296875q0.953125 0 1.6875 0.3125q0.734375 0.296875 1.125 0.90625q0.390625 0.59375 0.421875 1.34375l-1.09375 0.078125q-0.09375 -0.8125 -0.609375 -1.21875q-0.5 -0.421875 -1.484375 -0.421875q-1.03125 0 -1.5 0.375q-0.46875 0.375 -0.46875 0.90625q0 0.46875 0.328125 0.765625q0.328125 0.296875 1.703125 0.609375q1.390625 0.3125 1.90625 0.546875q0.75 0.359375 1.109375 0.890625q0.359375 0.515625 0.359375 1.21875q0 0.6875 -0.390625 1.296875q-0.390625 0.59375 -1.125 0.9375q-0.734375 0.328125 -1.65625 0.328125q-1.171875 0 -1.96875 -0.328125q-0.78125 -0.34375 -1.234375 -1.03125q-0.4375 -0.6875 -0.453125 -1.546875zm8.207031 5.15625l-0.125 -0.984375q0.34375 0.09375 0.609375 0.09375q0.34375 0 0.546875 -0.125q0.21875 -0.109375 0.359375 -0.3125q0.09375 -0.171875 0.328125 -0.796875q0.015625 -0.078125 0.09375 -0.25l-2.375 -6.234375l1.140625 0l1.296875 3.59375q0.25 0.6875 0.453125 1.453125q0.1875 -0.734375 0.4375 -1.421875l1.328125 -3.625l1.046875 0l-2.359375 6.328125q-0.390625 1.015625 -0.59375 1.40625q-0.28125 0.53125 -0.65625 0.765625q-0.359375 0.25 -0.859375 0.25q-0.296875 0 -0.671875 -0.140625zm5.625 -4.25l1.03125 -0.15625q0.09375 0.625 0.484375 0.953125q0.40625 0.328125 1.140625 0.328125q0.71875 0 1.0625 -0.28125q0.359375 -0.296875 0.359375 -0.703125q0 -0.359375 -0.3125 -0.5625q-0.21875 -0.140625 -1.078125 -0.359375q-1.15625 -0.296875 -1.609375 -0.5q-0.4375 -0.21875 -0.671875 -0.59375q-0.234375 -0.375 -0.234375 -0.84375q0 -0.40625 0.1875 -0.765625q0.1875 -0.359375 0.515625 -0.59375q0.25 -0.171875 0.671875 -0.296875q0.421875 -0.125 0.921875 -0.125q0.71875 0 1.265625 0.21875q0.5625 0.203125 0.828125 0.5625q0.265625 0.359375 0.359375 0.953125l-1.03125 0.140625q-0.0625 -0.46875 -0.40625 -0.734375q-0.328125 -0.28125 -0.953125 -0.28125q-0.71875 0 -1.03125 0.25q-0.3125 0.234375 -0.3125 0.5625q0 0.203125 0.125 0.359375q0.140625 0.171875 0.40625 0.28125q0.15625 0.0625 0.9375 0.265625q1.125 0.3125 1.5625 0.5q0.4375 0.1875 0.6875 0.546875q0.25 0.359375 0.25 0.90625q0 0.53125 -0.3125 1.0q-0.296875 0.453125 -0.875 0.71875q-0.578125 0.25 -1.3125 0.25q-1.21875 0 -1.859375 -0.5q-0.625 -0.515625 -0.796875 -1.5zm8.71875 0.921875l0.15625 0.921875q-0.453125 0.09375 -0.796875 0.09375q-0.578125 0 -0.890625 -0.171875q-0.3125 -0.1875 -0.453125 -0.484375q-0.125 -0.296875 -0.125 -1.25l0 -3.578125l-0.765625 0l0 -0.8125l0.765625 0l0 -1.546875l1.046875 -0.625l0 2.171875l1.0625 0l0 0.8125l-1.0625 0l0 3.640625q0 0.453125 0.046875 0.578125q0.0625 0.125 0.1875 0.203125q0.125 0.078125 0.359375 0.078125q0.1875 0 0.46875 -0.03125zm5.2871094 -1.0625l1.09375 0.125q-0.25 0.953125 -0.953125 1.484375q-0.703125 0.53125 -1.78125 0.53125q-1.359375 0 -2.171875 -0.84375q-0.796875 -0.84375 -0.796875 -2.359375q0 -1.5625 0.8125 -2.421875q0.8125 -0.875 2.09375 -0.875q1.25 0 2.03125 0.84375q0.796875 0.84375 0.796875 2.390625q0 0.09375 0 0.28125l-4.640625 0q0.0625 1.03125 0.578125 1.578125q0.515625 0.53125 1.296875 0.53125q0.578125 0 0.984375 -0.296875q0.421875 -0.3125 0.65625 -0.96875zm-3.453125 -1.703125l3.46875 0q-0.0625 -0.796875 -0.390625 -1.1875q-0.515625 -0.609375 -1.3125 -0.609375q-0.734375 0 -1.234375 0.484375q-0.484375 0.484375 -0.53125 1.3125zm5.876953 3.703125l0 -6.21875l0.9375 0l0 0.875q0.296875 -0.46875 0.78125 -0.734375q0.484375 -0.28125 1.109375 -0.28125q0.6875 0 1.125 0.28125q0.453125 0.28125 0.625 0.796875q0.75 -1.078125 1.921875 -1.078125q0.9375 0 1.421875 0.515625q0.5 0.5 0.5 1.578125l0 4.265625l-1.046875 0l0 -3.921875q0 -0.625 -0.109375 -0.90625q-0.09375 -0.28125 -0.359375 -0.453125q-0.265625 -0.171875 -0.640625 -0.171875q-0.65625 0 -1.09375 0.4375q-0.421875 0.4375 -0.421875 1.40625l0 3.609375l-1.0625 0l0 -4.046875q0 -0.703125 -0.265625 -1.046875q-0.25 -0.359375 -0.828125 -0.359375q-0.453125 0 -0.828125 0.234375q-0.375 0.234375 -0.546875 0.6875q-0.171875 0.453125 -0.171875 1.296875l0 3.234375l-1.046875 0zm17.392578 -2.28125l1.03125 0.140625q-0.171875 1.0625 -0.875 1.671875q-0.703125 0.609375 -1.71875 0.609375q-1.28125 0 -2.0625 -0.828125q-0.765625 -0.84375 -0.765625 -2.40625q0 -1.0 0.328125 -1.75q0.34375 -0.765625 1.015625 -1.140625q0.6875 -0.375 1.5 -0.375q1.0 0 1.640625 0.515625q0.65625 0.5 0.84375 1.453125l-1.03125 0.15625q-0.140625 -0.625 -0.515625 -0.9375q-0.375 -0.328125 -0.90625 -0.328125q-0.796875 0 -1.296875 0.578125q-0.5 0.5625 -0.5 1.796875q0 1.265625 0.484375 1.828125q0.484375 0.5625 1.25 0.5625q0.625 0 1.03125 -0.375q0.421875 -0.375 0.546875 -1.171875zm6.0 1.515625q-0.59375 0.5 -1.140625 0.703125q-0.53125 0.203125 -1.15625 0.203125q-1.03125 0 -1.578125 -0.5q-0.546875 -0.5 -0.546875 -1.28125q0 -0.453125 0.203125 -0.828125q0.203125 -0.390625 0.546875 -0.609375q0.34375 -0.234375 0.765625 -0.34375q0.296875 -0.09375 0.9375 -0.171875q1.265625 -0.140625 1.875 -0.359375q0 -0.21875 0 -0.265625q0 -0.65625 -0.296875 -0.921875q-0.40625 -0.34375 -1.203125 -0.34375q-0.734375 0 -1.09375 0.265625q-0.359375 0.25 -0.53125 0.90625l-1.03125 -0.140625q0.140625 -0.65625 0.46875 -1.0625q0.328125 -0.40625 0.9375 -0.625q0.609375 -0.21875 1.40625 -0.21875q0.796875 0 1.296875 0.1875q0.5 0.1875 0.734375 0.46875q0.234375 0.28125 0.328125 0.71875q0.046875 0.265625 0.046875 0.96875l0 1.40625q0 1.46875 0.0625 1.859375q0.078125 0.390625 0.28125 0.75l-1.109375 0q-0.15625 -0.328125 -0.203125 -0.765625zm-0.09375 -2.359375q-0.578125 0.234375 -1.71875 0.40625q-0.65625 0.09375 -0.921875 0.21875q-0.265625 0.109375 -0.421875 0.328125q-0.140625 0.21875 -0.140625 0.5q0 0.421875 0.3125 0.703125q0.328125 0.28125 0.9375 0.28125q0.609375 0 1.078125 -0.265625q0.484375 -0.265625 0.703125 -0.734375q0.171875 -0.359375 0.171875 -1.046875l0 -0.390625zm2.6738281 3.125l0 -8.59375l1.0625 0l0 8.59375l-1.0625 0zm2.6660156 0l0 -8.59375l1.0625 0l0 8.59375l-1.0625 0zm2.2753906 -1.859375l1.03125 -0.15625q0.09375 0.625 0.484375 0.953125q0.40625 0.328125 1.140625 0.328125q0.71875 0 1.0625 -0.28125q0.359375 -0.296875 0.359375 -0.703125q0 -0.359375 -0.3125 -0.5625q-0.21875 -0.140625 -1.078125 -0.359375q-1.15625 -0.296875 -1.609375 -0.5q-0.4375 -0.21875 -0.671875 -0.59375q-0.234375 -0.375 -0.234375 -0.84375q0 -0.40625 0.1875 -0.765625q0.1875 -0.359375 0.515625 -0.59375q0.25 -0.171875 0.671875 -0.296875q0.421875 -0.125 0.921875 -0.125q0.71875 0 1.265625 0.21875q0.5625 0.203125 0.828125 0.5625q0.265625 0.359375 0.359375 0.953125l-1.03125 0.140625q-0.0625 -0.46875 -0.40625 -0.734375q-0.328125 -0.28125 -0.953125 -0.28125q-0.71875 0 -1.03125 0.25q-0.3125 0.234375 -0.3125 0.5625q0 0.203125 0.125 0.359375q0.140625 0.171875 0.40625 0.28125q0.15625 0.0625 0.9375 0.265625q1.125 0.3125 1.5625 0.5q0.4375 0.1875 0.6875 0.546875q0.25 0.359375 0.25 0.90625q0 0.53125 -0.3125 1.0q-0.296875 0.453125 -0.875 0.71875q-0.578125 0.25 -1.3125 0.25q-1.21875 0 -1.859375 -0.5q-0.625 -0.515625 -0.796875 -1.5z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m150.87927 111.83202l52.88188 0.40944672" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m154.30624 111.85855l46.027924 0.35638428" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m154.30624 111.85856l1.133255 -1.1158447l-3.0983734 1.1006241l3.0809631 1.1484756z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m200.33417 112.214935l-1.133255 1.1158447l3.0983887 -1.1006317l-3.0809784 -1.148468z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m159.04854 85.32021l37.417328 0l0 32.06299l-37.417328 0z" fill-rule="evenodd"/><path fill="#000000" d="m168.78291 104.91708l1.125 -0.109375q0.140625 0.796875 0.546875 1.15625q0.40625 0.359375 1.03125 0.359375q0.53125 0 0.9375 -0.25q0.421875 -0.25 0.671875 -0.65625q0.265625 -0.421875 0.4375 -1.125q0.171875 -0.703125 0.171875 -1.421875q0 -0.078125 0 -0.234375q-0.359375 0.546875 -0.96875 0.90625q-0.59375 0.34375 -1.3125 0.34375q-1.1875 0 -2.015625 -0.859375q-0.8125 -0.859375 -0.8125 -2.265625q0 -1.453125 0.859375 -2.328125q0.859375 -0.890625 2.140625 -0.890625q0.9375 0 1.703125 0.5q0.78125 0.5 1.171875 1.4375q0.40625 0.921875 0.40625 2.671875q0 1.828125 -0.40625 2.921875q-0.390625 1.078125 -1.171875 1.640625q-0.78125 0.5625 -1.84375 0.5625q-1.109375 0 -1.828125 -0.609375q-0.703125 -0.625 -0.84375 -1.75zm4.796875 -4.21875q0 -1.0 -0.546875 -1.59375q-0.53125 -0.59375 -1.28125 -0.59375q-0.78125 0 -1.375 0.640625q-0.578125 0.640625 -0.578125 1.65625q0 0.90625 0.546875 1.484375q0.5625 0.5625 1.359375 0.5625q0.828125 0 1.34375 -0.5625q0.53125 -0.578125 0.53125 -1.59375zm2.9124756 6.421875l0 -9.546875l3.59375 0q0.953125 0 1.453125 0.09375q0.703125 0.125 1.171875 0.453125q0.484375 0.328125 0.765625 0.921875q0.296875 0.59375 0.296875 1.296875q0 1.21875 -0.78125 2.0625q-0.765625 0.84375 -2.796875 0.84375l-2.4375 0l0 3.875l-1.265625 0zm1.265625 -5.0l2.453125 0q1.234375 0 1.75 -0.453125q0.515625 -0.46875 0.515625 -1.28125q0 -0.609375 -0.3125 -1.03125q-0.296875 -0.421875 -0.796875 -0.5625q-0.3125 -0.09375 -1.171875 -0.09375l-2.4375 0l0 3.421875z" fill-rule="nonzero"/></g></svg>
+\ No newline at end of file
diff --git a/g3doc/architecture_guide/BUILD b/g3doc/architecture_guide/BUILD
new file mode 100644
index 000000000..404f627a4
--- /dev/null
+++ b/g3doc/architecture_guide/BUILD
@@ -0,0 +1,50 @@
+load("//website:defs.bzl", "doc")
+
+package(
+    default_visibility = ["//website:__pkg__"],
+    licenses = ["notice"],
+)
+
+doc(
+    name = "platforms",
+    src = "platforms.md",
+    category = "Architecture Guide",
+    data = [
+        "platforms.png",
+        "platforms.svg",
+    ],
+    permalink = "/docs/architecture_guide/platforms/",
+    weight = "40",
+)
+
+doc(
+    name = "resources",
+    src = "resources.md",
+    category = "Architecture Guide",
+    data = [
+        "resources.png",
+        "resources.svg",
+    ],
+    permalink = "/docs/architecture_guide/resources/",
+    weight = "30",
+)
+
+doc(
+    name = "security",
+    src = "security.md",
+    category = "Architecture Guide",
+    data = [
+        "security.png",
+        "security.svg",
+    ],
+    permalink = "/docs/architecture_guide/security/",
+    weight = "10",
+)
+
+doc(
+    name = "performance",
+    src = "performance.md",
+    category = "Architecture Guide",
+    permalink = "/docs/architecture_guide/performance/",
+    weight = "20",
+)
diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md
new file mode 100644
index 000000000..39dbb0045
--- /dev/null
+++ b/g3doc/architecture_guide/performance.md
@@ -0,0 +1,277 @@
+# Performance Guide
+
+[TOC]
+
+gVisor is designed to provide a secure, virtualized environment while preserving
+key benefits of containerization, such as small fixed overheads and a dynamic
+resource footprint. For containerized infrastructure, this can provide a
+turn-key solution for sandboxing untrusted workloads: there are no changes to
+the fundamental resource model.
+
+gVisor imposes runtime costs over native containers. These costs come in two
+forms: additional cycles and memory usage, which may manifest as increased
+latency, reduced throughput or density, or not at all. In general, these costs
+come from two different sources.
+
+First, the existence of the [Sentry](../README.md#sentry) means that additional
+memory will be required, and application system calls must traverse additional
+layers of software. The design emphasizes
+[security](/docs/architecture_guide/security/) and therefore we chose to use a
+language for the Sentry that provides benefits in this domain but may not yet
+offer the raw performance of other choices. Costs imposed by these design
+choices are **structural costs**.
+
+Second, as gVisor is an independent implementation of the system call surface,
+many of the subsystems or specific calls are not as optimized as more mature
+implementations. A good example here is the network stack, which is continuing
+to evolve but does not support all the advanced recovery mechanisms offered by
+other stacks and is less CPU efficient. This is an **implementation cost** and
+is distinct from **structural costs**. Improvements here are ongoing and driven
+by the workloads that matter to gVisor users and contributors.
+
+This page provides a guide for understanding baseline performance, and calls out
+distint **structural costs** and **implementation costs**, highlighting where
+improvements are possible and not possible.
+
+While we include a variety of workloads here, it’s worth emphasizing that gVisor
+may not be an appropriate solution for every workload, for reasons other than
+performance. For example, a sandbox may provide minimal benefit for a trusted
+database, since _user data would already be inside the sandbox_ and there is no
+need for an attacker to break out in the first place.
+
+## Methodology
+
+All data below was generated using the [benchmark tools][benchmark-tools]
+repository, and the machines under test are uniform [Google Compute Engine][gce]
+Virtual Machines (VMs) with the following specifications:
+
+    Machine type: n1-standard-4 (broadwell)
+    Image: Debian GNU/Linux 9 (stretch) 4.19.0-0
+    BootDisk: 2048GB SSD persistent disk
+
+Through this document, `runsc` is used to indicate the runtime provided by
+gVisor. When relevant, we use the name `runsc-platform` to describe a specific
+[platform choice](/docs/architecture_guide/platforms/).
+
+**Except where specified, all tests below are conducted with the `ptrace`
+platform. The `ptrace` platform works everywhere and does not require hardware
+virtualization or kernel modifications but suffers from the highest structural
+costs by far. This platform is used to provide a clear understanding of the
+performance model, but in no way represents an ideal scenario. In the future,
+this guide will be extended to bare metal environments and include additional
+platforms.**
+
+## Memory access
+
+gVisor does not introduce any additional costs with respect to raw memory
+accesses. Page faults and other Operating System (OS) mechanisms are translated
+through the Sentry, but once mappings are installed and available to the
+application, there is no additional overhead.
+
+{% include graph.html id="sysbench-memory"
+url="/performance/sysbench-memory.csv" title="perf.py sysbench.memory
+--runtime=runc --runtime=runsc" %}
+
+The above figure demonstrates the memory transfer rate as measured by
+`sysbench`.
+
+## Memory usage
+
+The Sentry provides an additional layer of indirection, and it requires memory
+in order to store state associated with the application. This memory generally
+consists of a fixed component, plus an amount that varies with the usage of
+operating system resources (e.g. how many sockets or files are opened).
+
+For many use cases, fixed memory overheads are a primary concern. This may be
+because sandboxed containers handle a low volume of requests, and it is
+therefore important to achieve high densities for efficiency.
+
+{% include graph.html id="density" url="/performance/density.csv" title="perf.py
+density --runtime=runc --runtime=runsc" log="true" y_min="100000" %}
+
+The above figure demonstrates these costs based on three sample applications.
+This test is the result of running many instances of a container (50, or 5 in
+the case of redis) and calculating available memory on the host before and
+afterwards, and dividing the difference by the number of containers. This
+technique is used for measuring memory usage over the `usage_in_bytes` value of
+the container cgroup because we found that some container runtimes, other than
+`runc` and `runsc`, do not use an individual container cgroup.
+
+The first application is an instance of `sleep`: a trivial application that does
+nothing. The second application is a synthetic `node` application which imports
+a number of modules and listens for requests. The third application is a similar
+synthetic `ruby` application which does the same. Finally, we include an
+instance of `redis` storing approximately 1GB of data. In all cases, the sandbox
+itself is responsible for a small, mostly fixed amount of memory overhead.
+
+## CPU performance
+
+gVisor does not perform emulation or otherwise interfere with the raw execution
+of CPU instructions by the application. Therefore, there is no runtime cost
+imposed for CPU operations.
+
+{% include graph.html id="sysbench-cpu" url="/performance/sysbench-cpu.csv"
+title="perf.py sysbench.cpu --runtime=runc --runtime=runsc" %}
+
+The above figure demonstrates the `sysbench` measurement of CPU events per
+second. Events per second is based on a CPU-bound loop that calculates all prime
+numbers in a specified range. We note that `runsc` does not impose a performance
+penalty, as the code is executing natively in both cases.
+
+This has important consequences for classes of workloads that are often
+CPU-bound, such as data processing or machine learning. In these cases, `runsc`
+will similarly impose minimal runtime overhead.
+
+{% include graph.html id="tensorflow" url="/performance/tensorflow.csv"
+title="perf.py tensorflow --runtime=runc --runtime=runsc" %}
+
+For example, the above figure shows a sample TensorFlow workload, the
+[convolutional neural network example][cnn]. The time indicated includes the
+full start-up and run time for the workload, which trains a model.
+
+## System calls
+
+Some **structural costs** of gVisor are heavily influenced by the
+[platform choice](/docs/architecture_guide/platforms/), which implements system
+call interception. Today, gVisor supports a variety of platforms. These
+platforms present distinct performance, compatibility and security trade-offs.
+For example, the KVM platform has low overhead system call interception but runs
+poorly with nested virtualization.
+
+{% include graph.html id="syscall" url="/performance/syscall.csv" title="perf.py
+syscall --runtime=runc --runtime=runsc-ptrace --runtime=runsc-kvm" y_min="100"
+log="true" %}
+
+The above figure demonstrates the time required for a raw system call on various
+platforms. The test is implemented by a custom binary which performs a large
+number of system calls and calculates the average time required.
+
+This cost will principally impact applications that are system call bound, which
+tend to be high-performance data stores and static network services. In general,
+the impact of system call interception will be lower the more work an
+application does.
+
+{% include graph.html id="redis" url="/performance/redis.csv" title="perf.py
+redis --runtime=runc --runtime=runsc" %}
+
+For example, `redis` is an application that performs relatively little work in
+userspace: in general it reads from a connected socket, reads or modifies some
+data, and writes a result back to the socket. The above figure shows the results
+of running [comprehensive set of benchmarks][redis-benchmark]. We can see that
+small operations impose a large overhead, while larger operations, such as
+`LRANGE`, where more work is done in the application, have a smaller relative
+overhead.
+
+Some of these costs above are **structural costs**, and `redis` is likely to
+remain a challenging performance scenario. However, optimizing the
+[platform](/docs/architecture_guide/platforms/) will also have a dramatic
+impact.
+
+## Start-up time
+
+For many use cases, the ability to spin-up containers quickly and efficiently is
+important. A sandbox may be short-lived and perform minimal user work (e.g. a
+function invocation).
+
+{% include graph.html id="startup" url="/performance/startup.csv" title="perf.py
+startup --runtime=runc --runtime=runsc" %}
+
+The above figure indicates how total time required to start a container through
+[Docker][docker]. This benchmark uses three different applications. First, an
+alpine Linux-container that executes `true`. Second, a `node` application that
+loads a number of modules and binds an HTTP server. The time is measured by a
+successful request to the bound port. Finally, a `ruby` application that
+similarly loads a number of modules and binds an HTTP server.
+
+> Note: most of the time overhead above is associated Docker itself. This is
+> evident with the empty `runc` benchmark. To avoid these costs with `runsc`,
+> you may also consider using `runsc do` mode or invoking the
+> [OCI runtime](../user_guide/quick_start/oci.md) directly.
+
+## Network
+
+Networking is mostly bound by **implementation costs**, and gVisor's network
+stack is improving quickly.
+
+While typically not an important metric in practice for common sandbox use
+cases, nevertheless `iperf` is a common microbenchmark used to measure raw
+throughput.
+
+{% include graph.html id="iperf" url="/performance/iperf.csv" title="perf.py
+iperf --runtime=runc --runtime=runsc" %}
+
+The above figure shows the result of an `iperf` test between two instances. For
+the upload case, the specified runtime is used for the `iperf` client, and in
+the download case, the specified runtime is the server. A native runtime is
+always used for the other endpoint in the test.
+
+{% include graph.html id="applications" metric="requests_per_second"
+url="/performance/applications.csv" title="perf.py http.(node|ruby)
+--connections=25 --runtime=runc --runtime=runsc" %}
+
+The above figure shows the result of simple `node` and `ruby` web services that
+render a template upon receiving a request. Because these synthetic benchmarks
+do minimal work per request, must like the `redis` case, they suffer from high
+overheads. In practice, the more work an application does the smaller the impact
+of **structural costs** become.
+
+## File system
+
+Some aspects of file system performance are also reflective of **implementation
+costs**, and an area where gVisor's implementation is improving quickly.
+
+In terms of raw disk I/O, gVisor does not introduce significant fundamental
+overhead. For general file operations, gVisor introduces a small fixed overhead
+for data that transitions across the sandbox boundary. This manifests as
+**structural costs** in some cases, since these operations must be routed
+through the [Gofer](../README.md#gofer) as a result of our
+[Security Model](/docs/architecture_guide/security/), but in most cases are
+dominated by **implementation costs**, due to an internal
+[Virtual File System][vfs] (VFS) implementation that needs improvement.
+
+{% include graph.html id="fio-bw" url="/performance/fio.csv" title="perf.py fio
+--engine=sync --runtime=runc --runtime=runsc" log="true" %}
+
+The above figures demonstrate the results of `fio` for reads and writes to and
+from the disk. In this case, the disk quickly becomes the bottleneck and
+dominates other costs.
+
+{% include graph.html id="fio-tmpfs-bw" url="/performance/fio-tmpfs.csv"
+title="perf.py fio --engine=sync --runtime=runc --tmpfs=True --runtime=runsc"
+log="true" %}
+
+The above figure shows the raw I/O performance of using a `tmpfs` mount which is
+sandbox-internal in the case of `runsc`. Generally these operations are
+similarly bound to the cost of copying around data in-memory, and we don't see
+the cost of VFS operations.
+
+{% include graph.html id="httpd100k" metric="transfer_rate"
+url="/performance/httpd100k.csv" title="perf.py http.httpd --connections=1
+--connections=5 --connections=10 --connections=25 --runtime=runc
+--runtime=runsc" %}
+
+The high costs of VFS operations can manifest in benchmarks that execute many
+such operations in the hot path for serving requests, for example. The above
+figure shows the result of using gVisor to serve small pieces of static content
+with predictably poor results. This workload represents `apache` serving a
+single file sized 100k from the container image to a client running
+[ApacheBench][ab] with varying levels of concurrency. The high overhead comes
+principally from the VFS implementation that needs improvement, with several
+internal serialization points (since all requests are reading the same file).
+Note that some of some of network stack performance issues also impact this
+benchmark.
+
+{% include graph.html id="ffmpeg" url="/performance/ffmpeg.csv" title="perf.py
+media.ffmpeg --runtime=runc --runtime=runsc" %}
+
+For benchmarks that are bound by raw disk I/O and a mix of compute, file system
+operations are less of an issue. The above figure shows the total time required
+for an `ffmpeg` container to start, load and transcode a 27MB input video.
+
+[ab]: https://en.wikipedia.org/wiki/ApacheBench
+[benchmark-tools]: https://github.com/google/gvisor/tree/master/benchmarks
+[gce]: https://cloud.google.com/compute/
+[cnn]: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/convolutional_network.py
+[docker]: https://docker.io
+[redis-benchmark]: https://redis.io/topics/benchmarks
+[vfs]: https://en.wikipedia.org/wiki/Virtual_file_system
diff --git a/g3doc/architecture_guide/platforms.md b/g3doc/architecture_guide/platforms.md
new file mode 100644
index 000000000..d112c9a28
--- /dev/null
+++ b/g3doc/architecture_guide/platforms.md
@@ -0,0 +1,61 @@
+# Platform Guide
+
+[TOC]
+
+gVisor requires a platform to implement interception of syscalls, basic context
+switching, and memory mapping functionality. Internally, gVisor uses an
+abstraction sensibly called [Platform][platform]. A simplified version of this
+interface looks like:
+
+```golang
+type Platform interface {
+    NewAddressSpace() (AddressSpace, error)
+    NewContext() Context
+}
+
+type Context interface {
+    Switch(as AddressSpace, ac arch.Context) (..., error)
+}
+
+type AddressSpace interface {
+    MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, ...) error
+    Unmap(addr usermem.Addr, length uint64)
+}
+```
+
+There are a number of different ways to implement this interface that come with
+various trade-offs, generally around performance and hardware requirements.
+
+## Implementations
+
+The choice of platform depends on the context in which `runsc` is executing. In
+general, virtualized platforms may be limited to platforms that do not require
+hardware virtualized support (since the hardware is already in use):
+
+![Platforms](platforms.png "Platform examples.")
+
+### ptrace
+
+The ptrace platform uses [PTRACE_SYSEMU][ptrace] to execute user code without
+allowing it to execute host system calls. This platform can run anywhere that
+`ptrace` works (even VMs without nested virtualization), which is ubiquitous.
+
+Unfortunately, the ptrace platform has high context switch overhead, so system
+call-heavy applications may pay a [performance penalty](./performance.md).
+
+### KVM
+
+The KVM platform uses the kernel's [KVM][kvm] functionality to allow the Sentry
+to act as both guest OS and VMM. The KVM platform can run on bare-metal or in a
+VM with nested virtualization enabled. While there is no virtualized hardware
+layer -- the sandbox retains a process model -- gVisor leverages virtualization
+extensions available on modern processors in order to improve isolation and
+performance of address space switches.
+
+## Changing Platforms
+
+See [Changing Platforms](../user_guide/platforms.md).
+
+[kvm]: https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt
+[platform]: https://cs.opensource.google/gvisor/gvisor/+/release-20190304.1:pkg/sentry/platform/platform.go;l=33
+[ptrace]: http://man7.org/linux/man-pages/man2/ptrace.2.html
diff --git a/g3doc/architecture_guide/platforms.png b/g3doc/architecture_guide/platforms.png
new file mode 100644
index 000000000..005d56feb
--- /dev/null
+++ b/g3doc/architecture_guide/platforms.png
diff --git a/g3doc/architecture_guide/platforms.svg b/g3doc/architecture_guide/platforms.svg
new file mode 100644
index 000000000..b0bac9ba7
--- /dev/null
+++ b/g3doc/architecture_guide/platforms.svg
@@ -0,0 +1,334 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="142.67763mm"
+   height="67.063133mm"
+   viewBox="0 0 142.67763 67.063134"
+   version="1.1"
+   id="svg8"
+   inkscape:export-filename="/home/ascannell/resources.png"
+   inkscape:export-xdpi="53.50127"
+   inkscape:export-ydpi="53.50127"
+   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
+   sodipodi:docname="platforms.svg">
+  <defs
+     id="defs2" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.98994949"
+     inkscape:cx="86.443612"
+     inkscape:cy="102.88104"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:window-width="1920"
+     inkscape:window-height="1005"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1" />
+  <metadata
+     id="metadata5">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-36.081387,-98.953278)">
+    <rect
+       id="rect10"
+       width="33.408691"
+       height="33.408691"
+       x="36.081387"
+       y="120.06757"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <rect
+       style="fill:#b3b3b3;stroke-width:0.23881446"
+       id="rect16"
+       width="142.45465"
+       height="10.423517"
+       x="36.08139"
+       y="155.5929" />
+    <rect
+       id="rect10-7"
+       width="30.52453"
+       height="18.976137"
+       x="37.416695"
+       y="121.65508"
+       style="fill:#ff8080;stroke-width:0.19060372" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="41.03727"
+       y="148.58765"
+       id="text65"><tspan
+         sodipodi:role="line"
+         id="tspan63"
+         x="41.03727"
+         y="148.58765"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="45.473087"
+       y="132.50232"
+       id="text123"><tspan
+         sodipodi:role="line"
+         id="tspan121"
+         x="45.473087"
+         y="132.50232"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:6.43922186px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.16098055"
+       x="97.768547"
+       y="163.15665"
+       id="text163"><tspan
+         sodipodi:role="line"
+         id="tspan161"
+         x="97.768547"
+         y="163.15665"
+         style="stroke-width:0.16098055">host</tspan></text>
+    <rect
+       style="fill:#e9afdd;stroke-width:0.39185274"
+       id="rect16-7"
+       width="72.9646"
+       height="54.79026"
+       x="105.79441"
+       y="98.953278" />
+    <rect
+       id="rect10-5"
+       width="33.408691"
+       height="33.408691"
+       x="108.24348"
+       y="100.53072"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <rect
+       id="rect10-7-6"
+       width="30.52453"
+       height="20.045216"
+       x="109.57877"
+       y="102.11823"
+       style="fill:#ff8080;stroke-width:0.19589928" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="112.86765"
+       y="129.01863"
+       id="text65-2"><tspan
+         sodipodi:role="line"
+         id="tspan63-9"
+         x="112.86765"
+         y="129.01863"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="117.63519"
+       y="114.02371"
+       id="text123-1"><tspan
+         sodipodi:role="line"
+         id="tspan121-2"
+         x="117.63519"
+         y="114.02371"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <rect
+       id="rect10-7-7"
+       width="11.815663"
+       height="8.0126781"
+       x="54.538059"
+       y="143.27702"
+       style="fill:#aaccff;stroke-width:0.07705856" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:4.35074377px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.10876859"
+       x="55.931114"
+       y="148.90578"
+       id="text144"><tspan
+         sodipodi:role="line"
+         id="tspan142"
+         x="55.931114"
+         y="148.90578"
+         style="stroke-width:0.10876859">KVM</tspan></text>
+    <rect
+       id="rect10-6"
+       width="33.408691"
+       height="33.408691"
+       x="71.044685"
+       y="119.73112"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <rect
+       id="rect10-7-0"
+       width="30.52453"
+       height="18.976137"
+       x="72.37999"
+       y="121.31865"
+       style="fill:#ff8080;stroke-width:0.19060372" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="76.000565"
+       y="148.25128"
+       id="text65-6"><tspan
+         sodipodi:role="line"
+         id="tspan63-2"
+         x="76.000565"
+         y="148.25128"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="80.436386"
+       y="132.16595"
+       id="text123-6"><tspan
+         sodipodi:role="line"
+         id="tspan121-1"
+         x="80.436386"
+         y="132.16595"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <rect
+       id="rect10-7-7-8"
+       width="11.815664"
+       height="8.0126781"
+       x="89.501358"
+       y="142.94067"
+       style="fill:#ffeeaa;stroke-width:0.07705856" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416"
+       x="89.92292"
+       y="147.89806"
+       id="text144-7"><tspan
+         sodipodi:role="line"
+         id="tspan142-9"
+         x="89.92292"
+         y="147.89806"
+         style="stroke-width:0.08486416">ptrace</tspan></text>
+    <rect
+       id="rect10-7-7-8-3"
+       width="11.815665"
+       height="8.0126781"
+       x="127.08897"
+       y="123.97878"
+       style="fill:#ffeeaa;stroke-width:0.07705856" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416"
+       x="127.51052"
+       y="128.9362"
+       id="text144-7-7"><tspan
+         sodipodi:role="line"
+         id="tspan142-9-5"
+         x="127.51052"
+         y="128.9362"
+         style="stroke-width:0.08486416">ptrace</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:5.45061255px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.13626531"
+       x="138.49318"
+       y="152.11841"
+       id="text229"><tspan
+         sodipodi:role="line"
+         id="tspan227"
+         x="138.49318"
+         y="152.11841"
+         style="stroke-width:0.13626531">VM</tspan></text>
+    <rect
+       style="fill:#b3b3b3;stroke-width:0.16518368"
+       id="rect16-9"
+       width="68.15374"
+       height="10.423517"
+       x="108.24348"
+       y="134.99774" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:6.17854786px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.15446369"
+       x="132.91473"
+       y="142.07658"
+       id="text248"><tspan
+         sodipodi:role="line"
+         id="tspan246"
+         x="132.91473"
+         y="142.07658"
+         style="stroke-width:0.15446369">guest</tspan></text>
+    <rect
+       id="rect10-5-2"
+       width="33.408691"
+       height="33.408691"
+       x="143.32402"
+       y="100.35877"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <rect
+       id="rect10-7-6-2"
+       width="30.52453"
+       height="20.045216"
+       x="144.65933"
+       y="101.94627"
+       style="fill:#ff8080;stroke-width:0.19589929" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="147.94815"
+       y="128.84665"
+       id="text65-2-8"><tspan
+         sodipodi:role="line"
+         id="tspan63-9-9"
+         x="147.94815"
+         y="128.84665"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="152.71565"
+       y="113.85176"
+       id="text123-1-7"><tspan
+         sodipodi:role="line"
+         id="tspan121-2-3"
+         x="152.71565"
+         y="113.85176"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <rect
+       id="rect10-7-7-8-3-6"
+       width="11.815666"
+       height="8.0126781"
+       x="162.16933"
+       y="123.80682"
+       style="fill:#ffeeaa;stroke-width:0.07705856" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416"
+       x="162.59088"
+       y="128.76421"
+       id="text144-7-7-1"><tspan
+         sodipodi:role="line"
+         id="tspan142-9-5-2"
+         x="162.59088"
+         y="128.76421"
+         style="stroke-width:0.08486416">ptrace</tspan></text>
+  </g>
+</svg>
diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md
new file mode 100644
index 000000000..1dec37bd1
--- /dev/null
+++ b/g3doc/architecture_guide/resources.md
@@ -0,0 +1,144 @@
+# Resource Model
+
+[TOC]
+
+The resource model for gVisor does not assume a fixed number of threads of
+execution (i.e. vCPUs) or amount of physical memory. Where possible, decisions
+about underlying physical resources are delegated to the host system, where
+optimizations can be made with global information. This delegation allows the
+sandbox to be highly dynamic in terms of resource usage: spanning a large number
+of cores and large amount of memory when busy, and yielding those resources back
+to the host when not.
+
+In order words, the shape of the sandbox should closely track the shape of the
+sandboxed process:
+
+![Resource model](resources.png "Workloads of different shapes.")
+
+## Processes
+
+Much like a Virtual Machine (VM), a gVisor sandbox appears as an opaque process
+on the system. Processes within the sandbox do not manifest as processes on the
+host system, and process-level interactions within the sandbox requires entering
+the sandbox (e.g. via a [Docker exec][exec]).
+
+## Networking
+
+The sandbox attaches a network endpoint to the system, but runs it's own network
+stack. All network resources, other than packets in flight on the host, exist
+only inside the sandbox, bound by relevant resource limits.
+
+You can interact with network endpoints exposed by the sandbox, just as you
+would any other container, but network introspection similarly requires entering
+the sandbox.
+
+## Files
+
+Files in the sandbox may be backed by different implementations. For host-native
+files (where a file descriptor is available), the Gofer may return a file
+descriptor to the Sentry via [SCM_RIGHTS][scmrights][^1].
+
+These files may be read from and written to through standard system calls, and
+also mapped into the associated application's address space. This allows the
+same host memory to be shared across multiple sandboxes, although this mechanism
+does not preclude the use of side-channels (see [Security Model](./security.md).
+
+Note that some file systems exist only within the context of the sandbox. For
+example, in many cases a `tmpfs` mount will be available at `/tmp` or
+`/dev/shm`, which allocates memory directly from the sandbox memory file (see
+below). Ultimately, these will be accounted against relevant limits in a similar
+way as the host native case.
+
+## Threads
+
+The Sentry models individual task threads with [goroutines][goroutine]. As a
+result, each task thread is a lightweight [green thread][greenthread], and may
+not correspond to an underlying host thread.
+
+However, application execution is modelled as a blocking system call with the
+Sentry. This means that additional host threads may be created, *depending on
+the number of active application threads*. In practice, a busy application will
+converge on the number of active threads, and the host will be able to make
+scheduling decisions about all application threads.
+
+## Time
+
+Time in the sandbox is provided by the Sentry, through its own [vDSO][vdso] and
+time-keeping implementation. This is distinct from the host time, and no state
+is shared with the host, although the time will be initialized with the host
+clock.
+
+The Sentry runs timers to note the passage of time, much like a kernel running
+on hardware (though the timers are software timers, in this case). These timers
+provide updates to the vDSO, the time returned through system calls, and the
+time recorded for usage or limit tracking (e.g. [RLIMIT_CPU][rlimit]).
+
+When all application threads are idle, the Sentry disables timers until an event
+occurs that wakes either the Sentry or an application thread, similar to a
+[tickless kernel][tickless]. This allows the Sentry to achieve near zero CPU
+usage for idle applications.
+
+## Memory
+
+The Sentry implements its own memory management, including demand-paging and a
+Sentry internal page cache for files that cannot be used natively. A single
+[memfd][memfd] backs all application memory.
+
+### Address spaces
+
+The creation of address spaces is platform-specific. For some platforms,
+additional "stub" processes may be created on the host in order to support
+additional address spaces. These stubs are subject to various limits applied at
+the sandbox level (e.g. PID limits).
+
+### Physical memory
+
+The host is able to manage physical memory using regular means (e.g. tracking
+working sets, reclaiming and swapping under pressure). The Sentry lazily
+populates host mappings for applications, and allow the host to demand-page
+those regions, which is critical for the functioning of those mechanisms.
+
+In order to avoid excessive overhead, the Sentry does not demand-page individual
+pages. Instead, it selects appropriate regions based on heuristics. There is a
+trade-off here: the Sentry is unable to trivially determine which pages are
+active and which are not. Even if pages were individually faulted, the host may
+select pages to be reclaimed or swapped without the Sentry's knowledge.
+
+Therefore, memory usage statistics within the sandbox (e.g. via `proc`) are
+approximations. The Sentry maintains an internal breakdown of memory usage, and
+can collect accurate information but only through a relatively expensive API
+call. In any case, it would likely be considered unwise to share precise
+information about how the host is managing memory with the sandbox.
+
+Finally, when an application marks a region of memory as no longer needed, for
+example via a call to [madvise][madvise], the Sentry *releases this memory back
+to the host*. There can be performance penalties for this, since it may be
+cheaper in many cases to retain the memory and use it to satisfy some other
+request. However, releasing it immediately to the host allows the host to more
+effectively multiplex resources and apply an efficient global policy.
+
+## Limits
+
+All Sentry threads and Sentry memory are subject to a container cgroup. However,
+application usage will not appear as anonymous memory usage, and will instead be
+accounted to the `memfd`. All anonymous memory will correspond to Sentry usage,
+and host memory charged to the container will work as standard.
+
+The cgroups can be monitored for standard signals: pressure indicators,
+threshold notifiers, etc. and can also be adjusted dynamically. Note that the
+Sentry itself may listen for pressure signals in its containing cgroup, in order
+to purge internal caches.
+
+[goroutine]: https://tour.golang.org/concurrency/1
+[greenthread]: https://en.wikipedia.org/wiki/Green_threads
+[scheduler]: https://morsmachine.dk/go-scheduler
+[vdso]: https://en.wikipedia.org/wiki/VDSO
+[rlimit]: http://man7.org/linux/man-pages/man2/getrlimit.2.html
+[tickless]: https://en.wikipedia.org/wiki/Tickless_kernel
+[memfd]: http://man7.org/linux/man-pages/man2/memfd_create.2.html
+[scmrights]: http://man7.org/linux/man-pages/man7/unix.7.html
+[madvise]: http://man7.org/linux/man-pages/man2/madvise.2.html
+[exec]: https://docs.docker.com/engine/reference/commandline/exec/
+[^1]: Unless host networking is enabled, the Sentry is not able to create or
+    open host file descriptors itself, it can only receive them in this way
+    from the Gofer.
diff --git a/g3doc/architecture_guide/resources.png b/g3doc/architecture_guide/resources.png
new file mode 100644
index 000000000..f715008ec
--- /dev/null
+++ b/g3doc/architecture_guide/resources.png
diff --git a/g3doc/architecture_guide/resources.svg b/g3doc/architecture_guide/resources.svg
new file mode 100644
index 000000000..fd7805d90
--- /dev/null
+++ b/g3doc/architecture_guide/resources.svg
@@ -0,0 +1,208 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="108.24417mm"
+   height="47.513165mm"
+   viewBox="0 0 108.24417 47.513165"
+   version="1.1"
+   id="svg8"
+   inkscape:export-filename="/home/ascannell/resources.png"
+   inkscape:export-xdpi="53.50127"
+   inkscape:export-ydpi="53.50127"
+   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
+   sodipodi:docname="resources.svg">
+  <defs
+     id="defs2" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.98994949"
+     inkscape:cx="16.897058"
+     inkscape:cy="41.261746"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:window-width="1920"
+     inkscape:window-height="1005"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1" />
+  <metadata
+     id="metadata5">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-36.081387,-118.50325)">
+    <rect
+       id="rect10"
+       width="33.408691"
+       height="33.408691"
+       x="36.081387"
+       y="120.06757"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <circle
+       style="fill:#44aa00;stroke-width:0.21849461"
+       id="path12"
+       cx="87.958534"
+       cy="136.63828"
+       r="17.105247" />
+    <path
+       sodipodi:type="star"
+       style="fill:#44aa00;stroke-width:0.26458332"
+       id="path14"
+       sodipodi:sides="3"
+       sodipodi:cx="124.13387"
+       sodipodi:cy="141.81859"
+       sodipodi:r1="23.31534"
+       sodipodi:r2="11.65767"
+       sodipodi:arg1="0.52359878"
+       sodipodi:arg2="1.5707963"
+       inkscape:flatsided="false"
+       inkscape:rounded="0"
+       inkscape:randomized="0"
+       d="m 144.32555,153.47626 -20.19168,0 -20.19167,0 10.09583,-17.48651 10.09584,-17.4865 10.09584,17.4865 z"
+       inkscape:transform-center-x="1.8384776e-06"
+       inkscape:transform-center-y="-5.8288369" />
+    <rect
+       style="fill:#b3b3b3;stroke-width:0.20817307"
+       id="rect16"
+       width="108.24416"
+       height="10.423517"
+       x="36.08139"
+       y="155.5929" />
+    <path
+       sodipodi:type="star"
+       style="fill:#ff8080;stroke-width:0.20018946"
+       id="path14-3"
+       sodipodi:sides="3"
+       sodipodi:cx="124.13387"
+       sodipodi:cy="139.31911"
+       sodipodi:r1="17.640888"
+       sodipodi:r2="8.8204451"
+       sodipodi:arg1="0.52359878"
+       sodipodi:arg2="1.5707963"
+       inkscape:flatsided="false"
+       inkscape:rounded="0"
+       inkscape:randomized="0"
+       d="m 139.41133,148.13955 -15.27746,0 -15.27745,0 7.63872,-13.23067 7.63873,-13.23066 7.63873,13.23066 z"
+       inkscape:transform-center-x="3.9117172e-06"
+       inkscape:transform-center-y="-4.4102243" />
+    <circle
+       style="fill:#ff8080;stroke-width:0.18094084"
+       id="path12-6"
+       cx="87.93705"
+       cy="134.75125"
+       r="14.165282" />
+    <rect
+       id="rect10-7"
+       width="30.52453"
+       height="25.657875"
+       x="37.416695"
+       y="121.65508"
+       style="fill:#ff8080;stroke-width:0.22163473" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="47.387276"
+       y="151.7626"
+       id="text65"><tspan
+         sodipodi:role="line"
+         id="tspan63"
+         x="47.387276"
+         y="151.7626"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="82.156319"
+       y="151.71547"
+       id="text65-5"><tspan
+         sodipodi:role="line"
+         id="tspan63-3"
+         x="82.156319"
+         y="151.71547"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="118.66879"
+       y="151.71547"
+       id="text65-5-5"><tspan
+         sodipodi:role="line"
+         id="tspan63-3-6"
+         x="118.66879"
+         y="151.71547"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="45.473087"
+       y="136.20644"
+       id="text123"><tspan
+         sodipodi:role="line"
+         id="tspan121"
+         x="45.473087"
+         y="136.20644"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="80.153076"
+       y="136.00925"
+       id="text123-1"><tspan
+         sodipodi:role="line"
+         id="tspan121-2"
+         x="80.153076"
+         y="136.00925"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="116.50173"
+       y="138.68195"
+       id="text123-1-7"><tspan
+         sodipodi:role="line"
+         id="tspan121-2-0"
+         x="116.50173"
+         y="138.68195"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:6.43922186px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.16098055"
+       x="81.893562"
+       y="163.15665"
+       id="text163"><tspan
+         sodipodi:role="line"
+         id="tspan161"
+         x="81.893562"
+         y="163.15665"
+         style="stroke-width:0.16098055">host</tspan></text>
+  </g>
+</svg>
diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md
new file mode 100644
index 000000000..b99b86332
--- /dev/null
+++ b/g3doc/architecture_guide/security.md
@@ -0,0 +1,255 @@
+# Security Model
+
+[TOC]
+
+gVisor was created in order to provide additional defense against the
+exploitation of kernel bugs by untrusted userspace code. In order to understand
+how gVisor achieves this goal, it is first necessary to understand the basic
+threat model.
+
+## Threats: The Anatomy of an Exploit
+
+An exploit takes advantage of a software or hardware bug in order to escalate
+privileges, gain access to privileged data, or disrupt services. All of the
+possible interactions that a malicious application can have with the rest of the
+system (attack vectors) define the attack surface. We categorize these attack
+vectors into several common classes.
+
+### System API
+
+An operating system or hypervisor exposes an abstract System API in the form of
+system calls and traps. This API may be documented and stable, as with Linux, or
+it may be abstracted behind a library, as with Windows (i.e. win32.dll or
+ntdll.dll). The System API includes all standard interfaces that application
+code uses to interact with the system. This includes high-level abstractions
+that are derived from low-level system calls, such as system files, sockets and
+namespaces.
+
+Although the System API is exposed to applications by design, bugs and race
+conditions within the kernel or hypervisor may occasionally be exploitable via
+the API. This is common in part due to the fact that most kernels and
+hypervisors are written in [C][clang], which is well-suited to interfacing with
+hardware but often prone to security issues. In order to exploit these issues, a
+typical attack might involve some combination of the following:
+
+1.  Opening or creating some combination of files, sockets or other descriptors.
+1.  Passing crafted, malicious arguments, structures or packets.
+1.  Racing with multiple threads in order to hit specific code paths.
+
+For example, for the [Dirty Cow][dirtycow] privilege escalation bug, an
+application would open a specific file in `/proc` or use a specific `ptrace`
+system call, and use multiple threads in order to trigger a race condition when
+touching a fresh page of memory. The attacker then gains control over a page of
+memory belonging to the system. With additional privileges or access to
+privileged data in the kernel, an attacker will often be able to employ
+additional techniques to gain full access to the rest of the system.
+
+While bugs in the implementation of the System API are readily fixed, they are
+also the most common form of exploit. The exposure created by this class of
+exploit is what gVisor aims to minimize and control, described in detail below.
+
+### System ABI
+
+Hardware and software exploits occasionally exist in execution paths that are
+not part of an intended System API. In this case, exploits may be found as part
+of implicit actions the hardware or privileged system code takes in response to
+certain events, such as traps or interrupts. For example, the recent
+[POPSS][popss] flaw required only native code execution (no specific system call
+or file access). In that case, the Xen hypervisor was similarly vulnerable,
+highlighting that hypervisors are not immune to this vector.
+
+### Side Channels
+
+Hardware side channels may be exploitable by any code running on a system:
+native, sandboxed, or virtualized. However, many host-level mitigations against
+hardware side channels are still effective with a sandbox. For example, kernels
+built with retpoline protect against some speculative execution attacks
+(Spectre) and frame poisoning may protect against L1 terminal fault (L1TF)
+attacks. Hypervisors may introduce additional complications in this regard, as
+there is no mitigation against an application in a normally functioning Virtual
+Machine (VM) exploiting the L1TF vulnerability for another VM on the sibling
+hyperthread.
+
+### Other Vectors
+
+The above categories in no way represent an exhaustive list of exploits, as we
+focus only on running untrusted code from within the operating system or
+hypervisor. We do not consider other ways that a more generic adversary may
+interact with a system, such as inserting a portable storage device with a
+malicious filesystem image, using a combination of crafted keyboard or touch
+inputs, or saturating a network device with ill-formed packets.
+
+Furthermore, high-level systems may contain exploitable components. An attacker
+need not escalate privileges within a container if there’s an exploitable
+network-accessible service on the host or some other API path. *A sandbox is not
+a substitute for a secure architecture*.
+
+## Goals: Limiting Exposure
+
+![Threat model](security.png "Threat model.")
+
+gVisor’s primary design goal is to minimize the System API attack vector through
+multiple layers of defense, while still providing a process model. There are two
+primary security principles that inform this design. First, the application’s
+direct interactions with the host System API are intercepted by the Sentry,
+which implements the System API instead. Second, the System API accessible to
+the Sentry itself is minimized to a safer, restricted set. The first principle
+minimizes the possibility of direct exploitation of the host System API by
+applications, and the second principle minimizes indirect exploitability, which
+is the exploitation by an exploited or buggy Sentry (e.g. chaining an exploit).
+
+The first principle is similar to the security basis for a Virtual Machine (VM).
+With a VM, an application’s interactions with the host are replaced by
+interactions with a guest operating system and a set of virtualized hardware
+devices. These hardware devices are then implemented via the host System API by
+a Virtual Machine Monitor (VMM). The Sentry similarly prevents direct
+interactions by providing its own implementation of the System API that the
+application must interact with. Applications are not able to to directly craft
+specific arguments or flags for the host System API, or interact directly with
+host primitives.
+
+For both the Sentry and a VMM, it’s worth noting that while direct interactions
+are not possible, indirect interactions are still possible. For example, a read
+on a host-backed file in the Sentry may ultimately result in a host read system
+call (made by the Sentry, not by passing through arguments from the
+application), similar to how a read on a block device in a VM may result in the
+VMM issuing a corresponding host read system call from a backing file.
+
+An important distinction from a VM is that the Sentry implements a System API
+based directly on host System API primitives instead of relying on virtualized
+hardware and a guest operating system. This selects a distinct set of
+trade-offs, largely in the performance, efficiency and compatibility domains.
+Since transitions in and out of the sandbox are relatively expensive, a guest
+operating system will typically take ownership of resources. For example, in the
+above case, the guest operating system may read the block device data in a local
+page cache, to avoid subsequent reads. This may lead to better performance but
+lower efficiency, since memory may be wasted or duplicated. The Sentry opts
+instead to defer to the host for many operations during runtime, for improved
+efficiency but lower performance in some use cases.
+
+### What can a sandbox do?
+
+An application in a gVisor sandbox is permitted to do most things a standard
+container can do: for example, applications can read and write files mapped
+within the container, make network connections, etc. As described above,
+gVisor's primary goal is to limit exposure to bugs and exploits while still
+allowing most applications to run. Even so, gVisor will limit some operations
+that might be permitted with a standard container. Even with appropriate
+capabilities, a user in a gVisor sandbox will only be able to manipulate
+virtualized system resources (e.g. the system time, kernel settings or
+filesystem attributes) and not underlying host system resources.
+
+While the sandbox virtualizes many operations for the application, we limit the
+sandbox's own interactions with the host to the following high-level operations:
+
+1.  Communicate with a Gofer process via a connected socket. The sandbox may
+    receive new file descriptors from the Gofer process, corresponding to opened
+    files. These files can then be read from and written to by the sandbox.
+1.  Make a minimal set of host system calls. The calls do not include the
+    creation of new sockets (unless host networking mode is enabled) or opening
+    files. The calls include duplication and closing of file descriptors,
+    synchronization, timers and signal management.
+1.  Read and write packets to a virtual ethernet device. This is not required if
+    host networking is enabled (or networking is disabled).
+
+### System ABI, Side Channels and Other Vectors
+
+gVisor relies on the host operating system and the platform for defense against
+hardware-based attacks. Given the nature of these vulnerabilities, there is
+little defense that gVisor can provide (there’s no guarantee that additional
+hardware measures, such as virtualization, memory encryption, etc. would
+actually decrease the attack surface). Note that this is true even when using
+hardware virtualization for acceleration, as the host kernel or hypervisor is
+ultimately responsible for defending against attacks from within malicious
+guests.
+
+gVisor similarly relies on the host resource mechanisms (cgroups) for defense
+against resource exhaustion and denial of service attacks. Network policy
+controls should be applied at the container level to ensure appropriate network
+policy enforcement. Note that the sandbox itself is not capable of altering or
+configuring these mechanisms, and the sandbox itself should make an attacker
+less likely to exploit or override these controls through other means.
+
+## Principles: Defense-in-Depth
+
+For gVisor development, there are several engineering principles that are
+employed in order to ensure that the system meets its design goals.
+
+1.  No system call is passed through directly to the host. Every supported call
+    has an independent implementation in the Sentry, that is unlikely to suffer
+    from identical vulnerabilities that may appear in the host. This has the
+    consequence that all kernel features used by applications require an
+    implementation within the Sentry.
+1.  Only common, universal functionality is implemented. Some filesystems,
+    network devices or modules may expose specialized functionality to user
+    space applications via mechanisms such as extended attributes, raw sockets
+    or ioctls. Since the Sentry is responsible for implementing the full system
+    call surface, we do not implement or pass through these specialized APIs.
+1.  The host surface exposed to the Sentry is minimized. While the system call
+    surface is not trivial, it is explicitly enumerated and controlled. The
+    Sentry is not permitted to open new files, create new sockets or do many
+    other interesting things on the host.
+
+Additionally, we have practical restrictions that are imposed on the project to
+minimize the risk of Sentry exploitability. For example:
+
+1.  Unsafe code is carefully controlled. All unsafe code is isolated in files
+    that end with "unsafe.go", in order to facilitate validation and auditing.
+    No file without the unsafe suffix may import the unsafe package.
+1.  No CGo is allowed. The Sentry must be a pure Go binary.
+1.  External imports are not generally allowed within the core packages. Only
+    limited external imports are used within the setup code. The code available
+    inside the Sentry is carefully controlled, to ensure that the above rules
+    are effective.
+
+Finally, we recognize that security is a process, and that vigilance is
+critical. Beyond our security disclosure process, the Sentry is fuzzed
+continuously to identify potential bugs and races proactively, and production
+crashes are recorded and triaged to similarly identify material issues.
+
+## FAQ
+
+### Is this more or less secure than a Virtual Machine?
+
+The security of a VM depends to a large extent on what is exposed from the host
+kernel and userspace support code. For example, device emulation code in the
+host kernel (e.g. APIC) or optimizations (e.g. vhost) can be more complex than a
+simple system call, and exploits carry the same risks. Similarly, the userspace
+support code is frequently unsandboxed, and exploits, while rare, may allow
+unfettered access to the system.
+
+Some platforms leverage the same virtualization hardware as VMs in order to
+provide better system call interception performance. However, gVisor does not
+implement any device emulation, and instead opts to use a sandboxed host System
+API directly. Both approaches significantly reduce the original attack surface.
+Ultimately, since gVisor is capable of using the same hardware mechanism, one
+should not assume that the mere use of virtualization hardware makes a system
+more or less secure, just as it would be a mistake to make the claim that the
+use of a unibody alone makes a car safe.
+
+### Does this stop hardware side channels?
+
+In general, gVisor does not provide protection against hardware side channels,
+although it may make exploits that rely on direct access to the host System API
+more difficult to use. To minimize exposure, you should follow relevant guidance
+from vendors and keep your host kernel and firmware up-to-date.
+
+### Is this just a ptrace sandbox?
+
+No: the term “ptrace sandbox” generally refers to software that uses the Linux
+ptrace facility to inspect and authorize system calls made by applications,
+enforcing a specific policy. These commonly suffer from two issues. First,
+vulnerable system calls may be authorized by the sandbox, as the application
+still has direct access to some System API. Second, it’s impossible to avoid
+time-of-check, time-of-use race conditions without disabling multi-threading.
+
+In gVisor, the platforms that use ptrace operate differently. The stubs that are
+traced are never allowed to continue execution into the host kernel and complete
+a call directly. Instead, all system calls are interpreted and handled by the
+Sentry itself, who reflects resulting register state back into the tracee before
+continuing execution in userspace. This is very similar to the mechanism used by
+User-Mode Linux (UML).
+
+[dirtycow]: https://en.wikipedia.org/wiki/Dirty_COW
+[clang]: https://en.wikipedia.org/wiki/C_(programming_language)
+[popss]: https://nvd.nist.gov/vuln/detail/CVE-2018-8897
diff --git a/g3doc/architecture_guide/security.png b/g3doc/architecture_guide/security.png
new file mode 100644
index 000000000..c29befbf6
--- /dev/null
+++ b/g3doc/architecture_guide/security.png
diff --git a/g3doc/architecture_guide/security.svg b/g3doc/architecture_guide/security.svg
new file mode 100644
index 000000000..0575e2dec
--- /dev/null
+++ b/g3doc/architecture_guide/security.svg
@@ -0,0 +1,153 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="92.963379mm"
+   height="107.18885mm"
+   viewBox="0 0 92.963379 107.18885"
+   version="1.1"
+   id="svg8"
+   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
+   sodipodi:docname="defense.svg">
+  <defs
+     id="defs2" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.98994949"
+     inkscape:cx="-242.99254"
+     inkscape:cy="136.90181"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer4"
+     showgrid="false"
+     inkscape:object-nodes="true"
+     inkscape:window-width="1920"
+     inkscape:window-height="1005"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0" />
+  <metadata
+     id="metadata5">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:groupmode="layer"
+     id="layer2"
+     inkscape:label="Layer 2"
+     transform="translate(-61.112559,-78.160466)">
+    <g
+       id="g4644"
+       style="fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576"
+       transform="matrix(1,0,0,-1,2.138671,277.94235)">
+      <path
+         transform="scale(0.26458333)"
+         inkscape:connector-curvature="0"
+         style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:3.77952766;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576"
+         d="M 398.57227,351.84766 224.7832,452.18359 398.57227,552.51953 572.35938,452.18359 Z"
+         id="path4638" />
+      <path
+         inkscape:connector-curvature="0"
+         style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:3.77952766;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576"
+         d="M 572.35938,452.18359 398.57227,552.51953 V 753.19141 L 572.35938,652.85547 Z"
+         transform="scale(0.26458333)"
+         id="path4640" />
+      <path
+         id="path4642"
+         d="m 59.473888,119.64024 45.981172,26.54722 v 53.09443 L 59.473888,172.73467 Z"
+         style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576"
+         inkscape:connector-curvature="0" />
+    </g>
+  </g>
+  <g
+     inkscape:groupmode="layer"
+     id="layer3"
+     inkscape:label="Layer 3"
+     transform="translate(-61.112559,-78.160466)">
+    <g
+       id="g4554"
+       transform="matrix(-0.39771468,0.69855937,-0.69855937,-0.39771468,366.58103,126.65261)">
+      <g
+         id="g4662"
+         transform="translate(59.46839,130.66062)">
+        <path
+           inkscape:connector-curvature="0"
+           id="path4548"
+           transform="scale(0.26458333)"
+           d="M 398.57227,351.84766 224.7832,452.18359 398.57227,552.51953 572.35938,452.18359 Z"
+           style="opacity:1;fill:#0066ff;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:4.70182848;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+        <path
+           inkscape:connector-curvature="0"
+           id="path4550"
+           transform="scale(0.26458333)"
+           d="M 572.35938,452.18359 398.57227,552.51953 V 753.19141 L 572.35938,652.85547 Z"
+           style="opacity:1;fill:#0044aa;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:4.29276943;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+        <path
+           inkscape:connector-curvature="0"
+           style="opacity:1;fill:#5599ff;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:1.24402535;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+           d="m 59.473888,119.64024 45.981172,26.54722 v 53.09443 L 59.473888,172.73467 Z"
+           id="path4552" />
+      </g>
+    </g>
+  </g>
+  <g
+     inkscape:groupmode="layer"
+     id="layer4"
+     inkscape:label="Layer 4"
+     transform="translate(-61.112559,-78.160466)">
+    <path
+       style="fill:#e000ae;fill-opacity:1;stroke-width:0.12476727"
+       d="m 84.610811,107.36071 v 2.55773 2.55772 h 2.49535 2.49534 v -2.55772 -2.55773 h -2.49534 z m 40.674129,0 v 2.55773 2.55772 h 2.49535 2.49534 v -2.55772 -2.55773 h -2.49534 z m -35.558669,5.11545 v 2.55773 2.55773 h 2.49535 2.49534 v -2.55773 -2.55773 h -2.49534 z m 4.99069,5.11546 v 2.55773 2.55773 h -2.49534 -2.49535 v 2.49534 2.49535 h -2.55773 -2.55773 v 2.55773 2.55773 h -2.55773 -2.55773 v 10.16853 10.16853 h 2.55773 2.55773 v -7.67562 -7.67587 l 2.52654,0.0339 2.52654,0.0336 0.0327,5.08427 0.0327,5.08426 h 2.49388 2.49388 v 2.55919 2.5592 l 5.08427,-0.0327 5.084269,-0.0326 v -2.49534 -2.49535 l -5.084269,-0.0324 -5.08427,-0.0327 v -2.55626 -2.55651 h 12.726269 12.72626 v 2.55651 2.55626 l -5.05868,0.0327 -5.05893,0.0324 v 2.49535 2.49534 l 5.05893,0.0326 5.05868,0.0327 v -2.55919 -2.55919 h 2.49388 2.49413 l 0.0324,-5.08426 0.0327,-5.08427 2.52653,-0.0336 2.52654,-0.0339 v 7.67586 7.67563 h 2.55773 2.55773 v -10.16854 -10.16853 h -2.55773 -2.55773 v -2.55773 -2.55773 h -2.55773 -2.55773 v -2.49535 -2.49534 h -2.49535 -2.49534 v -2.55773 -2.55773 h -2.55773 -2.55773 v 2.55773 2.55773 h -7.6108 -7.610809 v -2.55773 -2.55773 h -2.55774 z m 25.452519,0 h 2.49535 2.49535 v -2.55773 -2.55773 h -2.49535 -2.49535 v 2.55773 z m -25.452519,10.10615 h 5.11546 5.115459 v 2.55773 2.55773 h -5.115459 -5.11546 v -2.55773 z m 15.221609,0 h 5.11546 5.11545 v 2.55773 2.55773 h -5.11545 -5.11546 v -2.55773 z"
+       id="path4732"
+       inkscape:connector-curvature="0" />
+  </g>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     style="display:inline"
+     transform="translate(-61.112559,-78.160466)">
+    <g
+       transform="translate(-131.49557,42.495842)"
+       style="fill:#007200;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       id="g4628">
+      <path
+         id="path4529"
+         d="m 239.09034,36.164616 -45.98169,26.547215 45.98169,26.547217 45.98117,-26.547217 z"
+         style="opacity:1;fill:#4aba19;fill-opacity:0.34509804;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+         inkscape:connector-curvature="0" />
+      <path
+         id="path4531"
+         d="m 285.07151,62.711828 -45.98117,26.54722 v 53.094432 l 45.98117,-26.54722 z"
+         style="opacity:1;fill:#007900;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+         inkscape:connector-curvature="0" />
+      <path
+         inkscape:connector-curvature="0"
+         style="opacity:1;fill:#003d00;fill-opacity:0.34509804;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+         d="m 193.10865,62.711831 45.98117,26.54722 v 53.094429 l -45.98117,-26.54722 z"
+         id="path4541" />
+    </g>
+  </g>
+</svg>
diff --git a/g3doc/community.md b/g3doc/community.md
new file mode 100644
index 000000000..76f4d87c3
--- /dev/null
+++ b/g3doc/community.md
@@ -0,0 +1,31 @@
+# Participation
+
+To contribute code, please read the [contributing guide](../CONTRIBUTING.md).
+
+Please note that the [Code of Conduct](../CODE_OF_CONDUCT.md) applies to
+community forums as well as technical participation.
+
+## Communication channels
+
+The project maintains two mailing lists:
+
+*   [gvisor-users][gvisor-users] for accouncements and general discussion.
+*   [gvisor-dev][gvisor-dev] for development and contribution.
+
+We also have a [chat room hosted on Gitter][gitter-chat].
+
+We'd love to hear from you!
+
+## Community meetings
+
+The community calendar shows upcoming public meetings and opportunities to
+collaborate or discuss the project. Meetings are planned and announced ahead of
+time via the [gvisor-users][gvisor-users] mailing list.
+
+These meetings are public: anyone can join.
+
+<iframe src="https://calendar.google.com/calendar/b/1/embed?showTitle=0&amp;height=600&amp;wkst=1&amp;bgcolor=%23FFFFFF&amp;src=bd6f4k210u3ukmlj9b8vl053fk%40group.calendar.google.com&amp;color=%23AB8B00&amp;ctz=America%2FLos_Angeles" style="border-width:0" width="600" height="400" frameborder="0" scrolling="no"></iframe>
+
+[gitter-chat]: https://gitter.im/gvisor/community
+[gvisor-dev]: https://groups.google.com/forum/#!forum/gvisor-dev
+[gvisor-users]: https://groups.google.com/forum/#!forum/gvisor-users
diff --git a/g3doc/logo.txt b/g3doc/logo.txt
new file mode 100644
index 000000000..92f9cad5f
--- /dev/null
+++ b/g3doc/logo.txt
@@ -0,0 +1 @@
+The gVisor logo files are licensed under CC BY-SA 4.0 (Creative Commons Attribution-ShareAlike 4.0 International).
diff --git a/g3doc/roadmap.md b/g3doc/roadmap.md
new file mode 100644
index 000000000..06ea25a8b
--- /dev/null
+++ b/g3doc/roadmap.md
@@ -0,0 +1,49 @@
+# Roadmap
+
+gVisor [GitHub Issues][issues] serve as the source-of-truth for most work in
+flight. Specific performance and compatibility issues are generally tracked
+there. [GitHub Milestones][milestones] may be used to track larger features that
+span many issues. However, labels are also used to aggregate cross-cutting
+feature work.
+
+## Core Improvements
+
+Most gVisor work is focused on four areas.
+
+*   [Performance][performance]: overall sandbox performance, including platform
+    performance, is a critical area for investment. This includes: network
+    performance (throughput and latency), file system performance (metadata and
+    data I/O), application switch and fault costs, etc. The goal of gVisor is to
+    provide sandboxing without a material performance or efficiency impact on
+    all but the most performance-sensitive applications.
+
+*   [Compatibility][compatibility]: supporting a wide range of applications
+    requires supporting a large system API, including special system files (e.g.
+    proc, sys, dev, etc.). The goal of gVisor is to support the broad set of
+    applications that depend on a generic Linux API, rather than a specific
+    kernel version.
+
+*   [Infrastructure & tooling][infrastructure]: the above goals require
+    aggressive testing and coverage, and well-established processes. This
+    includes adding appropriate system call coverage, end-to-end suites and
+    runtime tests.
+
+*   [Integration][integration]: Container infrastructure is evolving rapidly and
+    becoming more complex, and gVisor must continuously implement relevant and
+    popular features to ensure that integration points remain robust and
+    feature-complete while preserving security guarantees.
+
+## Releases
+
+Releases are available on [GitHub][releases].
+
+As a convenience, binary packages are also published. Instructions for their use
+are available via the [Installation instructions](./user_guide/install.md).
+
+[issues]: https://github.com/google/gvisor/issues
+[milestones]: https://github.com/google/gvisor/milestones
+[releases]: https://github.com/google/gvisor/releases
+[performance]: https://github.com/google/gvisor/issues?q=is%3Aopen+is%3Aissue+label%3A%22area%3A+performance%22
+[integration]: https://github.com/google/gvisor/issues?q=is%3Aopen+is%3Aissue+label%3A%22area%3A+integration%22
+[compatibility]: https://github.com/google/gvisor/issues?q=is%3Aopen+is%3Aissue+label%3A%22area%3A+compatibility%22
+[infrastructure]: https://github.com/google/gvisor/issues?q=is%3Aopen+is%3Aissue+label%3A%22area%3A+tooling%22
diff --git a/g3doc/user_guide/BUILD b/g3doc/user_guide/BUILD
new file mode 100644
index 000000000..5568e1ba4
--- /dev/null
+++ b/g3doc/user_guide/BUILD
@@ -0,0 +1,70 @@
+load("//website:defs.bzl", "doc")
+
+package(
+    default_visibility = ["//website:__pkg__"],
+    licenses = ["notice"],
+)
+
+doc(
+    name = "compatibility",
+    src = "compatibility.md",
+    category = "Compatibility",
+    permalink = "/docs/user_guide/compatibility/",
+    weight = "0",
+)
+
+doc(
+    name = "checkpoint_restore",
+    src = "checkpoint_restore.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/checkpoint_restore/",
+    weight = "60",
+)
+
+doc(
+    name = "debugging",
+    src = "debugging.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/debugging/",
+    weight = "70",
+)
+
+doc(
+    name = "FAQ",
+    src = "FAQ.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/FAQ/",
+    weight = "90",
+)
+
+doc(
+    name = "filesystem",
+    src = "filesystem.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/filesystem/",
+    weight = "40",
+)
+
+doc(
+    name = "networking",
+    src = "networking.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/networking/",
+    weight = "50",
+)
+
+doc(
+    name = "install",
+    src = "install.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/install/",
+    weight = "10",
+)
+
+doc(
+    name = "platforms",
+    src = "platforms.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/platforms/",
+    weight = "30",
+)
diff --git a/g3doc/user_guide/FAQ.md b/g3doc/user_guide/FAQ.md
new file mode 100644
index 000000000..89df65e99
--- /dev/null
+++ b/g3doc/user_guide/FAQ.md
@@ -0,0 +1,122 @@
+# FAQ
+
+[TOC]
+
+### What operating systems are supported? {#supported-os}
+
+Today, gVisor requires Linux.
+
+### What CPU architectures are supported? {#supported-cpus}
+
+gVisor currently supports [x86_64/AMD64](https://en.wikipedia.org/wiki/X86-64)
+compatible processors. Preliminary support is also available for
+[ARM64](https://en.wikipedia.org/wiki/ARM_architecture#AArch64).
+
+### Do I need to modify my Linux application to use gVisor? {#modify-app}
+
+No. gVisor is capable of running unmodified Linux binaries.
+
+### What binary formats does gVisor support? {#supported-binaries}
+
+gVisor supports Linux
+[ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) binaries.
+
+Binaries run in gVisor should be built for the
+[AMD64](https://en.wikipedia.org/wiki/X86-64) or
+[AArch64](https://en.wikipedia.org/wiki/ARM_architecture#AArch64) CPU
+architectures.
+
+### Can I run Docker images using gVisor? {#docker-images}
+
+Yes. Please see the [Docker Quick Start][docker].
+
+### Can I run Kubernetes pods using gVisor? {#k8s-pods}
+
+Yes. Please see the [Kubernetes Quick Start][k8s].
+
+### What's the security model? {#security-model}
+
+See the [Security Model][security-model].
+
+## Troubleshooting
+
+### My container runs fine with `runc` but fails with `runsc` {#app-compatibility}
+
+If you’re having problems running a container with `runsc` it’s most likely due
+to a compatibility issue or a missing feature in gVisor. See
+[Debugging][debugging].
+
+### When I run my container, docker fails with: `open /run/containerd/.../<containerid>/log.json: no such file or directory` {#memfd-create}
+
+You are using an older version of Linux which doesn't support `memfd_create`.
+
+This is tracked in [bug #268](https://gvisor.dev/issue/268).
+
+### When I run my container, docker fails with: `flag provided but not defined: -console` {#old-docker}
+
+You're using an old version of Docker. See [Docker Quick Start][docker].
+
+### I can’t see a file copied with: `docker cp` {#fs-cache}
+
+For performance reasons, gVisor caches directory contents, and therefore it may
+not realize a new file was copied to a given directory. To invalidate the cache
+and force a refresh, create a file under the directory in question and list the
+contents again.
+
+As a workaround, shared root filesystem can be enabled. See
+[Filesystem][filesystem].
+
+This bug is tracked in [bug #4](https://gvisor.dev/issue/4).
+
+Note that `kubectl cp` works because it does the copy by exec'ing inside the
+sandbox, and thus gVisor's internal cache is made aware of the new files and
+directories.
+
+### I'm getting an error like: `panic: unable to attach: operation not permitted` or `fork/exec /proc/self/exe: invalid argument: unknown` {#runsc-perms}
+
+Make sure that permissions and the owner is correct on the `runsc` binary.
+
+```bash
+sudo chown root:root /usr/local/bin/runsc
+sudo chmod 0755 /usr/local/bin/runsc
+```
+
+### I'm getting an error like `mount submount "/etc/hostname": creating mount with source ".../hostname": input/output error: unknown.` {#memlock}
+
+There is a bug in Linux kernel versions 5.1 to 5.3.15, 5.4.2, and 5.5. Upgrade
+to a newer kernel or add the following to
+`/lib/systemd/system/containerd.service` as a workaround.
+
+```
+LimitMEMLOCK=infinity
+```
+
+And run `systemctl daemon-reload && systemctl restart containerd` to restart
+containerd.
+
+See [issue #1765](https://gvisor.dev/issue/1765) for more details.
+
+### My container cannot resolve another container's name when using Docker user defined bridge {#docker-bridge}
+
+This is normally indicated by errors like `bad address 'container-name'` when
+trying to communicate to another container in the same network.
+
+Docker user defined bridge uses an embedded DNS server bound to the loopback
+interface on address 127.0.0.10. This requires access to the host network in
+order to communicate to the DNS server. runsc network is isolated from the host
+and cannot access the DNS server on the host network without breaking the
+sandbox isolation. There are a few different workarounds you can try:
+
+*   Use default bridge network with `--link` to connect containers. Default
+    bridge doesn't use embedded DNS.
+*   Use [`--network=host`][host-net] option in runsc, however beware that it
+    will use the host network stack and is less secure.
+*   Use IPs instead of container names.
+*   Use [Kubernetes][k8s]. Container name lookup works fine in Kubernetes.
+
+[security-model]: /docs/architecture_guide/security/
+[host-net]: /docs/user_guide/networking/#network-passthrough
+[debugging]: /docs/user_guide/debugging/
+[filesystem]: /docs/user_guide/filesystem/
+[docker]: /docs/user_guide/quick_start/docker/
+[k8s]: /docs/user_guide/quick_start/kubernetes/
diff --git a/g3doc/user_guide/checkpoint_restore.md b/g3doc/user_guide/checkpoint_restore.md
new file mode 100644
index 000000000..0ab0911b0
--- /dev/null
+++ b/g3doc/user_guide/checkpoint_restore.md
@@ -0,0 +1,101 @@
+# Checkpoint/Restore
+
+[TOC]
+
+gVisor has the ability to checkpoint a process, save its current state in a
+state file, and restore into a new container using the state file.
+
+## How to use checkpoint/restore
+
+Checkpoint/restore functionality is currently available via raw `runsc`
+commands. To use the checkpoint command, first run a container.
+
+```bash
+runsc run <container id>
+```
+
+To checkpoint the container, the `--image-path` flag must be provided. This is
+the directory path within which the checkpoint state-file will be created. The
+file will be called `checkpoint.img` and necessary directories will be created
+if they do not yet exist.
+
+> Note: Two checkpoints cannot be saved to the same directory; every image-path
+> provided must be unique.
+
+```bash
+runsc checkpoint --image-path=<path> <container id>
+```
+
+There is also an optional `--leave-running` flag that allows the container to
+continue to run after the checkpoint has been made. (By default, containers stop
+their processes after committing a checkpoint.)
+
+> Note: All top-level runsc flags needed when calling run must be provided to
+> checkpoint if --leave-running is used.
+
+> Note: --leave-running functions by causing an immediate restore so the
+> container, although will maintain its given container id, may have a different
+> process id.
+
+```bash
+runsc checkpoint --image-path=<path> --leave-running <container id>
+```
+
+To restore, provide the image path to the `checkpoint.img` file created during
+the checkpoint. Because containers stop by default after checkpointing, restore
+needs to happen in a new container (restore is a command which parallels start).
+
+```bash
+runsc create <container id>
+
+runsc restore --image-path=<path> <container id>
+```
+
+## How to use checkpoint/restore in Docker:
+
+Currently checkpoint/restore through `runsc` is not entirely compatible with
+Docker, although there has been progress made from both gVisor and Docker to
+enable compatibility. Here, we document the ideal workflow.
+
+Run a container:
+
+```bash
+docker run [options] --runtime=runsc <image>`
+```
+
+Checkpoint a container:
+
+```bash
+docker checkpoint create <container> <checkpoint_name>`
+```
+
+Create a new container into which to restore:
+
+```bash
+docker create [options] --runtime=runsc <image>
+```
+
+Restore a container:
+
+```bash
+docker start --checkpoint --checkpoint-dir=<directory> <container>
+```
+
+### Issues Preventing Compatibility with Docker
+
+-   **[Moby #37360][leave-running]:** Docker version 18.03.0-ce and earlier
+    hangs when checkpointing and does not create the checkpoint. To successfully
+    use this feature, install a custom version of docker-ce from the moby
+    repository. This issue is caused by an improper implementation of the
+    `--leave-running` flag. This issue is fixed in newer releases.
+-   **Docker does not support restoration into new containers:** Docker
+    currently expects the container which created the checkpoint to be the same
+    container used to restore which is not possible in runsc. When Docker
+    supports container migration and therefore restoration into new containers,
+    this will be the flow.
+-   **[Moby #37344][checkpoint-dir]:** Docker does not currently support the
+    `--checkpoint-dir` flag but this will be required when restoring from a
+    checkpoint made in another container.
+
+[leave-running]: https://github.com/moby/moby/pull/37360
+[checkpoint-dir]: https://github.com/moby/moby/issues/37344
diff --git a/g3doc/user_guide/compatibility.md b/g3doc/user_guide/compatibility.md
new file mode 100644
index 000000000..9d3e3680f
--- /dev/null
+++ b/g3doc/user_guide/compatibility.md
@@ -0,0 +1,93 @@
+# Applications
+
+[TOC]
+
+gVisor implements a large portion of the Linux surface and while we strive to
+make it broadly compatible, there are (and always will be) unimplemented
+features and bugs. The only real way to know if it will work is to try. If you
+find a container that doesn’t work and there is no known issue, please
+[file a bug][bug] indicating the full command you used to run the image. You can
+view open issues related to compatibility [here][issues].
+
+If you're able to provide the [debug logs](../debugging/), the problem likely to
+be fixed much faster.
+
+## What works?
+
+The following applications/images have been tested:
+
+*   elasticsearch
+*   golang
+*   httpd
+*   java8
+*   jenkins
+*   mariadb
+*   memcached
+*   mongo
+*   mysql
+*   nginx
+*   node
+*   php
+*   postgres
+*   prometheus
+*   python
+*   redis
+*   registry
+*   tomcat
+*   wordpress
+
+## Utilities
+
+Most common utilities work. Note that:
+
+*   Some tools, such as `tcpdump` and old versions of `ping`, require explicitly
+    enabling raw sockets via the unsafe `--net-raw` runsc flag.
+*   Different Docker images can behave differently. For example, Alpine Linux
+    and Ubuntu have different `ip` binaries.
+
+    Specific tools include:
+
+<!-- mdformat off(don't wrap the table) -->
+
+| Tool     | Status                                    |
+|:--------:|:-----------------------------------------:|
+| apt-get  | Working.                                  |
+| bundle   | Working.                                  |
+| cat      | Working.                                  |
+| curl     | Working.                                  |
+| dd       | Working.                                  |
+| df       | Working.                                  |
+| dig      | Working.                                  |
+| drill    | Working.                                  |
+| env      | Working.                                  |
+| find     | Working.                                  |
+| gdb      | Working.                                  |
+| gosu     | Working.                                  |
+| grep     | Working (unless stdin is a pipe and stdout is /dev/null). |
+| ifconfig | Works partially, like ip. Full support [in progress](https://gvisor.dev/issue/578). |
+| ip       | Some subcommands work (e.g. addr, route). Full support [in progress](https://gvisor.dev/issue/578). |
+| less     | Working.                                  |
+| ls       | Working.                                  |
+| lsof     | Working.                                  |
+| mount    | Works in readonly mode. gVisor doesn't currently support creating new mounts at runtime. |
+| nc       | Working.                                  |
+| nmap     | Not working.                              |
+| netstat  | [In progress](https://gvisor.dev/issue/2112). |
+| nslookup | Working.                                  |
+| ping     | Working.                                  |
+| ps       | Working.                                  |
+| route    | Working.                                  |
+| ss       | [In progress](https://gvisor.dev/issue/2114). |
+| sshd     | Partially working. Job control [in progress](https://gvisor.dev/issue/154). |
+| strace   | Working.                                  |
+| tar      | Working.                                  |
+| tcpdump  | [In progress](https://gvisor.dev/issue/173). |
+| top      | Working.                                  |
+| uptime   | Working.                                  |
+| vim      | Working.                                  |
+| wget     | Working.                                  |
+
+<!-- mdformat on -->
+
+[bug]: https://github.com/google/gvisor/issues/new?title=Compatibility%20Issue:
+[issues]: https://github.com/google/gvisor/issues?q=is%3Aissue+is%3Aopen+label%3A%22area%3A+compatibility%22
diff --git a/g3doc/user_guide/debugging.md b/g3doc/user_guide/debugging.md
new file mode 100644
index 000000000..0525fd5c0
--- /dev/null
+++ b/g3doc/user_guide/debugging.md
@@ -0,0 +1,131 @@
+# Debugging
+
+[TOC]
+
+To enable debug and system call logging, add the `runtimeArgs` below to your
+[Docker](../quick_start/docker/) configuration (`/etc/docker/daemon.json`):
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--debug-log=/tmp/runsc/",
+                "--debug",
+                "--strace"
+            ]
+       }
+    }
+}
+```
+
+> Note: the last `/` in `--debug-log` is needed to interpret it as a directory.
+> Then each `runsc` command executed will create a separate log file. Otherwise,
+> log messages from all commands will be appended to the same file.
+
+You may also want to pass `--log-packets` to troubleshoot network problems. Then
+restart the Docker daemon:
+
+```bash
+sudo systemctl restart docker
+```
+
+Run your container again, and inspect the files under `/tmp/runsc`. The log file
+ending with `.boot` will contain the strace logs from your application, which
+can be useful for identifying missing or broken system calls in gVisor. If you
+are having problems starting the container, the log file ending with `.create`
+may have the reason for the failure.
+
+## Stack traces
+
+The command `runsc debug --stacks` collects stack traces while the sandbox is
+running which can be useful to troubleshoot issues or just to learn more about
+gVisor. It connects to the sandbox process, collects a stack dump, and writes it
+to the console. For example:
+
+```bash
+docker run --runtime=runsc --rm -d alpine sh -c "while true; do echo running; sleep 1; done"
+63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+
+sudo runsc --root /var/run/docker/runtime-runsc/moby debug --stacks 63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+```
+
+> Note: `--root` variable is provided by docker and is normally set to
+> `/var/run/docker/runtime-[runtime-name]/moby`. If in doubt, `--root` is logged
+> to `runsc` logs.
+
+## Debugger
+
+You can debug gVisor like any other Golang program. If you're running with
+Docker, you'll need to find the sandbox PID and attach the debugger as root.
+Here is an example:
+
+```bash
+# Get a runsc with debug symbols (download nightly or build with symbols).
+bazel build -c dbg //runsc:runsc
+
+# Start the container you want to debug.
+docker run --runtime=runsc --rm --name=test -d alpine sleep 1000
+
+# Find the sandbox PID.
+docker inspect test | grep Pid | head -n 1
+
+# Attach your favorite debugger.
+sudo dlv attach <PID>
+
+# Set a breakpoint and resume.
+break mm.MemoryManager.MMap
+continue
+```
+
+## Profiling
+
+`runsc` integrates with Go profiling tools and gives you easy commands to
+profile CPU and heap usage. First you need to enable `--profile` in the command
+line options before starting the container:
+
+```json
+{
+    "runtimes": {
+        "runsc-prof": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--profile"
+            ]
+       }
+    }
+}
+```
+
+> Note: Enabling profiling loosens the seccomp protection added to the sandbox,
+> and should not be run in production under normal circumstances.
+
+Then restart docker to refresh the runtime options. While the container is
+running, execute `runsc debug` to collect profile information and save to a
+file. Here are the options available:
+
+*   **--profile-heap:** Generates heap profile to the speficied file.
+*   **--profile-cpu:** Enables CPU profiler, waits for `--duration` seconds and
+    generates CPU profile to the speficied file.
+
+For example:
+
+```bash
+docker run --runtime=runsc-prof --rm -d alpine sh -c "while true; do echo running; sleep 1; done"
+63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+
+sudo runsc --root /var/run/docker/runtime-runsc-prof/moby debug --profile-heap=/tmp/heap.prof 63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+sudo runsc --root /var/run/docker/runtime-runsc-prof/moby debug --profile-cpu=/tmp/cpu.prof --duration=30s 63254c6ab3a6989623fa1fb53616951eed31ac605a2637bb9ddba5d8d404b35b
+```
+
+The resulting files can be opened using `go tool pprof` or [pprof][]. The
+examples below create image file (`.svg`) with the heap profile and writes the
+top functions using CPU to the console:
+
+```bash
+go tool pprof -svg /usr/local/bin/runsc /tmp/heap.prof
+go tool pprof -top /usr/local/bin/runsc /tmp/cpu.prof
+```
+
+[pprof]: https://github.com/google/pprof/blob/master/doc/README.md
diff --git a/g3doc/user_guide/filesystem.md b/g3doc/user_guide/filesystem.md
new file mode 100644
index 000000000..cd00762dd
--- /dev/null
+++ b/g3doc/user_guide/filesystem.md
@@ -0,0 +1,60 @@
+# Filesystem
+
+[TOC]
+
+gVisor accesses the filesystem through a file proxy, called the Gofer. The gofer
+runs as a separate process, that is isolated from the sandbox. Gofer instances
+communicate with their respective sentry using the 9P protocol. For another
+explanation see [What is gVisor?](../README.md).
+
+## Sandbox overlay
+
+To isolate the host filesystem from the sandbox, you can set a writable tmpfs
+overlay on top of the entire filesystem. All modifications are made to the
+overlay, keeping the host filesystem unmodified.
+
+> Note: All created and modified files are stored in memory inside the sandbox.
+
+To use the tmpfs overlay, add the following `runtimeArgs` to your Docker
+configuration (`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--overlay"
+            ]
+       }
+    }
+}
+```
+
+## Shared root filesystem
+
+The root filesystem is where the image is extracted and is not generally
+modified from outside the sandbox. This allows for some optimizations, like
+skipping checks to determine if a directory has changed since the last time it
+was cached, thus missing updates that may have happened. If you need to `docker
+cp` files inside the root filesystem, you may want to enable shared mode. Just
+be aware that file system access will be slower due to the extra checks that are
+required.
+
+> Note: External mounts are always shared.
+
+To use set the root filesystem shared, add the following `runtimeArgs` to your
+Docker configuration (`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--file-access=shared"
+            ]
+       }
+    }
+}
+```
diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md
new file mode 100644
index 000000000..9afdd264d
--- /dev/null
+++ b/g3doc/user_guide/install.md
@@ -0,0 +1,157 @@
+# Installation
+
+[TOC]
+
+> Note: gVisor supports only x86\_64 and requires Linux 4.14.77+
+> ([older Linux](./networking.md#gso)).
+
+## Versions
+
+The `runsc` binaries and repositories are available in multiple versions and
+release channels. You should pick the version you'd like to install. For
+experimentation, the nightly release is recommended. For production use, the
+latest release is recommended.
+
+After selecting an appropriate release channel from the options below, proceed
+to the preferred installation mechanism: manual or from an `apt` repository.
+
+### HEAD
+
+Binaries are available for every commit on the `master` branch, and are
+available at the following URL:
+
+`https://storage.googleapis.com/gvisor/releases/master/latest/runsc`
+
+Checksums for the release binary are at:
+
+`https://storage.googleapis.com/gvisor/releases/master/latest/runsc.sha512`
+
+For `apt` installation, use the `master` as the `${DIST}` below.
+
+### Nightly
+
+Nightly releases are built most nights from the master branch, and are available
+at the following URL:
+
+`https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc`
+
+Checksums for the release binary are at:
+
+`https://storage.googleapis.com/gvisor/releases/nightly/latest/runsc.sha512`
+
+Specific nightly releases can be found at:
+
+`https://storage.googleapis.com/gvisor/releases/nightly/${yyyy-mm-dd}/runsc`
+
+Note that a release may not be available for every day.
+
+For `apt` installation, use the `nightly` as the `${DIST}` below.
+
+### Latest release
+
+The latest official release is available at the following URL:
+
+`https://storage.googleapis.com/gvisor/releases/release/latest`
+
+For `apt` installation, use the `release` as the `${DIST}` below.
+
+### Specific release
+
+A given release release is available at the following URL:
+
+`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}`
+
+See the [releases][releases] page for information about specific releases.
+
+For `apt` installation of a specific release, which may include point updates,
+use the date of the release, e.g. `${yyyymmdd}`, as the `${DIST}` below.
+
+> Note: only newer releases may be available as `apt` repositories.
+
+### Point release
+
+A given point release is available at the following URL:
+
+`https://storage.googleapis.com/gvisor/releases/release/${yyyymmdd}.${rc}`
+
+Note that `apt` installation of a specific point release is not supported.
+
+## Install from an `apt` repository
+
+First, appropriate dependencies must be installed to allow `apt` to install
+packages via https:
+
+```bash
+sudo apt-get update && \
+sudo apt-get install -y \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    gnupg-agent \
+    software-properties-common
+```
+
+Next, the key used to sign archives should be added to your `apt` keychain:
+
+```bash
+curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add -
+```
+
+Based on the release type, you will need to substitute `${DIST}` below, using
+one of:
+
+*   `master`: For HEAD.
+*   `nightly`: For nightly releases.
+*   `release`: For the latest release.
+*   `${yyyymmdd}`: For a specific releases (see above).
+
+The repository for the release you wish to install should be added:
+
+```bash
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases ${DIST} main"
+```
+
+For example, to install the latest official release, you can use:
+
+```bash
+sudo add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
+```
+
+Now the runsc package can be installed:
+
+```bash
+sudo apt-get update && sudo apt-get install -y runsc
+```
+
+If you have Docker installed, it will be automatically configured.
+
+## Install directly
+
+The binary URLs provided above can be used to install directly. For example, the
+latest nightly binary can be downloaded, validated, and placed in an appropriate
+location by running:
+
+```bash
+(
+  set -e
+  URL=https://storage.googleapis.com/gvisor/releases/nightly/latest
+  wget ${URL}/runsc
+  wget ${URL}/runsc.sha512
+  sha512sum -c runsc.sha512
+  rm -f runsc.sha512
+  sudo mv runsc /usr/local/bin
+  sudo chown root:root /usr/local/bin/runsc
+  sudo chmod 0755 /usr/local/bin/runsc
+)
+```
+
+**It is important to copy this binary to a location that is accessible to all
+users, and ensure it is executable by all users**, since `runsc` executes itself
+as user `nobody` to avoid unnecessary privileges. The `/usr/local/bin` directory
+is a good place to put the `runsc` binary.
+
+After installation, try out `runsc` by following the
+[Docker Quick Start](./quick_start/docker.md) or
+[OCI Quick Start](./quick_start/oci.md).
+
+[releases]: https://github.com/google/gvisor/releases
diff --git a/g3doc/user_guide/networking.md b/g3doc/user_guide/networking.md
new file mode 100644
index 000000000..4aa394c91
--- /dev/null
+++ b/g3doc/user_guide/networking.md
@@ -0,0 +1,85 @@
+# Networking
+
+[TOC]
+
+gVisor implements its own network stack called [netstack][netstack]. All aspects
+of the network stack are handled inside the Sentry — including TCP connection
+state, control messages, and packet assembly — keeping it isolated from the host
+network stack. Data link layer packets are written directly to the virtual
+device inside the network namespace setup by Docker or Kubernetes.
+
+The IP address and routes configured for the device are transferred inside the
+sandbox. The loopback device runs exclusively inside the sandbox and does not
+use the host. You can inspect them by running:
+
+```bash
+docker run --rm --runtime=runsc alpine ip addr
+```
+
+## Network passthrough
+
+For high-performance networking applications, you may choose to disable the user
+space network stack and instead use the host network stack, including the
+loopback. Note that this mode decreases the isolation to the host.
+
+Add the following `runtimeArgs` to your Docker configuration
+(`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--network=host"
+            ]
+       }
+    }
+}
+```
+
+## Disabling external networking
+
+To completely isolate the host and network from the sandbox, external networking
+can be disabled. The sandbox will still contain a loopback provided by netstack.
+
+Add the following `runtimeArgs` to your Docker configuration
+(`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--network=none"
+            ]
+       }
+    }
+}
+```
+
+### Disable GSO {#gso}
+
+If your Linux is older than 4.14.17, you can disable Generic Segmentation
+Offload (GSO) to run with a kernel that is newer than 3.17. Add the
+`--gso=false` flag to your Docker runtime configuration
+(`/etc/docker/daemon.json`) and restart the Docker daemon:
+
+> Note: Network performance, especially for large payloads, will be greatly
+> reduced.
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--gso=false"
+            ]
+       }
+    }
+}
+```
+
+[netstack]: https://github.com/google/netstack
diff --git a/g3doc/user_guide/platforms.md b/g3doc/user_guide/platforms.md
new file mode 100644
index 000000000..752025881
--- /dev/null
+++ b/g3doc/user_guide/platforms.md
@@ -0,0 +1,95 @@
+# Changing Platforms
+
+[TOC]
+
+This guide described how to change the
+[platform](../architecture_guide/platforms.md) used by `runsc`.
+
+## Prerequisites
+
+If you intend to run the KVM platform, you will also to have KVM installed on
+your system. If you are running a Debian based system like Debian or Ubuntu you
+can usually do this by ensuring the module is loaded, and permissions are
+appropriately set on the `/dev/kvm` device.
+
+If you have an Intel CPU:
+
+```bash
+sudo modprobe kvm-intel && sudo chmod a+rw /dev/kvm
+```
+
+If you have an AMD CPU:
+
+```bash
+sudo modprobe kvm-amd && sudo chmod a+rw /dev/kvm
+```
+
+If you are using a virtual machine you will need to make sure that nested
+virtualization is configured. Here are links to documents on how to set up
+nested virtualization in several popular environments:
+
+*   Google Cloud: [Enabling Nested Virtualization for VM Instances][nested-gcp]
+*   Microsoft Azure:
+    [How to enable nested virtualization in an Azure VM][nested-azure]
+*   VirtualBox: [Nested Virtualization][nested-virtualbox]
+*   KVM: [Nested Guests][nested-kvm]
+
+***Note: nested virtualization will have poor performance and is historically a
+cause of security issues (e.g.
+[CVE-2018-12904](https://nvd.nist.gov/vuln/detail/CVE-2018-12904)). It is not
+recommended for production.***
+
+## Configuring Docker
+
+The platform is selected by the `--platform` command line flag passed to
+`runsc`. By default, the ptrace platform is selected. For example, to select the
+KVM platform, modify your Docker configuration (`/etc/docker/daemon.json`) to
+pass the `--platform` argument:
+
+```json
+{
+    "runtimes": {
+        "runsc": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--platform=kvm"
+            ]
+       }
+    }
+}
+```
+
+You must restart the Docker daemon after making changes to this file, typically
+this is done via `systemd`:
+
+```bash
+sudo systemctl restart docker
+```
+
+Note that you may configure multiple runtimes using different platforms. For
+example, the following configuration has one configuration for ptrace and one
+for the KVM platform:
+
+```json
+{
+    "runtimes": {
+        "runsc-ptrace": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--platform=ptrace"
+            ]
+        },
+        "runsc-kvm": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--platform=kvm"
+            ]
+        }
+    }
+}
+```
+
+[nested-azure]: https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nested-virtualization
+[nested-gcp]: https://cloud.google.com/compute/docs/instances/enable-nested-virtualization-vm-instances
+[nested-virtualbox]: https://www.virtualbox.org/manual/UserManual.html#nested-virt
+[nested-kvm]: https://www.linux-kvm.org/page/Nested_Guests
diff --git a/g3doc/user_guide/quick_start/BUILD b/g3doc/user_guide/quick_start/BUILD
new file mode 100644
index 000000000..63f17f9cb
--- /dev/null
+++ b/g3doc/user_guide/quick_start/BUILD
@@ -0,0 +1,33 @@
+load("//website:defs.bzl", "doc")
+
+package(
+    default_visibility = ["//website:__pkg__"],
+    licenses = ["notice"],
+)
+
+doc(
+    name = "docker",
+    src = "docker.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/quick_start/docker/",
+    subcategory = "Quick Start",
+    weight = "11",
+)
+
+doc(
+    name = "oci",
+    src = "oci.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/quick_start/oci/",
+    subcategory = "Quick Start",
+    weight = "12",
+)
+
+doc(
+    name = "kubernetes",
+    src = "kubernetes.md",
+    category = "User Guide",
+    permalink = "/docs/user_guide/quick_start/kubernetes/",
+    subcategory = "Quick Start",
+    weight = "13",
+)
diff --git a/g3doc/user_guide/quick_start/docker.md b/g3doc/user_guide/quick_start/docker.md
new file mode 100644
index 000000000..6ad594ecc
--- /dev/null
+++ b/g3doc/user_guide/quick_start/docker.md
@@ -0,0 +1,96 @@
+# Docker Quick Start
+
+> Note: This guide requires Docker version 17.09.0 or greater. Refer to the
+> [Docker documentation][docker] for how to install it.
+
+This guide will help you quickly get started running Docker containers using
+gVisor.
+
+First, follow the [Installation guide][install].
+
+If you use the `apt` repository or the `automated` install, then you can skip
+the next section and proceed straight to running a container.
+
+## Configuring Docker
+
+First you will need to configure Docker to use `runsc` by adding a runtime entry
+to your Docker configuration (e.g. `/etc/docker/daemon.json`). The easiest way
+to this is via the `runsc install` command. This will install a docker runtime
+named "runsc" by default.
+
+```bash
+sudo runsc install
+```
+
+You may also wish to install a runtime entry for debugging. The `runsc install`
+command can accept options that will be passed to the runtime when it is invoked
+by Docker.
+
+```bash
+sudo runsc install --runtime runsc-debug -- \
+  --debug \
+  --debug-log=/tmp/runsc-debug.log \
+  --strace \
+  --log-packets
+```
+
+You must restart the Docker daemon after installing the runtime. Typically this
+is done via `systemd`:
+
+```bash
+sudo systemctl restart docker
+```
+
+## Running a container
+
+Now run your container using the `runsc` runtime:
+
+```bash
+docker run --runtime=runsc --rm hello-world
+```
+
+You can also run a terminal to explore the container.
+
+```bash
+docker run --runtime=runsc --rm -it ubuntu /bin/bash
+```
+
+Many docker options are compatible with gVisor, try them out. Here is an
+example:
+
+```bash
+docker run --runtime=runsc --rm --link backend:database -v ~/bin:/tools:ro -p 8080:80 --cpus=0.5 -it busybox telnet towel.blinkenlights.nl
+```
+
+## Verify the runtime
+
+You can verify that you are running in gVisor using the `dmesg` command.
+
+```text
+$ docker run --runtime=runsc -it ubuntu dmesg
+[    0.000000] Starting gVisor...
+[    0.354495] Daemonizing children...
+[    0.564053] Constructing home...
+[    0.976710] Preparing for the zombie uprising...
+[    1.299083] Creating process schedule...
+[    1.479987] Committing treasure map to memory...
+[    1.704109] Searching for socket adapter...
+[    1.748935] Generating random numbers by fair dice roll...
+[    2.059747] Digging up root...
+[    2.259327] Checking naughty and nice process list...
+[    2.610538] Rewriting operating system in Javascript...
+[    2.613217] Ready!
+```
+
+Note that this is easily replicated by an attacker so applications should never
+use `dmesg` to verify the runtime in a security sensitive context.
+
+Next, look at the different options available for gVisor: [platform][platforms],
+[network][networking], [filesystem][filesystem].
+
+[docker]: https://docs.docker.com/install/
+[storage-driver]: https://docs.docker.com/engine/reference/commandline/dockerd/#daemon-storage-driver
+[install]: /docs/user_guide/install/
+[filesystem]: /docs/user_guide/filesystem/
+[networking]: /docs/user_guide/networking/
+[platforms]: /docs/user_guide/platforms/
diff --git a/g3doc/user_guide/quick_start/kubernetes.md b/g3doc/user_guide/quick_start/kubernetes.md
new file mode 100644
index 000000000..f875d8002
--- /dev/null
+++ b/g3doc/user_guide/quick_start/kubernetes.md
@@ -0,0 +1,36 @@
+# Kubernetes Quick Start
+
+gVisor can be used to run Kubernetes pods and has several integration points
+with Kubernetes.
+
+## Using Minikube
+
+gVisor can run sandboxed containers in a Kubernetes cluster with Minikube. After
+the gVisor addon is enabled, pods with `io.kubernetes.cri.untrusted-workload`
+set to true will execute with `runsc`. Follow [these instructions][minikube] to
+enable gVisor addon.
+
+## Using Containerd
+
+You can also setup Kubernetes nodes to run pods in gvisor using the
+[containerd][containerd] CRI runtime and the `gvisor-containerd-shim`. You can
+use either the `io.kubernetes.cri.untrusted-workload` annotation or
+[RuntimeClass][runtimeclass] to run Pods with `runsc`. You can find instructions
+[here][gvisor-containerd-shim].
+
+## Using GKE Sandbox
+
+[GKE Sandbox][gke-sandbox] is available in [Google Kubernetes Engine][gke]. You
+just need to deploy a node pool with gVisor enabled in your cluster, and it will
+run pods annotated with `runtimeClassName: gvisor` inside a gVisor sandbox for
+you. [Here][wordpress-quick] is a quick example showing how to deploy a
+WordPress site. You can view the full documentation [here][gke-sandbox-docs].
+
+[containerd]: https://containerd.io/
+[minikube]: https://github.com/kubernetes/minikube/blob/master/deploy/addons/gvisor/README.md
+[gke]: https://cloud.google.com/kubernetes-engine/
+[gke-sandbox]: https://cloud.google.com/kubernetes-engine/sandbox/
+[gke-sandbox-docs]: https://cloud.google.com/kubernetes-engine/docs/how-to/sandbox-pods
+[gvisor-containerd-shim]: https://github.com/google/gvisor-containerd-shim
+[runtimeclass]: https://kubernetes.io/docs/concepts/containers/runtime-class/
+[wordpress-quick]: /docs/tutorials/kubernetes/
diff --git a/g3doc/user_guide/quick_start/oci.md b/g3doc/user_guide/quick_start/oci.md
new file mode 100644
index 000000000..877169145
--- /dev/null
+++ b/g3doc/user_guide/quick_start/oci.md
@@ -0,0 +1,45 @@
+# OCI Quick Start
+
+This guide will quickly get you started running your first gVisor sandbox
+container using the runtime directly with the default platform.
+
+First, follow the [Installation guide][install].
+
+## Run an OCI compatible container
+
+Now we will create an [OCI][oci] container bundle to run our container. First we
+will create a root directory for our bundle.
+
+```bash
+mkdir bundle
+cd bundle
+```
+
+Create a root file system for the container. We will use the Docker hello-world
+image as the basis for our container.
+
+```bash
+mkdir rootfs
+docker export $(docker create hello-world) | tar -xf - -C rootfs
+```
+
+Next, create an specification file called `config.json` that contains our
+container specification. We will update the default command it runs to `/hello`
+in the `hello-world` container.
+
+```bash
+runsc spec
+sed -i 's;"sh";"/hello";' config.json
+```
+
+Finally run the container.
+
+```bash
+sudo runsc run hello
+```
+
+Next try [using CNI to set up networking](../../../tutorials/cni/) or
+[running gVisor using Docker](../docker/).
+
+[oci]: https://opencontainers.org/
+[install]: /docs/user_guide/install
diff --git a/g3doc/user_guide/tutorials/BUILD b/g3doc/user_guide/tutorials/BUILD
new file mode 100644
index 000000000..caae98623
--- /dev/null
+++ b/g3doc/user_guide/tutorials/BUILD
@@ -0,0 +1,37 @@
+load("//website:defs.bzl", "doc")
+
+package(
+    default_visibility = ["//website:__pkg__"],
+    licenses = ["notice"],
+)
+
+doc(
+    name = "docker",
+    src = "docker.md",
+    category = "User Guide",
+    permalink = "/docs/tutorials/docker/",
+    subcategory = "Tutorials",
+    weight = "21",
+)
+
+doc(
+    name = "cni",
+    src = "cni.md",
+    category = "User Guide",
+    permalink = "/docs/tutorials/cni/",
+    subcategory = "Tutorials",
+    weight = "22",
+)
+
+doc(
+    name = "kubernetes",
+    src = "kubernetes.md",
+    category = "User Guide",
+    data = [
+        "add-node-pool.png",
+        "node-pool-button.png",
+    ],
+    permalink = "/docs/tutorials/kubernetes/",
+    subcategory = "Tutorials",
+    weight = "33",
+)
diff --git a/g3doc/user_guide/tutorials/add-node-pool.png b/g3doc/user_guide/tutorials/add-node-pool.png
new file mode 100644
index 000000000..e4560359b
--- /dev/null
+++ b/g3doc/user_guide/tutorials/add-node-pool.png
diff --git a/g3doc/user_guide/tutorials/cni.md b/g3doc/user_guide/tutorials/cni.md
new file mode 100644
index 000000000..ad6c9fa59
--- /dev/null
+++ b/g3doc/user_guide/tutorials/cni.md
@@ -0,0 +1,172 @@
+# Using CNI
+
+This tutorial will show you how to set up networking for a gVisor sandbox using
+the
+[Container Networking Interface (CNI)](https://github.com/containernetworking/cni).
+
+## Install CNI Plugins
+
+First you will need to install the CNI plugins. CNI plugins are used to set up a
+network namespace that `runsc` can use with the sandbox.
+
+Start by creating the directories for CNI plugin binaries:
+
+```
+sudo mkdir -p /opt/cni/bin
+```
+
+Download the CNI plugins:
+
+```
+wget https://github.com/containernetworking/plugins/releases/download/v0.8.3/cni-plugins-linux-amd64-v0.8.3.tgz
+```
+
+Next, unpack the plugins into the CNI binary directory:
+
+```
+sudo tar -xvf cni-plugins-linux-amd64-v0.8.3.tgz -C /opt/cni/bin/
+```
+
+## Configure CNI Plugins
+
+This section will show you how to configure CNI plugins. This tutorial will use
+the "bridge" and "loopback" plugins which will create the necessary bridge and
+loopback devices in our network namespace. However, you should be able to use
+any CNI compatible plugin to set up networking for gVisor sandboxes.
+
+The bridge plugin configuration specifies the IP address subnet range for IP
+addresses that will be assigned to sandboxes as well as the network routing
+configuration. This tutorial will assign IP addresses from the `10.22.0.0/16`
+range and allow all outbound traffic, however you can modify this configuration
+to suit your use case.
+
+Create the bridge and loopback plugin configurations:
+
+```
+sudo mkdir -p /etc/cni/net.d
+
+sudo sh -c 'cat > /etc/cni/net.d/10-bridge.conf << EOF
+{
+  "cniVersion": "0.4.0",
+  "name": "mynet",
+  "type": "bridge",
+  "bridge": "cni0",
+  "isGateway": true,
+  "ipMasq": true,
+  "ipam": {
+    "type": "host-local",
+    "subnet": "10.22.0.0/16",
+    "routes": [
+      { "dst": "0.0.0.0/0" }
+    ]
+  }
+}
+EOF'
+
+sudo sh -c 'cat > /etc/cni/net.d/99-loopback.conf << EOF
+{
+  "cniVersion": "0.4.0",
+  "name": "lo",
+  "type": "loopback"
+}
+EOF'
+```
+
+## Create a Network Namespace
+
+For each gVisor sandbox you will create a network namespace and configure it
+using CNI. First, create a random network namespace name and then create the
+namespace.
+
+The network namespace path will then be `/var/run/netns/${CNI_CONTAINERID}`.
+
+```
+export CNI_PATH=/opt/cni/bin
+export CNI_CONTAINERID=$(printf '%x%x%x%x' $RANDOM $RANDOM $RANDOM $RANDOM)
+export CNI_COMMAND=ADD
+export CNI_NETNS=/var/run/netns/${CNI_CONTAINERID}
+
+sudo ip netns add ${CNI_CONTAINERID}
+```
+
+Next, run the bridge and loopback plugins to apply the configuration that was
+created earlier to the namespace. Each plugin outputs some JSON indicating the
+results of executing the plugin. For example, The bridge plugin's response
+includes the IP address assigned to the ethernet device created in the network
+namespace. Take note of the IP address for use later.
+
+```
+export CNI_IFNAME="eth0"
+sudo -E /opt/cni/bin/bridge < /etc/cni/net.d/10-bridge.conf
+export CNI_IFNAME="lo"
+sudo -E /opt/cni/bin/loopback < /etc/cni/net.d/99-loopback.conf
+```
+
+Get the IP address assigned to our sandbox:
+
+```
+POD_IP=$(sudo ip netns exec ${CNI_CONTAINERID} ip -4 addr show eth0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}')
+```
+
+## Create the OCI Bundle
+
+Now that our network namespace is created and configured, we can create the OCI
+bundle for our container. As part of the bundle's `config.json` we will specify
+that the container use the network namespace that we created.
+
+The container will run a simple python webserver that we will be able to connect
+to via the IP address assigned to it via the bridge CNI plugin.
+
+Create the bundle and root filesystem directories:
+
+```
+sudo mkdir -p bundle
+cd bundle
+sudo mkdir rootfs
+sudo docker export $(docker create python) | sudo tar --same-owner -pxf - -C rootfs
+sudo mkdir -p rootfs/var/www/html
+sudo sh -c 'echo "Hello World!" > rootfs/var/www/html/index.html'
+```
+
+Next create the `config.json` specifying the network namespace. `sudo
+/usr/local/bin/runsc spec sudo sed -i 's;"sh";"python", "-m", "http.server";'
+config.json sudo sed -i "s;\"cwd\": \"/\";\"cwd\": \"/var/www/html\";"
+config.json sudo sed -i "s;\"type\": \"network\";\"type\":
+\"network\",\n\t\t\t\t\"path\": \"/var/run/netns/${CNI_CONTAINERID}\";"
+config.json`
+
+## Run the Container
+
+Now we can run and connect to the webserver. Run the container in gVisor. Use
+the same ID used for the network namespace to be consistent:
+
+```
+sudo runsc run -detach ${CNI_CONTAINERID}
+```
+
+Connect to the server via the sandbox's IP address:
+
+```
+curl http://${POD_IP}:8000/
+```
+
+You should see the server returning `Hello World!`.
+
+## Cleanup
+
+After you are finished running the container, you can clean up the network
+namespace .
+
+```
+sudo runsc kill ${CNI_CONTAINERID}
+sudo runsc delete ${CNI_CONTAINERID}
+
+export CNI_COMMAND=DEL
+
+export CNI_IFNAME="lo"
+sudo -E /opt/cni/bin/loopback < /etc/cni/net.d/99-loopback.conf
+export CNI_IFNAME="eth0"
+sudo -E /opt/cni/bin/bridge < /etc/cni/net.d/10-bridge.conf
+
+sudo ip netns delete ${CNI_CONTAINERID}
+```
diff --git a/g3doc/user_guide/tutorials/docker.md b/g3doc/user_guide/tutorials/docker.md
new file mode 100644
index 000000000..705560038
--- /dev/null
+++ b/g3doc/user_guide/tutorials/docker.md
@@ -0,0 +1,68 @@
+# WordPress with Docker
+
+This page shows you how to deploy a sample [WordPress][wordpress] site using
+[Docker][docker].
+
+### Before you begin
+
+[Follow these instructions][docker-install] to install runsc with Docker. This
+document assumes that the runtime name chosen is `runsc`.
+
+### Running WordPress
+
+Now, let's deploy a WordPress site using Docker. WordPress site requires two
+containers: web server in the frontend, MySQL database in the backend.
+
+First, let's define a few environment variables that are shared between both
+containers:
+
+```bash
+export MYSQL_PASSWORD=${YOUR_SECRET_PASSWORD_HERE?}
+export MYSQL_DB=wordpress
+export MYSQL_USER=wordpress
+```
+
+Next, let's start the database container running MySQL and wait until the
+database is initialized:
+
+```bash
+docker run --runtime=runsc --name mysql -d \
+  -e MYSQL_RANDOM_ROOT_PASSWORD=1 \
+  -e MYSQL_PASSWORD="${MYSQL_PASSWORD}" \
+  -e MYSQL_DATABASE="${MYSQL_DB}" \
+  -e MYSQL_USER="${MYSQL_USER}" \
+  mysql:5.7
+
+# Wait until this message appears in the log.
+docker logs mysql |& grep 'port: 3306  MySQL Community Server (GPL)'
+```
+
+Once the database is running, you can start the WordPress frontend. We use the
+`--link` option to connect the frontend to the database, and expose the
+WordPress to port 8080 on the localhost.
+
+```bash
+docker run --runtime=runsc --name wordpress -d \
+  --link mysql:mysql \
+  -p 8080:80 \
+  -e WORDPRESS_DB_HOST=mysql \
+  -e WORDPRESS_DB_USER="${MYSQL_USER}" \
+  -e WORDPRESS_DB_PASSWORD="${MYSQL_PASSWORD}" \
+  -e WORDPRESS_DB_NAME="${MYSQL_DB}" \
+  -e WORDPRESS_TABLE_PREFIX=wp_ \
+  wordpress
+```
+
+Now, you can access the WordPress website pointing your favorite browser to
+<http://localhost:8080>.
+
+Congratulations! You have just deployed a WordPress site using Docker.
+
+### What's next
+
+[Learn how to deploy WordPress with Kubernetes][wordpress-k8s].
+
+[docker]: https://www.docker.com/
+[docker-install]: /docs/user_guide/quick_start/docker/
+[wordpress]: https://wordpress.com/
+[wordpress-k8s]: /docs/tutorials/kubernetes/
diff --git a/g3doc/user_guide/tutorials/kubernetes.md b/g3doc/user_guide/tutorials/kubernetes.md
new file mode 100644
index 000000000..d2a94b1b7
--- /dev/null
+++ b/g3doc/user_guide/tutorials/kubernetes.md
@@ -0,0 +1,134 @@
+# WordPress with Kubernetes
+
+This page shows you how to deploy a sample [WordPress][wordpress] site using
+[GKE Sandbox][gke-sandbox].
+
+### Before you begin
+
+Take the following steps to enable the Kubernetes Engine API:
+
+1.  Visit the [Kubernetes Engine page][project-selector] in the Google Cloud
+    Platform Console.
+1.  Create or select a project.
+
+### Creating a node pool with gVisor enabled
+
+Create a node pool inside your cluster with option `--sandbox type=gvisor` added
+to the command, like below:
+
+```bash
+gcloud beta container node-pools create sandbox-pool --cluster=${CLUSTER_NAME} --image-type=cos_containerd --sandbox type=gvisor
+```
+
+If you prefer to use the console, select your cluster and select the **ADD NODE
+POOL** button:
+
+![+ ADD NODE POOL](./node-pool-button.png)
+
+Then select the **Image type** with **Containerd** and select **Enable sandbox
+with gVisor** option. Select other options as you like:
+
+![+ NODE POOL](./add-node-pool.png)
+
+### Check that gVisor is enabled
+
+The gvisor RuntimeClass is instantiated during node creation. You can check for
+the existence of the gvisor RuntimeClass using the following command:
+
+```bash
+kubectl get runtimeclasses
+```
+
+### Wordpress deployment
+
+Now, let's deploy a WordPress site using GKE Sandbox. WordPress site requires
+two pods: web server in the frontend, MySQL database in the backend. Both
+applications use PersistentVolumes to store the site data data. In addition,
+they use secret store to share MySQL password between them.
+
+First, let's download the deployment configuration files to add the runtime
+class annotation to them:
+
+```bash
+curl -LO https://k8s.io/examples/application/wordpress/wordpress-deployment.yaml
+curl -LO https://k8s.io/examples/application/wordpress/mysql-deployment.yaml
+```
+
+Add a **spec.template.spec.runtimeClassName** set to **gvisor** to both files,
+as shown below:
+
+**wordpress-deployment.yaml:** ```yaml apiVersion: v1 kind: Service metadata:
+name: wordpress labels: app: wordpress spec: ports: - port: 80 selector: app:
+wordpress tier: frontend
+
+## type: LoadBalancer
+
+apiVersion: v1 kind: PersistentVolumeClaim metadata: name: wp-pv-claim labels:
+app: wordpress spec: accessModes: - ReadWriteOnce resources: requests:
+
+## storage: 20Gi
+
+apiVersion: apps/v1 kind: Deployment metadata: name: wordpress labels: app:
+wordpress spec: selector: matchLabels: app: wordpress tier: frontend strategy:
+type: Recreate template: metadata: labels: app: wordpress tier: frontend spec:
+runtimeClassName: gvisor # ADD THIS LINE containers: - image:
+wordpress:4.8-apache name: wordpress env: - name: WORDPRESS_DB_HOST value:
+wordpress-mysql - name: WORDPRESS_DB_PASSWORD valueFrom: secretKeyRef: name:
+mysql-pass key: password ports: - containerPort: 80 name: wordpress
+volumeMounts: - name: wordpress-persistent-storage mountPath: /var/www/html
+volumes: - name: wordpress-persistent-storage persistentVolumeClaim: claimName:
+wp-pv-claim ```
+
+**mysql-deployment.yaml:** ```yaml apiVersion: v1 kind: Service metadata: name:
+wordpress-mysql labels: app: wordpress spec: ports: - port: 3306 selector: app:
+wordpress tier: mysql
+
+## clusterIP: None
+
+apiVersion: v1 kind: PersistentVolumeClaim metadata: name: mysql-pv-claim
+labels: app: wordpress spec: accessModes: - ReadWriteOnce resources: requests:
+
+## storage: 20Gi
+
+apiVersion: apps/v1 kind: Deployment metadata: name: wordpress-mysql labels:
+app: wordpress spec: selector: matchLabels: app: wordpress tier: mysql strategy:
+type: Recreate template: metadata: labels: app: wordpress tier: mysql spec:
+runtimeClassName: gvisor # ADD THIS LINE containers: - image: mysql:5.6 name:
+mysql env: - name: MYSQL_ROOT_PASSWORD valueFrom: secretKeyRef: name: mysql-pass
+key: password ports: - containerPort: 3306 name: mysql volumeMounts: - name:
+mysql-persistent-storage mountPath: /var/lib/mysql volumes: - name:
+mysql-persistent-storage persistentVolumeClaim: claimName: mysql-pv-claim ```
+
+Note that apart from `runtimeClassName: gvisor`, nothing else about the
+Deployment has is changed.
+
+You are now ready to deploy the entire application. Just create a secret to
+store MySQL's password and *apply* both deployments:
+
+```bash
+kubectl create secret generic mysql-pass --from-literal=password=${YOUR_SECRET_PASSWORD_HERE?}
+kubectl apply -f mysql-deployment.yaml
+kubectl apply -f wordpress-deployment.yaml
+```
+
+Wait for the deployments to be ready and an external IP to be assigned to the
+Wordpress service:
+
+```bash
+watch kubectl get service wordpress
+```
+
+Now, copy the service `EXTERNAL-IP` from above to your favorite browser to view
+and configure your new WordPress site.
+
+Congratulations! You have just deployed a WordPress site using GKE Sandbox.
+
+### What's next
+
+To learn more about GKE Sandbox and how to run your deployment securely, take a
+look at the [documentation][gke-sandbox-docs].
+
+[gke-sandbox-docs]: https://cloud.google.com/kubernetes-engine/docs/how-to/sandbox-pods
+[gke-sandbox]: https://cloud.google.com/kubernetes-engine/sandbox/
+[project-selector]: https://console.cloud.google.com/projectselector/kubernetes
+[wordpress]: https://wordpress.com/
diff --git a/g3doc/user_guide/tutorials/node-pool-button.png b/g3doc/user_guide/tutorials/node-pool-button.png
new file mode 100644
index 000000000..bee0c11dc
--- /dev/null
+++ b/g3doc/user_guide/tutorials/node-pool-button.png
diff --git a/go.mod b/go.mod
index c4687ed02..434fa713f 100644
--- a/go.mod
+++ b/go.mod
@@ -1,23 +1,20 @@
 module gvisor.dev/gvisor
 
-go 1.13
+go 1.14
 
 require (
-  github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
-  github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
-  github.com/golang/mock v1.3.1
-  github.com/golang/protobuf v1.3.1
-  github.com/google/btree v1.0.0
-  github.com/google/go-cmp v0.2.0
-  github.com/google/go-github/v28 v28.1.1
-  github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
-  github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
-  github.com/kr/pty v1.1.1
-  github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
-  github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
-  github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
-  github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
-  golang.org/x/net v0.0.0-20190311183353-d8887717615a
-  golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6
-  golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+	github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+	github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+	github.com/golang/protobuf v1.3.1
+	github.com/google/btree v1.0.0
+	github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+	github.com/kr/pretty v0.2.0 // indirect
+	github.com/kr/pty v1.1.1
+	github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 // indirect
+	golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527
+	golang.org/x/time v0.0.0-20191024005414-555d28b269f0
+	gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
 )
diff --git a/go.sum b/go.sum
index 434770beb..c44a17c71 100644
--- a/go.sum
+++ b/go.sum
@@ -1,21 +1,32 @@
+github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422 h1:+FKjzBIdfBHYDvxCv+djmDJdes/AoDtg8gpcxowBlF8=
 github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422/go.mod h1:b6Nc7NRH5C4aCISLry0tLnTjcuTEvoiqcWDdsU0sOGM=
+github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 h1:JFTFz3HZTGmgMz4E1TabNBNJljROSYgja1b4l50FNVs=
 github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU=
-github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
+github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
 github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
 github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
-github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
-github.com/google/go-github/v28 v28.1.1/go.mod h1:bsqJWQX05omyWVmc00nEUql9mhQyv38lDZ8kPZcQVoM=
+github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8 h1:GZGUPQiZfYrd9uOqyqwbQcHPkz/EZJVkZB1MkaO9UBI=
 github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
-github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs=
+github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
+github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78 h1:d9F+LNYwMyi3BDN4GzZdaSiq4otb8duVEWyZjeUtOQI=
 github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
+github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 h1:b6uOv7YOFK0TYG7HtkIgExQo+2RdLuwRft63jn2HWj8=
 github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
+github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e h1:/Tdc23Arz1OtdIsBY2utWepGRQ9fEAJlhkdoLzWMK8Q=
 github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk=
+github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936 h1:J9gO8RJCAFlln1jsvRba/CWVUnMHwObklfxxjErl1uk=
 github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
-golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527 h1:uYVVQ9WP/Ds2ROhcaGPeIdVq0RIXVLwsHlnvJ+cT1So=
+golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
+golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
diff --git a/images/BUILD b/images/BUILD
new file mode 100644
index 000000000..a50f388e9
--- /dev/null
+++ b/images/BUILD
@@ -0,0 +1,11 @@
+package(licenses = ["notice"])
+
+# The images filegroup is definitely not a hermetic target, and requires Make
+# to do anything meaningful with. However, this will be slurped up and used by
+# the tools/installer/images.sh installer, which will ensure that all required
+# images are available locally when running vm_tests.
+filegroup(
+    name = "images",
+    srcs = glob(["**"]),
+    visibility = ["//tools/installers:__pkg__"],
+)
diff --git a/images/Makefile b/images/Makefile
new file mode 100644
index 000000000..1485607bd
--- /dev/null
+++ b/images/Makefile
@@ -0,0 +1,93 @@
+#!/usr/bin/make -f
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ARCH is the architecture used for the build. This may be overriden at the
+# command line in order to perform a cross-build (in a limited capacity).
+ARCH := $(shell uname -m)
+
+# Note that the image prefixes used here must match the image mangling in
+# runsc/testutil.MangleImage. Names are mangled in this way to ensure that all
+# tests are using locally-defined images (that are consistent and idempotent).
+REMOTE_IMAGE_PREFIX ?= gcr.io/gvisor-presubmit
+LOCAL_IMAGE_PREFIX ?= gvisor.dev/images
+ALL_IMAGES := $(subst /,_,$(subst ./,,$(shell find . -name Dockerfile -exec dirname {} \;)))
+ifneq ($(ARCH),$(shell uname -m))
+DOCKER_PLATFORM_ARGS := --platform=$(ARCH)
+else
+DOCKER_PLATFORM_ARGS :=
+endif
+
+list-all-images:
+	@for image in $(ALL_IMAGES); do echo $${image}; done
+.PHONY: list-build-images
+
+%-all-images:
+	@$(MAKE) $(patsubst %,$*-%,$(ALL_IMAGES))
+
+# tag is a function that returns the tag name, given an image.
+#
+# The tag constructed is used to memoize the image generated (see README.md).
+# This scheme is used to enable aggressive caching in a central repository, but
+# ensuring that images will always be sourced using the local files if there
+# are changes.
+path = $(subst _,/,$(1))
+tag = $(shell find $(call path,$(1)) -type f -print | sort | xargs -n 1 sha256sum | sha256sum - | cut -c 1-16)
+remote_image = $(REMOTE_IMAGE_PREFIX)/$(subst _,/,$(1))_$(ARCH):$(call tag,$(1))
+local_image = $(LOCAL_IMAGE_PREFIX)/$(subst _,/,$(1))
+
+# rebuild builds the image locally. Only the "remote" tag will be applied. Note
+# we need to explicitly repull the base layer in order to ensure that the
+# architecture is correct. Note that we use the term "rebuild" here to avoid
+# conflicting with the bazel "build" terminology, which is used elsewhere.
+rebuild-%: register-cross
+	FROM=$(shell grep FROM $(call path,$*)/Dockerfile | cut -d' ' -f2-) && \
+		docker pull $(DOCKER_PLATFORM_ARGS) $$FROM
+	T=$$(mktemp -d) && cp -a $(call path,$*)/* $$T && \
+		docker build $(DOCKER_PLATFORM_ARGS) -t $(call remote_image,$*) $$T && \
+		rm -rf $$T
+
+# pull will check the "remote" image and pull if necessary. If the remote image
+# must be pulled, then it will tag with the latest local target. Note that pull
+# may fail if the remote image is not available.
+pull-%:
+	docker pull $(DOCKER_PLATFORM_ARGS) $(call remote_image,$*)
+
+# load will either pull the "remote" or build it locally. This is the preferred
+# entrypoint, as it should never file. The local tag should always be set after
+# this returns (either by the pull or the build).
+load-%:
+	docker inspect $(call remote_image,$*) >/dev/null 2>&1 || $(MAKE) pull-$* || $(MAKE) rebuild-$*
+	docker tag $(call remote_image,$*) $(call local_image,$*)
+
+# push pushes the remote image, after either pulling (to validate that the tag
+# already exists) or building manually.
+push-%: load-%
+	docker push $(call remote_image,$*)
+
+# register-cross registers the necessary qemu binaries for cross-compilation.
+# This may be used by any target that may execute containers that are not the
+# native format.
+register-cross:
+ifneq ($(ARCH),$(shell uname -m))
+ifeq (,$(wildcard /proc/sys/fs/binfmt_misc/qemu-*))
+	docker run --rm --privileged multiarch/qemu-user-static --reset --persistent yes
+else
+	@true # Already registered.
+endif
+else
+	@true # No cross required.
+endif
+.PHONY: register-cross
diff --git a/images/README.md b/images/README.md
new file mode 100644
index 000000000..d2efb5db4
--- /dev/null
+++ b/images/README.md
@@ -0,0 +1,61 @@
+# Container Images
+
+This directory contains all images used by tests.
+
+Note that all these images must be pushed to the testing project hosted on
+[Google Container Registry][gcr]. This will happen automatically as part of
+continuous integration. This will speed up loading as images will not need to be
+built from scratch for each test run.
+
+Image tooling is accessible via `make`, specifically via `tools/images.mk`.
+
+## Why make?
+
+Make is used because it can bootstrap the `default` image, which contains
+`bazel` and all other parts of the toolchain.
+
+## Listing images
+
+To list all images, use `make list-all-images` from the top-level directory.
+
+## Loading and referencing images
+
+To build a specific image, use `make load-<image>` from the top-level directory.
+This will ensure that an image `gvisor.dev/images/<image>:latest` is available.
+
+Images should always be referred to via the `gvisor.dev/images` canonical path.
+This tag exists only locally, but serves to decouple tests from the underlying
+image infrastructure.
+
+The continuous integration system can either take fine-grained dependencies on
+single images via individual `load` targets, or pull all images via a single
+`load-all-images` invocation.
+
+## Adding new images
+
+To add a new image, create a new directory under `images` containing a
+Dockerfile and any other files that the image requires. You may choose to add to
+an existing subdirectory if applicable, or create a new one.
+
+All images will be tagged and memoized using a hash of the directory contents.
+As a result, every image should be made completely reproducible if possible.
+This means using fixed tags and fixed versions whenever feasible.
+
+Notes that images should also be made architecture-independent if possible. The
+build scripts will handling loading the appropriate architecture onto the
+machine and tagging it with the single canonical tag.
+
+Add a `load-<image>` dependency in the Makefile if the image is required for a
+particular set of tests. This target will pull the tag from the image repository
+if available.
+
+## Building and pushing images
+
+All images can be built manually by running `build-<image>` and pushed using
+`push-<image>`. Note that you can also use `build-all-images` and
+`push-all-images`. Note that pushing will require appropriate permissions in the
+project.
+
+The continuous integration system can either take fine-grained dependencies on
+individual `push` targets, or ensure all images are up-to-date with a single
+`push-all-images` invocation.
diff --git a/images/basic/alpine/Dockerfile b/images/basic/alpine/Dockerfile
new file mode 100644
index 000000000..12b26040a
--- /dev/null
+++ b/images/basic/alpine/Dockerfile
@@ -0,0 +1 @@
+FROM alpine:3.11.5
diff --git a/images/basic/busybox/Dockerfile b/images/basic/busybox/Dockerfile
new file mode 100644
index 000000000..79b3f683a
--- /dev/null
+++ b/images/basic/busybox/Dockerfile
@@ -0,0 +1 @@
+FROM busybox:1.31.1
diff --git a/images/basic/httpd/Dockerfile b/images/basic/httpd/Dockerfile
new file mode 100644
index 000000000..83bc0ed88
--- /dev/null
+++ b/images/basic/httpd/Dockerfile
@@ -0,0 +1 @@
+FROM httpd:2.4.43
diff --git a/images/basic/mysql/Dockerfile b/images/basic/mysql/Dockerfile
new file mode 100644
index 000000000..95da9c48d
--- /dev/null
+++ b/images/basic/mysql/Dockerfile
@@ -0,0 +1 @@
+FROM mysql:8.0.19
diff --git a/images/basic/nginx/Dockerfile b/images/basic/nginx/Dockerfile
new file mode 100644
index 000000000..af2e62526
--- /dev/null
+++ b/images/basic/nginx/Dockerfile
@@ -0,0 +1 @@
+FROM nginx:1.17.9
diff --git a/images/basic/python/Dockerfile b/images/basic/python/Dockerfile
new file mode 100644
index 000000000..acf07cca9
--- /dev/null
+++ b/images/basic/python/Dockerfile
@@ -0,0 +1,2 @@
+FROM python:3
+ENTRYPOINT ["python", "-m", "http.server", "8080"]
diff --git a/images/basic/resolv/Dockerfile b/images/basic/resolv/Dockerfile
new file mode 100644
index 000000000..13665bdaf
--- /dev/null
+++ b/images/basic/resolv/Dockerfile
@@ -0,0 +1 @@
+FROM k8s.gcr.io/busybox:latest
diff --git a/images/basic/ruby/Dockerfile b/images/basic/ruby/Dockerfile
new file mode 100644
index 000000000..d290418fb
--- /dev/null
+++ b/images/basic/ruby/Dockerfile
@@ -0,0 +1 @@
+FROM ruby:2.7.1
diff --git a/images/basic/tomcat/Dockerfile b/images/basic/tomcat/Dockerfile
new file mode 100644
index 000000000..c7db39a36
--- /dev/null
+++ b/images/basic/tomcat/Dockerfile
@@ -0,0 +1 @@
+FROM tomcat:8.0
diff --git a/images/basic/ubuntu/Dockerfile b/images/basic/ubuntu/Dockerfile
new file mode 100644
index 000000000..331b71343
--- /dev/null
+++ b/images/basic/ubuntu/Dockerfile
@@ -0,0 +1 @@
+FROM ubuntu:trusty
diff --git a/images/default/Dockerfile b/images/default/Dockerfile
new file mode 100644
index 000000000..397082b02
--- /dev/null
+++ b/images/default/Dockerfile
@@ -0,0 +1,16 @@
+FROM fedora:31
+# Install bazel.
+RUN dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel
+RUN dnf install -y git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static libstdc++-static patch
+RUN pip install pycparser
+RUN dnf install -y bazel3
+# Install gcloud.
+RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-289.0.0-linux-x86_64.tar.gz | \
+    tar zxvf - google-cloud-sdk && \
+    google-cloud-sdk/install.sh && \
+    ln -s /google-cloud-sdk/bin/gcloud /usr/bin/gcloud
+# Install Docker client for the website build.
+RUN dnf config-manager --add-repo https://download.docker.com/linux/fedora/docker-ce.repo
+RUN dnf install -y docker-ce-cli
+WORKDIR /workspace
+ENTRYPOINT ["/usr/bin/bazel"]
diff --git a/images/hostoverlaytest/Dockerfile b/images/hostoverlaytest/Dockerfile
new file mode 100644
index 000000000..d83439e9c
--- /dev/null
+++ b/images/hostoverlaytest/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:bionic
+
+WORKDIR /root
+COPY . .
+
+RUN apt-get update && apt-get install -y gcc
+RUN gcc -O2 -o test test.c
diff --git a/images/hostoverlaytest/test.c b/images/hostoverlaytest/test.c
new file mode 100644
index 000000000..088f90746
--- /dev/null
+++ b/images/hostoverlaytest/test.c
@@ -0,0 +1,88 @@
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+int main(int argc, char** argv) {
+  const char kTestFilePath[] = "testfile.txt";
+  const char kOldFileData[] = "old data\n";
+  const char kNewFileData[] = "new data\n";
+  const size_t kPageSize = sysconf(_SC_PAGE_SIZE);
+
+  // Open a file that already exists in a host overlayfs lower layer.
+  const int fd_rdonly = open(kTestFilePath, O_RDONLY);
+  if (fd_rdonly < 0) {
+    err(1, "open(%s, O_RDONLY)", kTestFilePath);
+  }
+
+  // Check that the file's initial contents are what we expect when read via
+  // syscall.
+  char oldbuf[sizeof(kOldFileData)] = {};
+  ssize_t n = pread(fd_rdonly, oldbuf, sizeof(oldbuf), 0);
+  if (n < 0) {
+    err(1, "initial pread");
+  }
+  if (n != strlen(kOldFileData)) {
+    errx(1, "short initial pread (%ld/%lu bytes)", n, strlen(kOldFileData));
+  }
+  if (strcmp(oldbuf, kOldFileData) != 0) {
+    errx(1, "initial pread returned wrong data: %s", oldbuf);
+  }
+
+  // Check that the file's initial contents are what we expect when read via
+  // memory mapping.
+  void* page = mmap(NULL, kPageSize, PROT_READ, MAP_SHARED, fd_rdonly, 0);
+  if (page == MAP_FAILED) {
+    err(1, "mmap");
+  }
+  if (strcmp(page, kOldFileData) != 0) {
+    errx(1, "mapping contains wrong initial data: %s", (const char*)page);
+  }
+
+  // Open the same file writably, causing host overlayfs to copy it up, and
+  // replace its contents.
+  const int fd_rdwr = open(kTestFilePath, O_RDWR);
+  if (fd_rdwr < 0) {
+    err(1, "open(%s, O_RDWR)", kTestFilePath);
+  }
+  n = write(fd_rdwr, kNewFileData, strlen(kNewFileData));
+  if (n < 0) {
+    err(1, "write");
+  }
+  if (n != strlen(kNewFileData)) {
+    errx(1, "short write (%ld/%lu bytes)", n, strlen(kNewFileData));
+  }
+  if (ftruncate(fd_rdwr, strlen(kNewFileData)) < 0) {
+    err(1, "truncate");
+  }
+
+  int failed = 0;
+
+  // Check that syscalls on the old FD return updated contents. (Before Linux
+  // 4.18, this requires that runsc use a post-copy-up FD to service the read.)
+  char newbuf[sizeof(kNewFileData)] = {};
+  n = pread(fd_rdonly, newbuf, sizeof(newbuf), 0);
+  if (n < 0) {
+    err(1, "final pread");
+  }
+  if (n != strlen(kNewFileData)) {
+    warnx("short final pread (%ld/%lu bytes)", n, strlen(kNewFileData));
+    failed = 1;
+  } else if (strcmp(newbuf, kNewFileData) != 0) {
+    warnx("final pread returned wrong data: %s", newbuf);
+    failed = 1;
+  }
+
+  // Check that the memory mapping of the old FD has been updated. (Linux
+  // overlayfs does not do this, so regardless of kernel version this requires
+  // that runsc replace existing memory mappings with mappings of a
+  // post-copy-up FD.)
+  if (strcmp(page, kNewFileData) != 0) {
+    warnx("mapping contains wrong final data: %s", (const char*)page);
+    failed = 1;
+  }
+
+  return failed;
+}
diff --git a/images/hostoverlaytest/testfile.txt b/images/hostoverlaytest/testfile.txt
new file mode 100644
index 000000000..e4188c841
--- /dev/null
+++ b/images/hostoverlaytest/testfile.txt
@@ -0,0 +1 @@
+old data
diff --git a/images/iptables/Dockerfile b/images/iptables/Dockerfile
new file mode 100644
index 000000000..efd91cb80
--- /dev/null
+++ b/images/iptables/Dockerfile
@@ -0,0 +1,2 @@
+FROM ubuntu
+RUN apt update && apt install -y iptables
diff --git a/images/jekyll/Dockerfile b/images/jekyll/Dockerfile
new file mode 100644
index 000000000..4860dd750
--- /dev/null
+++ b/images/jekyll/Dockerfile
@@ -0,0 +1,13 @@
+FROM jekyll/jekyll:4.0.0
+USER root
+RUN gem install \
+        html-proofer:3.10.2 \
+        nokogiri:1.10.1 \
+        jekyll-autoprefixer:1.0.2 \
+        jekyll-inline-svg:1.1.4 \
+        jekyll-paginate:1.1.0 \
+        kramdown-parser-gfm:1.1.0 \
+        jekyll-relative-links:0.6.1 \
+        jekyll-feed:0.13.0 \
+        jekyll-sitemap:1.4.0
+CMD ["/usr/gem/gems/jekyll-4.0.0/exe/jekyll", "build", "-t", "-s", "/input", "-d", "/output"]
diff --git a/images/packetdrill/Dockerfile b/images/packetdrill/Dockerfile
new file mode 100644
index 000000000..01296dbaf
--- /dev/null
+++ b/images/packetdrill/Dockerfile
@@ -0,0 +1,8 @@
+FROM ubuntu:bionic
+RUN apt-get update && apt-get install -y net-tools git iptables iputils-ping \
+        netcat tcpdump jq tar bison flex make
+RUN hash -r
+RUN git clone --depth 1 --branch packetdrill-v2.0 \
+        https://github.com/google/packetdrill.git
+RUN cd packetdrill/gtests/net/packetdrill && ./configure && make
+CMD /bin/bash
diff --git a/images/packetimpact/Dockerfile b/images/packetimpact/Dockerfile
new file mode 100644
index 000000000..87aa99ef2
--- /dev/null
+++ b/images/packetimpact/Dockerfile
@@ -0,0 +1,16 @@
+FROM ubuntu:bionic
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        # iptables to disable OS native packet processing.
+        iptables \
+        # nc to check that the posix_server is running.
+        netcat \
+        # tcpdump to log brief packet sniffing.
+        tcpdump \
+        # ip link show to display MAC addresses.
+        iproute2 \
+        # tshark to log verbose packet sniffing.
+        tshark \
+        # killall for cleanup.
+        psmisc
+RUN hash -r
+CMD /bin/bash
diff --git a/images/runtimes/go1.12/Dockerfile b/images/runtimes/go1.12/Dockerfile
new file mode 100644
index 000000000..cb2944062
--- /dev/null
+++ b/images/runtimes/go1.12/Dockerfile
@@ -0,0 +1,4 @@
+# Go is easy, since we already have everything we need to compile the proctor
+# binary and run the tests in the golang Docker image.
+FROM golang:1.12
+RUN ["go", "tool", "dist", "test", "-compile-only"]
diff --git a/test/runtimes/images/Dockerfile_java11 b/images/runtimes/java11/Dockerfile
index 9b7c3d5a3..03bc8aaf1 100644
--- a/test/runtimes/images/Dockerfile_java11
+++ b/images/runtimes/java11/Dockerfile
@@ -1,8 +1,3 @@
-# Compile the proctor binary.
-FROM golang:1.12 AS golang
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
 FROM ubuntu:bionic
 RUN apt-get update && apt-get install -y \
   autoconf \
@@ -25,6 +20,3 @@ RUN set -ex \
 RUN curl -o jtreg.tar.gz https://ci.adoptopenjdk.net/view/Dependencies/job/jtreg/lastSuccessfulBuild/artifact/jtreg-4.2.0-tip.tar.gz
 RUN tar -xzf jtreg.tar.gz
 ENV PATH="/root/jtreg/bin:$PATH"
-
-COPY --from=golang /proctor /proctor
-ENTRYPOINT ["/proctor", "--runtime=java"]
diff --git a/test/runtimes/images/Dockerfile_nodejs12.4.0 b/images/runtimes/nodejs12.4.0/Dockerfile
index 26f68b487..d17924b62 100644
--- a/test/runtimes/images/Dockerfile_nodejs12.4.0
+++ b/images/runtimes/nodejs12.4.0/Dockerfile
@@ -1,8 +1,3 @@
-# Compile the proctor binary.
-FROM golang:1.12 AS golang
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
 FROM ubuntu:bionic
 RUN apt-get update && apt-get install -y \
   curl \
@@ -21,8 +16,6 @@ RUN ./configure
 RUN make
 RUN make test-build
 
-COPY --from=golang /proctor /proctor
-
 # Including dumb-init emulates the Linux "init" process, preventing the failure
 # of tests involving worker processes.
-ENTRYPOINT ["/usr/bin/dumb-init", "/proctor", "--runtime=nodejs"]
+ENTRYPOINT ["/usr/bin/dumb-init"]
diff --git a/test/runtimes/images/Dockerfile_php7.3.6 b/images/runtimes/php7.3.6/Dockerfile
index e6b4c6329..e5f67f79c 100644
--- a/test/runtimes/images/Dockerfile_php7.3.6
+++ b/images/runtimes/php7.3.6/Dockerfile
@@ -1,8 +1,3 @@
-# Compile the proctor binary.
-FROM golang:1.12 AS golang
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
 FROM ubuntu:bionic
 RUN apt-get update && apt-get install -y \
   autoconf \
@@ -22,6 +17,3 @@ RUN tar -zxf php-${VERSION}.tar.gz
 WORKDIR /root/php-${VERSION}
 RUN ./configure
 RUN make
-
-COPY --from=golang /proctor /proctor
-ENTRYPOINT ["/proctor", "--runtime=php"]
diff --git a/test/runtimes/images/Dockerfile_python3.7.3 b/images/runtimes/python3.7.3/Dockerfile
index 905cd22d7..4d1e1e221 100644
--- a/test/runtimes/images/Dockerfile_python3.7.3
+++ b/images/runtimes/python3.7.3/Dockerfile
@@ -1,10 +1,4 @@
-# Compile the proctor binary.
-FROM golang:1.12 AS golang
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
 FROM ubuntu:bionic
-
 RUN apt-get update && apt-get install -y \
   curl \
   gcc \
@@ -25,6 +19,3 @@ RUN tar -zxf cpython-${VERSION}.tar.gz
 WORKDIR /root/cpython-${VERSION}
 RUN ./configure --with-pydebug
 RUN make -s -j2
-
-COPY --from=golang /proctor /proctor
-ENTRYPOINT ["/proctor", "--runtime=python"]
diff --git a/images/tmpfile/Dockerfile b/images/tmpfile/Dockerfile
new file mode 100644
index 000000000..e3816c8cb
--- /dev/null
+++ b/images/tmpfile/Dockerfile
@@ -0,0 +1,4 @@
+# Create file under /tmp to ensure files inside '/tmp' are not overridden.
+FROM alpine:3.11.5
+RUN mkdir -p /tmp/foo \
+  && echo 123 > /tmp/foo/file.txt
diff --git a/kokoro/build.cfg b/kokoro/build.cfg
deleted file mode 100644
index c9ceda947..000000000
--- a/kokoro/build.cfg
+++ /dev/null
@@ -1,24 +0,0 @@
-build_file: "repo/scripts/build.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-repo-key"
-    }
-  }
-}
-
-env_vars {
-  key: "KOKORO_REPO_KEY"
-  value: "73898_kokoro-repo-key"
-}
-
-action {
-  define_artifacts {
-    regex: "**/runsc"
-    regex: "**/runsc.*"
-    regex: "**/dists/**"
-    regex: "**/pool/**"
-  }
-}
diff --git a/kokoro/build_tests.cfg b/kokoro/build_tests.cfg
deleted file mode 100644
index c64b7e679..000000000
--- a/kokoro/build_tests.cfg
+++ /dev/null
@@ -1 +0,0 @@
-build_file: "repo/scripts/build.sh"
diff --git a/kokoro/common.cfg b/kokoro/common.cfg
deleted file mode 100644
index 669a2e458..000000000
--- a/kokoro/common.cfg
+++ /dev/null
@@ -1,29 +0,0 @@
-# Give Kokoro access to Remote Build Executor (RBE) service account key.
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-rbe-service-account"
-    }
-  }
-}
-
-# Configure bazel to access RBE.
-bazel_setting {
-  # Our GCP project name.
-  project_id: "gvisor-rbe"
-
-  # Use RBE for execution as well as caching.
-  local_execution: false
-
-  # This must match the values in the job config.
-  auth_credential: {
-    keystore_config_id: 73898
-    keyname: "kokoro-rbe-service-account"
-  }
-
-  # Do not change unless you know what you are doing.
-  bes_backend_address: "buildeventservice.googleapis.com"
-  foundry_backend_address: "remotebuildexecution.googleapis.com"
-  upsalite_frontend_address: "https://source.cloud.google.com"
-}
diff --git a/kokoro/do_tests.cfg b/kokoro/do_tests.cfg
deleted file mode 100644
index b45ec0b42..000000000
--- a/kokoro/do_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/do_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/docker_tests.cfg b/kokoro/docker_tests.cfg
deleted file mode 100644
index 0a0ef87ed..000000000
--- a/kokoro/docker_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/docker_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/go.cfg b/kokoro/go.cfg
deleted file mode 100644
index b9c1fcb12..000000000
--- a/kokoro/go.cfg
+++ /dev/null
@@ -1,20 +0,0 @@
-build_file: "repo/scripts/go.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-github-access-token"
-    }
-  }
-}
-
-env_vars {
-  key: "KOKORO_GITHUB_ACCESS_TOKEN"
-  value: "73898_kokoro-github-access-token"
-}
-
-env_vars {
-  key: "KOKORO_GO_PUSH"
-  value: "true"
-}
diff --git a/kokoro/go_tests.cfg b/kokoro/go_tests.cfg
deleted file mode 100644
index 5eb51041a..000000000
--- a/kokoro/go_tests.cfg
+++ /dev/null
@@ -1 +0,0 @@
-build_file: "repo/scripts/go.sh"
diff --git a/kokoro/hostnet_tests.cfg b/kokoro/hostnet_tests.cfg
deleted file mode 100644
index 520dc55a3..000000000
--- a/kokoro/hostnet_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/hostnet_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg
deleted file mode 100644
index 7af20629a..000000000
--- a/kokoro/iptables_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/iptables_test.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/issue_reviver.cfg b/kokoro/issue_reviver.cfg
deleted file mode 100644
index 2370d9250..000000000
--- a/kokoro/issue_reviver.cfg
+++ /dev/null
@@ -1,15 +0,0 @@
-build_file: "repo/scripts/issue_reviver.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-github-access-token"
-    }
-  }
-}
-
-env_vars {
-  key: "KOKORO_GITHUB_ACCESS_TOKEN"
-  value: "73898_kokoro-github-access-token"
-}
diff --git a/kokoro/kvm_tests.cfg b/kokoro/kvm_tests.cfg
deleted file mode 100644
index 1feb60c8a..000000000
--- a/kokoro/kvm_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/kvm_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/kythe/generate_xrefs.cfg b/kokoro/kythe/generate_xrefs.cfg
deleted file mode 100644
index ccf657983..000000000
--- a/kokoro/kythe/generate_xrefs.cfg
+++ /dev/null
@@ -1,29 +0,0 @@
-build_file: "gvisor/kokoro/kythe/generate_xrefs.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-rbe-service-account"
-    }
-  }
-}
-
-bazel_setting {
-  project_id: "gvisor-rbe"
-  local_execution: false
-  auth_credential: {
-    keystore_config_id: 73898
-    keyname: "kokoro-rbe-service-account"
-  }
-  bes_backend_address: "buildeventservice.googleapis.com"
-  foundry_backend_address: "remotebuildexecution.googleapis.com"
-  upsalite_frontend_address: "https://source.cloud.google.com"
-}
-
-action {
-  define_artifacts {
-    regex: "**/*.kzip"
-    fail_if_no_artifacts: true
-  }
-}
diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
deleted file mode 100644
index 323b0f77b..000000000
--- a/kokoro/kythe/generate_xrefs.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -ex
-
-if command -v use_bazel.sh >/dev/null; then
-  use_bazel.sh latest
-fi
-bazel version
-
-python3 -V
-
-readonly KYTHE_VERSION='v0.0.41'
-readonly WORKDIR="$(mktemp -d)"
-readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
-if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
-  readonly KZIP_FILENAME="${KOKORO_ARTIFACTS_DIR}/${KOKORO_GIT_COMMIT}.kzip"
-else
-  readonly KZIP_FILENAME="$(git rev-parse HEAD).kzip"
-fi
-
-wget -q -O "${WORKDIR}/kythe.tar.gz" \
-  "https://github.com/kythe/kythe/releases/download/${KYTHE_VERSION}/kythe-${KYTHE_VERSION}.tar.gz"
-tar --no-same-owner -xzf "${WORKDIR}/kythe.tar.gz" --directory "$WORKDIR"
-
-if [[ -n "$KOKORO_ARTIFACTS_DIR" ]]; then
-  cd "${KOKORO_ARTIFACTS_DIR}/github/gvisor"
-fi
-bazel \
-  --bazelrc="${KYTHE_DIR}/extractors.bazelrc" \
-  build \
-  --override_repository kythe_release="${KYTHE_DIR}" \
-  --define=kythe_corpus=github.com/google/gvisor \
-  --cxxopt=-std=c++17 \
-  --config=remote \
-  --auth_credentials="${KOKORO_BAZEL_AUTH_CREDENTIAL}" \
-  //...
-
-"${KYTHE_DIR}/tools/kzip" merge \
-  --output "$KZIP_FILENAME" \
-  $(find -L bazel-out/*/extra_actions/ -name '*.kzip')
diff --git a/kokoro/make_tests.cfg b/kokoro/make_tests.cfg
deleted file mode 100644
index d973130ff..000000000
--- a/kokoro/make_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/make_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/overlay_tests.cfg b/kokoro/overlay_tests.cfg
deleted file mode 100644
index 6a2ddbd03..000000000
--- a/kokoro/overlay_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/overlay_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/packetdrill_tests.cfg b/kokoro/packetdrill_tests.cfg
deleted file mode 100644
index 258d7deb4..000000000
--- a/kokoro/packetdrill_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/packetdrill_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/release.cfg b/kokoro/release.cfg
deleted file mode 100644
index 5cec1790a..000000000
--- a/kokoro/release.cfg
+++ /dev/null
@@ -1,15 +0,0 @@
-build_file: "repo/scripts/release.sh"
-
-before_action {
-  fetch_keystore {
-    keystore_resource {
-      keystore_config_id: 73898
-      keyname: "kokoro-github-access-token"
-    }
-  }
-}
-
-env_vars {
-  key: "KOKORO_GITHUB_ACCESS_TOKEN"
-  value: "73898_kokoro-github-access-token"
-}
diff --git a/kokoro/root_tests.cfg b/kokoro/root_tests.cfg
deleted file mode 100644
index 28351695c..000000000
--- a/kokoro/root_tests.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-build_file: "repo/scripts/root_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-    regex: "**/runsc_logs_*.tar.gz"
-  }
-}
diff --git a/kokoro/runtime_tests.cfg b/kokoro/runtime_tests.cfg
deleted file mode 100644
index 7d56d5aca..000000000
--- a/kokoro/runtime_tests.cfg
+++ /dev/null
@@ -1 +0,0 @@
-build_file: "repo/scripts/runtime_tests.sh"
diff --git a/kokoro/runtime_tests/go1.12.cfg b/kokoro/runtime_tests/go1.12.cfg
deleted file mode 100644
index 164ddc18f..000000000
--- a/kokoro/runtime_tests/go1.12.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "go1.12"
-}
diff --git a/kokoro/runtime_tests/java11.cfg b/kokoro/runtime_tests/java11.cfg
deleted file mode 100644
index 4957d4794..000000000
--- a/kokoro/runtime_tests/java11.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "java11"
-}
diff --git a/kokoro/runtime_tests/nodejs12.4.0.cfg b/kokoro/runtime_tests/nodejs12.4.0.cfg
deleted file mode 100644
index 1df343f95..000000000
--- a/kokoro/runtime_tests/nodejs12.4.0.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "nodejs12.4.0"
-}
diff --git a/kokoro/runtime_tests/php7.3.6.cfg b/kokoro/runtime_tests/php7.3.6.cfg
deleted file mode 100644
index 8e3667125..000000000
--- a/kokoro/runtime_tests/php7.3.6.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "php7.3.6"
-}
diff --git a/kokoro/runtime_tests/python3.7.3.cfg b/kokoro/runtime_tests/python3.7.3.cfg
deleted file mode 100644
index 0ca70d5bb..000000000
--- a/kokoro/runtime_tests/python3.7.3.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-build_file: "github/github/kokoro/runtime_tests/runtime_tests.sh"
-
-env_vars {
-  key: "RUNTIME_TEST_NAME"
-  value: "python3.7.3"
-}
diff --git a/kokoro/simple_tests.cfg b/kokoro/simple_tests.cfg
deleted file mode 100644
index 32e0a9431..000000000
--- a/kokoro/simple_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/simple_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/swgso_tests.cfg b/kokoro/swgso_tests.cfg
deleted file mode 100644
index 101a9c607..000000000
--- a/kokoro/swgso_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/swgso_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/syscall_kvm_tests.cfg b/kokoro/syscall_kvm_tests.cfg
deleted file mode 100644
index 3b99e9c13..000000000
--- a/kokoro/syscall_kvm_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/syscall_kvm_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/kokoro/syscall_tests.cfg b/kokoro/syscall_tests.cfg
deleted file mode 100644
index ee6e4a3a4..000000000
--- a/kokoro/syscall_tests.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-build_file: "repo/scripts/syscall_tests.sh"
-
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    regex: "**/outputs.zip"
-  }
-}
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 1f3c0c687..114b516e2 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -10,6 +10,7 @@ go_library(
     name = "linux",
     srcs = [
         "aio.go",
+        "arch_amd64.go",
         "audit.go",
         "bpf.go",
         "capability.go",
@@ -17,6 +18,8 @@ go_library(
         "dev.go",
         "elf.go",
         "epoll.go",
+        "epoll_amd64.go",
+        "epoll_arm64.go",
         "errors.go",
         "eventfd.go",
         "exec.go",
@@ -28,6 +31,7 @@ go_library(
         "futex.go",
         "inotify.go",
         "ioctl.go",
+        "ioctl_tun.go",
         "ip.go",
         "ipc.go",
         "limits.go",
@@ -40,6 +44,8 @@ go_library(
         "poll.go",
         "prctl.go",
         "ptrace.go",
+        "ptrace_amd64.go",
+        "ptrace_arm64.go",
         "rseq.go",
         "rusage.go",
         "sched.go",
@@ -59,6 +65,7 @@ go_library(
         "wait.go",
         "xattr.go",
     ],
+    marshal = True,
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi",
diff --git a/pkg/abi/linux/arch_amd64.go b/pkg/abi/linux/arch_amd64.go
new file mode 100644
index 000000000..0be31e755
--- /dev/null
+++ b/pkg/abi/linux/arch_amd64.go
@@ -0,0 +1,23 @@
+// Copyright 2020 The gVisor Authors.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+// Start and end addresses of the vsyscall page.
+const (
+	VSyscallStartAddr uint64 = 0xffffffffff600000
+	VSyscallEndAddr   uint64 = 0xffffffffff601000
+)
diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go
index 89f9a793f..fa3ae5f18 100644
--- a/pkg/abi/linux/dev.go
+++ b/pkg/abi/linux/dev.go
@@ -36,6 +36,10 @@ func DecodeDeviceID(rdev uint32) (uint16, uint32) {
 //
 // See Documentations/devices.txt and uapi/linux/major.h.
 const (
+	// UNNAMED_MAJOR is the major device number for "unnamed" devices, whose
+	// minor numbers are dynamically allocated by the kernel.
+	UNNAMED_MAJOR = 0
+
 	// MEM_MAJOR is the major device number for "memory" character devices.
 	MEM_MAJOR = 1
 
diff --git a/pkg/abi/linux/elf.go b/pkg/abi/linux/elf.go
index 40f0459a0..7c9a02f20 100644
--- a/pkg/abi/linux/elf.go
+++ b/pkg/abi/linux/elf.go
@@ -102,4 +102,7 @@ const (
 
 	// NT_X86_XSTATE is for x86 extended state using xsave.
 	NT_X86_XSTATE = 0x202
+
+	// NT_ARM_TLS is for ARM TLS register.
+	NT_ARM_TLS = 0x401
 )
diff --git a/pkg/abi/linux/epoll.go b/pkg/abi/linux/epoll.go
index 0e881aa3c..1121a1a92 100644
--- a/pkg/abi/linux/epoll.go
+++ b/pkg/abi/linux/epoll.go
@@ -14,12 +14,9 @@
 
 package linux
 
-// EpollEvent is equivalent to struct epoll_event from epoll(2).
-type EpollEvent struct {
-	Events uint32
-	Fd     int32
-	Data   int32
-}
+import (
+	"gvisor.dev/gvisor/pkg/binary"
+)
 
 // Event masks.
 const (
@@ -60,3 +57,6 @@ const (
 	EPOLL_CTL_DEL = 0x2
 	EPOLL_CTL_MOD = 0x3
 )
+
+// SizeOfEpollEvent is the size of EpollEvent struct.
+var SizeOfEpollEvent = int(binary.Size(EpollEvent{}))
diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go
new file mode 100644
index 000000000..7e74b1143
--- /dev/null
+++ b/pkg/abi/linux/epoll_amd64.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+// EpollEvent is equivalent to struct epoll_event from epoll(2).
+//
+// +marshal slice:EpollEventSlice
+type EpollEvent struct {
+	Events uint32
+	// Linux makes struct epoll_event::data a __u64. We represent it as
+	// [2]int32 because, on amd64, Linux also makes struct epoll_event
+	// __attribute__((packed)), such that there is no padding between Events
+	// and Data.
+	Data [2]int32
+}
diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go
new file mode 100644
index 000000000..a35939cc9
--- /dev/null
+++ b/pkg/abi/linux/epoll_arm64.go
@@ -0,0 +1,28 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+// EpollEvent is equivalent to struct epoll_event from epoll(2).
+//
+// +marshal slice:EpollEventSlice
+type EpollEvent struct {
+	Events uint32
+	// Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding
+	// here.
+	_    int32
+	Data [2]int32
+}
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index c3ab15a4f..055ac1d7c 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -241,6 +241,8 @@ const (
 )
 
 // Statx represents struct statx.
+//
+// +marshal
 type Statx struct {
 	Mask           uint32
 	Blksize        uint32
@@ -264,6 +266,9 @@ type Statx struct {
 	DevMinor       uint32
 }
 
+// SizeOfStatx is the size of a Statx struct.
+var SizeOfStatx = binary.Size(Statx{})
+
 // FileMode represents a mode_t.
 type FileMode uint16
 
@@ -282,6 +287,11 @@ func (m FileMode) ExtraBits() FileMode {
 	return m &^ (PermissionsMask | FileTypeMask)
 }
 
+// IsDir returns true if file type represents a directory.
+func (m FileMode) IsDir() bool {
+	return m.FileType() == S_IFDIR
+}
+
 // String returns a string representation of m.
 func (m FileMode) String() string {
 	var s []string
diff --git a/pkg/abi/linux/file_amd64.go b/pkg/abi/linux/file_amd64.go
index 9d307e840..6b72364ea 100644
--- a/pkg/abi/linux/file_amd64.go
+++ b/pkg/abi/linux/file_amd64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package linux
 
 // Constants for open(2).
@@ -23,6 +25,8 @@ const (
 )
 
 // Stat represents struct stat.
+//
+// +marshal
 type Stat struct {
 	Dev     uint64
 	Ino     uint64
diff --git a/pkg/abi/linux/file_arm64.go b/pkg/abi/linux/file_arm64.go
index 26a54f416..6492c9038 100644
--- a/pkg/abi/linux/file_arm64.go
+++ b/pkg/abi/linux/file_arm64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package linux
 
 // Constants for open(2).
@@ -23,6 +25,8 @@ const (
 )
 
 // Stat represents struct stat.
+//
+// +marshal
 type Stat struct {
 	Dev     uint64
 	Ino     uint64
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index 2c652baa2..158d2db5b 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -38,6 +38,8 @@ const (
 )
 
 // Statfs is struct statfs, from uapi/asm-generic/statfs.h.
+//
+// +marshal
 type Statfs struct {
 	// Type is one of the filesystem magic values, defined above.
 	Type uint64
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 0e18db9ef..2062e6a4b 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -72,3 +72,29 @@ const (
 	SIOCGMIIPHY   = 0x8947
 	SIOCGMIIREG   = 0x8948
 )
+
+// ioctl(2) directions. Used to calculate requests number.
+// Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NONE  = 0
+	_IOC_WRITE = 1
+	_IOC_READ  = 2
+)
+
+// Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NRBITS   = 8
+	_IOC_TYPEBITS = 8
+	_IOC_SIZEBITS = 14
+	_IOC_DIRBITS  = 2
+
+	_IOC_NRSHIFT   = 0
+	_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS
+	_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS
+	_IOC_DIRSHIFT  = _IOC_SIZESHIFT + _IOC_SIZEBITS
+)
+
+// IOC outputs the result of _IOC macro in asm-generic/ioctl.h.
+func IOC(dir, typ, nr, size uint32) uint32 {
+	return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/abi/linux/ioctl_tun.go
index 6af5c400f..c59c9c136 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
+++ b/pkg/abi/linux/ioctl_tun.go
@@ -12,14 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package vfs2
+package linux
 
-import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+// ioctl(2) request numbers from linux/if_tun.h
+var (
+	TUNSETIFF = IOC(_IOC_WRITE, 'T', 202, 4)
+	TUNGETIFF = IOC(_IOC_READ, 'T', 210, 4)
 )
 
-// Override syscall table to add syscalls implementations from this package.
-func Override(table map[uintptr]kernel.Syscall) {
-	table[63] = syscalls.Supported("read", Read)
-}
+// Flags from net/if_tun.h
+const (
+	IFF_TUN      = 0x0001
+	IFF_TAP      = 0x0002
+	IFF_NO_PI    = 0x1000
+	IFF_NOFILTER = 0x1000
+)
diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go
index 31e56ffa6..ef6d1093e 100644
--- a/pkg/abi/linux/ip.go
+++ b/pkg/abi/linux/ip.go
@@ -92,6 +92,16 @@ const (
 	IP_UNICAST_IF             = 50
 )
 
+// IP_MTU_DISCOVER values from uapi/linux/in.h
+const (
+	IP_PMTUDISC_DONT      = 0
+	IP_PMTUDISC_WANT      = 1
+	IP_PMTUDISC_DO        = 2
+	IP_PMTUDISC_PROBE     = 3
+	IP_PMTUDISC_INTERFACE = 4
+	IP_PMTUDISC_OMIT      = 5
+)
+
 // Socket options from uapi/linux/in6.h
 const (
 	IPV6_ADDRFORM         = 1
diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go
index cd043dac3..07cc1895e 100644
--- a/pkg/abi/linux/mm.go
+++ b/pkg/abi/linux/mm.go
@@ -90,14 +90,19 @@ const (
 	MS_SYNC       = 1 << 2
 )
 
+// NumaPolicy is the NUMA memory policy for a memory range. See numa(7).
+//
+// +marshal
+type NumaPolicy int32
+
 // Policies for get_mempolicy(2)/set_mempolicy(2).
 const (
-	MPOL_DEFAULT    = 0
-	MPOL_PREFERRED  = 1
-	MPOL_BIND       = 2
-	MPOL_INTERLEAVE = 3
-	MPOL_LOCAL      = 4
-	MPOL_MAX        = 5
+	MPOL_DEFAULT    NumaPolicy = 0
+	MPOL_PREFERRED  NumaPolicy = 1
+	MPOL_BIND       NumaPolicy = 2
+	MPOL_INTERLEAVE NumaPolicy = 3
+	MPOL_LOCAL      NumaPolicy = 4
+	MPOL_MAX        NumaPolicy = 5
 )
 
 // Flags for get_mempolicy(2).
diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index bbc4df74c..46d8b0b42 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -146,7 +146,7 @@ type IPTIP struct {
 	// OutputInterface is the output network interface.
 	OutputInterface [IFNAMSIZ]byte
 
-	// InputInterfaceMask is the intput interface mask.
+	// InputInterfaceMask is the input interface mask.
 	InputInterfaceMask [IFNAMSIZ]byte
 
 	// OuputInterfaceMask is the output interface mask.
@@ -158,10 +158,32 @@ type IPTIP struct {
 	// Flags define matching behavior for the IP header.
 	Flags uint8
 
-	// InverseFlags invert the meaning of fields in struct IPTIP.
+	// InverseFlags invert the meaning of fields in struct IPTIP. See the
+	// IPT_INV_* flags.
 	InverseFlags uint8
 }
 
+// Flags in IPTIP.InverseFlags. Corresponding constants are in
+// include/uapi/linux/netfilter_ipv4/ip_tables.h.
+const (
+	// Invert the meaning of InputInterface.
+	IPT_INV_VIA_IN = 0x01
+	// Invert the meaning of OutputInterface.
+	IPT_INV_VIA_OUT = 0x02
+	// Unclear what this is, as no references to it exist in the kernel.
+	IPT_INV_TOS = 0x04
+	// Invert the meaning of Src.
+	IPT_INV_SRCIP = 0x08
+	// Invert the meaning of Dst.
+	IPT_INV_DSTIP = 0x10
+	// Invert the meaning of the IPT_F_FRAG flag.
+	IPT_INV_FRAG = 0x20
+	// Invert the meaning of the Protocol field.
+	IPT_INV_PROTO = 0x40
+	// Enable all flags.
+	IPT_INV_MASK = 0x7F
+)
+
 // SizeOfIPTIP is the size of an IPTIP.
 const SizeOfIPTIP = 84
 
@@ -225,11 +247,14 @@ type XTEntryTarget struct {
 // SizeOfXTEntryTarget is the size of an XTEntryTarget.
 const SizeOfXTEntryTarget = 32
 
-// XTStandardTarget is a builtin target, one of ACCEPT, DROP, JUMP, QUEUE, or
-// RETURN. It corresponds to struct xt_standard_target in
+// XTStandardTarget is a built-in target, one of ACCEPT, DROP, JUMP, QUEUE,
+// RETURN, or jump. It corresponds to struct xt_standard_target in
 // include/uapi/linux/netfilter/x_tables.h.
 type XTStandardTarget struct {
-	Target  XTEntryTarget
+	Target XTEntryTarget
+	// A positive verdict indicates a jump, and is the offset from the
+	// start of the table to jump to. A negative value means one of the
+	// other built-in targets.
 	Verdict int32
 	_       [4]byte
 }
@@ -250,6 +275,50 @@ type XTErrorTarget struct {
 // SizeOfXTErrorTarget is the size of an XTErrorTarget.
 const SizeOfXTErrorTarget = 64
 
+// Flag values for NfNATIPV4Range. The values indicate whether to map
+// protocol specific part(ports) or IPs. It corresponds to values in
+// include/uapi/linux/netfilter/nf_nat.h.
+const (
+	NF_NAT_RANGE_MAP_IPS            = 1 << 0
+	NF_NAT_RANGE_PROTO_SPECIFIED    = 1 << 1
+	NF_NAT_RANGE_PROTO_RANDOM       = 1 << 2
+	NF_NAT_RANGE_PERSISTENT         = 1 << 3
+	NF_NAT_RANGE_PROTO_RANDOM_FULLY = 1 << 4
+	NF_NAT_RANGE_PROTO_RANDOM_ALL   = (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
+	NF_NAT_RANGE_MASK               = (NF_NAT_RANGE_MAP_IPS |
+		NF_NAT_RANGE_PROTO_SPECIFIED | NF_NAT_RANGE_PROTO_RANDOM |
+		NF_NAT_RANGE_PERSISTENT | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
+)
+
+// NfNATIPV4Range corresponds to struct nf_nat_ipv4_range
+// in include/uapi/linux/netfilter/nf_nat.h. The fields are in
+// network byte order.
+type NfNATIPV4Range struct {
+	Flags   uint32
+	MinIP   [4]byte
+	MaxIP   [4]byte
+	MinPort uint16
+	MaxPort uint16
+}
+
+// NfNATIPV4MultiRangeCompat corresponds to struct
+// nf_nat_ipv4_multi_range_compat in include/uapi/linux/netfilter/nf_nat.h.
+type NfNATIPV4MultiRangeCompat struct {
+	RangeSize uint32
+	RangeIPV4 NfNATIPV4Range
+}
+
+// XTRedirectTarget triggers a redirect when reached.
+// Adding 4 bytes of padding to make the struct 8 byte aligned.
+type XTRedirectTarget struct {
+	Target  XTEntryTarget
+	NfRange NfNATIPV4MultiRangeCompat
+	_       [4]byte
+}
+
+// SizeOfXTRedirectTarget is the size of an XTRedirectTarget.
+const SizeOfXTRedirectTarget = 56
+
 // IPTGetinfo is the argument for the IPT_SO_GET_INFO sockopt. It corresponds
 // to struct ipt_getinfo in include/uapi/linux/netfilter_ipv4/ip_tables.h.
 type IPTGetinfo struct {
@@ -440,3 +509,44 @@ const (
 	// Enable all flags.
 	XT_UDP_INV_MASK = 0x03
 )
+
+// IPTOwnerInfo holds data for matching packets with owner. It corresponds
+// to struct ipt_owner_info in libxt_owner.c of iptables binary.
+type IPTOwnerInfo struct {
+	// UID is user id which created the packet.
+	UID uint32
+
+	// GID is group id which created the packet.
+	GID uint32
+
+	// PID is process id of the process which created the packet.
+	PID uint32
+
+	// SID is session id which created the packet.
+	SID uint32
+
+	// Comm is the command name which created the packet.
+	Comm [16]byte
+
+	// Match is used to match UID/GID of the socket. See the
+	// XT_OWNER_* flags below.
+	Match uint8
+
+	// Invert flips the meaning of Match field.
+	Invert uint8
+}
+
+// SizeOfIPTOwnerInfo is the size of an XTOwnerMatchInfo.
+const SizeOfIPTOwnerInfo = 34
+
+// Flags in IPTOwnerInfo.Match. Corresponding constants are in
+// include/uapi/linux/netfilter/xt_owner.h.
+const (
+	// Match the UID of the packet.
+	XT_OWNER_UID = 1 << 0
+	// Match the GID of the packet.
+	XT_OWNER_GID = 1 << 1
+	// Match if the socket exists for the packet. Forwarded
+	// packets do not have an associated socket.
+	XT_OWNER_SOCKET = 1 << 2
+)
diff --git a/pkg/abi/linux/netfilter_test.go b/pkg/abi/linux/netfilter_test.go
index 21e237f92..565dd550e 100644
--- a/pkg/abi/linux/netfilter_test.go
+++ b/pkg/abi/linux/netfilter_test.go
@@ -29,6 +29,7 @@ func TestSizes(t *testing.T) {
 		{IPTGetEntries{}, SizeOfIPTGetEntries},
 		{IPTGetinfo{}, SizeOfIPTGetinfo},
 		{IPTIP{}, SizeOfIPTIP},
+		{IPTOwnerInfo{}, SizeOfIPTOwnerInfo},
 		{IPTReplace{}, SizeOfIPTReplace},
 		{XTCounters{}, SizeOfXTCounters},
 		{XTEntryMatch{}, SizeOfXTEntryMatch},
diff --git a/pkg/abi/linux/ptrace_amd64.go b/pkg/abi/linux/ptrace_amd64.go
new file mode 100644
index 000000000..ed3881e27
--- /dev/null
+++ b/pkg/abi/linux/ptrace_amd64.go
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+// PtraceRegs is the set of CPU registers exposed by ptrace. Source:
+// syscall.PtraceRegs.
+//
+// +marshal
+// +stateify savable
+type PtraceRegs struct {
+	R15      uint64
+	R14      uint64
+	R13      uint64
+	R12      uint64
+	Rbp      uint64
+	Rbx      uint64
+	R11      uint64
+	R10      uint64
+	R9       uint64
+	R8       uint64
+	Rax      uint64
+	Rcx      uint64
+	Rdx      uint64
+	Rsi      uint64
+	Rdi      uint64
+	Orig_rax uint64
+	Rip      uint64
+	Cs       uint64
+	Eflags   uint64
+	Rsp      uint64
+	Ss       uint64
+	Fs_base  uint64
+	Gs_base  uint64
+	Ds       uint64
+	Es       uint64
+	Fs       uint64
+	Gs       uint64
+}
diff --git a/runsc/boot/loader_arm64.go b/pkg/abi/linux/ptrace_arm64.go
index cf64d28c8..6147738b3 100644
--- a/runsc/boot/loader_arm64.go
+++ b/pkg/abi/linux/ptrace_arm64.go
@@ -14,14 +14,16 @@
 
 // +build arm64
 
-package boot
+package linux
 
-import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-)
-
-func init() {
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(linux.ARM64)
+// PtraceRegs is the set of CPU registers exposed by ptrace. Source:
+// syscall.PtraceRegs.
+//
+// +marshal
+// +stateify savable
+type PtraceRegs struct {
+	Regs   [31]uint64
+	Sp     uint64
+	Pc     uint64
+	Pstate uint64
 }
diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go
index 4eeb5cd7a..d0607e256 100644
--- a/pkg/abi/linux/seccomp.go
+++ b/pkg/abi/linux/seccomp.go
@@ -63,3 +63,10 @@ func (a BPFAction) String() string {
 func (a BPFAction) Data() uint16 {
 	return uint16(a & SECCOMP_RET_DATA)
 }
+
+// SockFprog is sock_fprog taken from <linux/filter.h>.
+type SockFprog struct {
+	Len    uint16
+	pad    [6]byte
+	Filter *BPFInstruction
+}
diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go
index c69b04ea9..1c330e763 100644
--- a/pkg/abi/linux/signal.go
+++ b/pkg/abi/linux/signal.go
@@ -115,6 +115,8 @@ const (
 )
 
 // SignalSet is a signal mask with a bit corresponding to each signal.
+//
+// +marshal
 type SignalSet uint64
 
 // SignalSetSize is the size in bytes of a SignalSet.
diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index 5c5a58cd4..e6860ed49 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -101,6 +101,8 @@ func NsecToTimeT(nsec int64) TimeT {
 }
 
 // Timespec represents struct timespec in <time.h>.
+//
+// +marshal
 type Timespec struct {
 	Sec  int64
 	Nsec int64
@@ -155,6 +157,8 @@ func DurationToTimespec(dur time.Duration) Timespec {
 const SizeOfTimeval = 16
 
 // Timeval represents struct timeval in <time.h>.
+//
+// +marshal
 type Timeval struct {
 	Sec  int64
 	Usec int64
@@ -228,6 +232,8 @@ type Tms struct {
 type TimerID int32
 
 // StatxTimestamp represents struct statx_timestamp.
+//
+// +marshal
 type StatxTimestamp struct {
 	Sec  int64
 	Nsec uint32
@@ -256,6 +262,8 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) {
 }
 
 // Utime represents struct utimbuf used by utimes(2).
+//
+// +marshal
 type Utime struct {
 	Actime  int64
 	Modtime int64
diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go
index a3b6406fa..99180b208 100644
--- a/pkg/abi/linux/xattr.go
+++ b/pkg/abi/linux/xattr.go
@@ -18,6 +18,7 @@ package linux
 const (
 	XATTR_NAME_MAX = 255
 	XATTR_SIZE_MAX = 65536
+	XATTR_LIST_MAX = 65536
 
 	XATTR_CREATE  = 1
 	XATTR_REPLACE = 2
diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD
index 9612f072e..ffc918846 100644
--- a/pkg/amutex/BUILD
+++ b/pkg/amutex/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "amutex",
     srcs = ["amutex.go"],
     visibility = ["//:sandbox"],
+    deps = ["//pkg/syserror"],
 )
 
 go_test(
diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go
index 1c4fd1784..a078a31db 100644
--- a/pkg/amutex/amutex.go
+++ b/pkg/amutex/amutex.go
@@ -18,6 +18,8 @@ package amutex
 
 import (
 	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // Sleeper must be implemented by users of the abortable mutex to allow for
@@ -53,6 +55,21 @@ func (NoopSleeper) SleepFinish(success bool) {}
 // Interrupted implements Sleeper.Interrupted.
 func (NoopSleeper) Interrupted() bool { return false }
 
+// Block blocks until either receiving from ch succeeds (in which case it
+// returns nil) or sleeper is interrupted (in which case it returns
+// syserror.ErrInterrupted).
+func Block(sleeper Sleeper, ch <-chan struct{}) error {
+	cancel := sleeper.SleepStart()
+	select {
+	case <-ch:
+		sleeper.SleepFinish(true)
+		return nil
+	case <-cancel:
+		sleeper.SleepFinish(false)
+		return syserror.ErrInterrupted
+	}
+}
+
 // AbortableMutex is an abortable mutex. It allows Lock() to be aborted while it
 // waits to acquire the mutex.
 type AbortableMutex struct {
diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index 3948074ba..1a30f6967 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -5,10 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "atomicbitops",
     srcs = [
-        "atomic_bitops.go",
-        "atomic_bitops_amd64.s",
-        "atomic_bitops_arm64.s",
-        "atomic_bitops_common.go",
+        "atomicbitops.go",
+        "atomicbitops_amd64.s",
+        "atomicbitops_arm64.s",
+        "atomicbitops_noasm.go",
     ],
     visibility = ["//:sandbox"],
 )
@@ -16,7 +16,7 @@ go_library(
 go_test(
     name = "atomicbitops_test",
     size = "small",
-    srcs = ["atomic_bitops_test.go"],
+    srcs = ["atomicbitops_test.go"],
     library = ":atomicbitops",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/atomicbitops/atomic_bitops.go b/pkg/atomicbitops/atomicbitops.go
index fcc41a9ea..1be081719 100644
--- a/pkg/atomicbitops/atomic_bitops.go
+++ b/pkg/atomicbitops/atomicbitops.go
@@ -14,47 +14,34 @@
 
 // +build amd64 arm64
 
-// Package atomicbitops provides basic bitwise operations in an atomic way.
-// The implementation on amd64 leverages the LOCK prefix directly instead of
-// relying on the generic cas primitives, and the arm64 leverages the LDAXR
-// and STLXR pair primitives.
+// Package atomicbitops provides extensions to the sync/atomic package.
 //
-// WARNING: the bitwise ops provided in this package doesn't imply any memory
-// ordering. Using them to construct locks must employ proper memory barriers.
+// All read-modify-write operations implemented by this package have
+// acquire-release memory ordering (like sync/atomic).
 package atomicbitops
 
-// AndUint32 atomically applies bitwise and operation to *addr with val.
+// AndUint32 atomically applies bitwise AND operation to *addr with val.
 func AndUint32(addr *uint32, val uint32)
 
-// OrUint32 atomically applies bitwise or operation to *addr with val.
+// OrUint32 atomically applies bitwise OR operation to *addr with val.
 func OrUint32(addr *uint32, val uint32)
 
-// XorUint32 atomically applies bitwise xor operation to *addr with val.
+// XorUint32 atomically applies bitwise XOR operation to *addr with val.
 func XorUint32(addr *uint32, val uint32)
 
 // CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
 // the value previously stored at addr.
 func CompareAndSwapUint32(addr *uint32, old, new uint32) uint32
 
-// AndUint64 atomically applies bitwise and operation to *addr with val.
+// AndUint64 atomically applies bitwise AND operation to *addr with val.
 func AndUint64(addr *uint64, val uint64)
 
-// OrUint64 atomically applies bitwise or operation to *addr with val.
+// OrUint64 atomically applies bitwise OR operation to *addr with val.
 func OrUint64(addr *uint64, val uint64)
 
-// XorUint64 atomically applies bitwise xor operation to *addr with val.
+// XorUint64 atomically applies bitwise XOR operation to *addr with val.
 func XorUint64(addr *uint64, val uint64)
 
 // CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns
 // the value previously stored at addr.
 func CompareAndSwapUint64(addr *uint64, old, new uint64) uint64
-
-// IncUnlessZeroInt32 increments the value stored at the given address and
-// returns true; unless the value stored in the pointer is zero, in which case
-// it is left unmodified and false is returned.
-func IncUnlessZeroInt32(addr *int32) bool
-
-// DecUnlessOneInt32 decrements the value stored at the given address and
-// returns true; unless the value stored in the pointer is 1, in which case it
-// is left unmodified and false is returned.
-func DecUnlessOneInt32(addr *int32) bool
diff --git a/pkg/atomicbitops/atomic_bitops_amd64.s b/pkg/atomicbitops/atomicbitops_amd64.s
index db0972001..54c887ee5 100644
--- a/pkg/atomicbitops/atomic_bitops_amd64.s
+++ b/pkg/atomicbitops/atomicbitops_amd64.s
@@ -75,41 +75,3 @@ TEXT ·CompareAndSwapUint64(SB),$0-32
   CMPXCHGQ DX, 0(DI)
   MOVQ  AX, ret+24(FP)
   RET
-
-TEXT ·IncUnlessZeroInt32(SB),NOSPLIT,$0-9
-  MOVQ     addr+0(FP), DI
-  MOVL     0(DI), AX
-
-retry:
-  TESTL    AX, AX
-  JZ       fail
-  LEAL     1(AX), DX
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  JNZ      retry
-
-  SETEQ    ret+8(FP)
-  RET
-
-fail:
-  MOVB     AX, ret+8(FP)
-  RET
-
-TEXT ·DecUnlessOneInt32(SB),NOSPLIT,$0-9
-  MOVQ     addr+0(FP), DI
-  MOVL     0(DI), AX
-
-retry:
-  LEAL     -1(AX), DX
-  TESTL    DX, DX
-  JZ       fail
-  LOCK
-  CMPXCHGL DX, 0(DI)
-  JNZ      retry
-
-  SETEQ    ret+8(FP)
-  RET
-
-fail:
-  MOVB     DX, ret+8(FP)
-  RET
diff --git a/pkg/atomicbitops/atomic_bitops_arm64.s b/pkg/atomicbitops/atomicbitops_arm64.s
index 97f8808c1..5c780851b 100644
--- a/pkg/atomicbitops/atomic_bitops_arm64.s
+++ b/pkg/atomicbitops/atomicbitops_arm64.s
@@ -50,7 +50,6 @@ TEXT ·CompareAndSwapUint32(SB),$0-20
   MOVD addr+0(FP), R0
   MOVW old+8(FP), R1
   MOVW new+12(FP), R2
-
 again:
   LDAXRW (R0), R3
   CMPW R1, R3
@@ -95,7 +94,6 @@ TEXT ·CompareAndSwapUint64(SB),$0-32
   MOVD addr+0(FP), R0
   MOVD old+8(FP), R1
   MOVD new+16(FP), R2
-
 again:
   LDAXR (R0), R3
   CMP R1, R3
@@ -105,35 +103,3 @@ again:
 done:
   MOVD R3, prev+24(FP)
   RET
-
-TEXT ·IncUnlessZeroInt32(SB),NOSPLIT,$0-9
-  MOVD addr+0(FP), R0
-
-again:
-  LDAXRW (R0), R1
-  CBZ R1, fail
-  ADDW $1, R1
-  STLXRW R1, (R0), R2
-  CBNZ R2, again
-  MOVW $1, R2
-  MOVB R2, ret+8(FP)
-  RET
-fail:
-  MOVB ZR, ret+8(FP)
-  RET
-
-TEXT ·DecUnlessOneInt32(SB),NOSPLIT,$0-9
-  MOVD addr+0(FP), R0
-
-again:
-  LDAXRW (R0), R1
-  SUBSW $1, R1, R1
-  BEQ fail
-  STLXRW R1, (R0), R2
-  CBNZ R2, again
-  MOVW $1, R2
-  MOVB R2, ret+8(FP)
-  RET
-fail:
-  MOVB ZR, ret+8(FP)
-  RET
diff --git a/pkg/atomicbitops/atomic_bitops_common.go b/pkg/atomicbitops/atomicbitops_noasm.go
index 85163ad62..3b2898256 100644
--- a/pkg/atomicbitops/atomic_bitops_common.go
+++ b/pkg/atomicbitops/atomicbitops_noasm.go
@@ -20,7 +20,6 @@ import (
 	"sync/atomic"
 )
 
-// AndUint32 atomically applies bitwise and operation to *addr with val.
 func AndUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -31,7 +30,6 @@ func AndUint32(addr *uint32, val uint32) {
 	}
 }
 
-// OrUint32 atomically applies bitwise or operation to *addr with val.
 func OrUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -42,7 +40,6 @@ func OrUint32(addr *uint32, val uint32) {
 	}
 }
 
-// XorUint32 atomically applies bitwise xor operation to *addr with val.
 func XorUint32(addr *uint32, val uint32) {
 	for {
 		o := atomic.LoadUint32(addr)
@@ -53,8 +50,6 @@ func XorUint32(addr *uint32, val uint32) {
 	}
 }
 
-// CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
-// the value previously stored at addr.
 func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
 	for {
 		prev = atomic.LoadUint32(addr)
@@ -67,7 +62,6 @@ func CompareAndSwapUint32(addr *uint32, old, new uint32) (prev uint32) {
 	}
 }
 
-// AndUint64 atomically applies bitwise and operation to *addr with val.
 func AndUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -78,7 +72,6 @@ func AndUint64(addr *uint64, val uint64) {
 	}
 }
 
-// OrUint64 atomically applies bitwise or operation to *addr with val.
 func OrUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -89,7 +82,6 @@ func OrUint64(addr *uint64, val uint64) {
 	}
 }
 
-// XorUint64 atomically applies bitwise xor operation to *addr with val.
 func XorUint64(addr *uint64, val uint64) {
 	for {
 		o := atomic.LoadUint64(addr)
@@ -100,8 +92,6 @@ func XorUint64(addr *uint64, val uint64) {
 	}
 }
 
-// CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns
-// the value previously stored at addr.
 func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) {
 	for {
 		prev = atomic.LoadUint64(addr)
@@ -113,35 +103,3 @@ func CompareAndSwapUint64(addr *uint64, old, new uint64) (prev uint64) {
 		}
 	}
 }
-
-// IncUnlessZeroInt32 increments the value stored at the given address and
-// returns true; unless the value stored in the pointer is zero, in which case
-// it is left unmodified and false is returned.
-func IncUnlessZeroInt32(addr *int32) bool {
-	for {
-		v := atomic.LoadInt32(addr)
-		if v == 0 {
-			return false
-		}
-
-		if atomic.CompareAndSwapInt32(addr, v, v+1) {
-			return true
-		}
-	}
-}
-
-// DecUnlessOneInt32 decrements the value stored at the given address and
-// returns true; unless the value stored in the pointer is 1, in which case it
-// is left unmodified and false is returned.
-func DecUnlessOneInt32(addr *int32) bool {
-	for {
-		v := atomic.LoadInt32(addr)
-		if v == 1 {
-			return false
-		}
-
-		if atomic.CompareAndSwapInt32(addr, v, v-1) {
-			return true
-		}
-	}
-}
diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomicbitops_test.go
index 9466d3e23..73af71bb4 100644
--- a/pkg/atomicbitops/atomic_bitops_test.go
+++ b/pkg/atomicbitops/atomicbitops_test.go
@@ -196,67 +196,3 @@ func TestCompareAndSwapUint64(t *testing.T) {
 		}
 	}
 }
-
-func TestIncUnlessZeroInt32(t *testing.T) {
-	for _, test := range []struct {
-		initial int32
-		final   int32
-		ret     bool
-	}{
-		{
-			initial: 0,
-			final:   0,
-			ret:     false,
-		},
-		{
-			initial: 1,
-			final:   2,
-			ret:     true,
-		},
-		{
-			initial: 2,
-			final:   3,
-			ret:     true,
-		},
-	} {
-		val := test.initial
-		if got, want := IncUnlessZeroInt32(&val), test.ret; got != want {
-			t.Errorf("For initial value of %d: incorrect return value: got %v, wanted %v", test.initial, got, want)
-		}
-		if got, want := val, test.final; got != want {
-			t.Errorf("For initial value of %d: incorrect final value: got %d, wanted %d", test.initial, got, want)
-		}
-	}
-}
-
-func TestDecUnlessOneInt32(t *testing.T) {
-	for _, test := range []struct {
-		initial int32
-		final   int32
-		ret     bool
-	}{
-		{
-			initial: 0,
-			final:   -1,
-			ret:     true,
-		},
-		{
-			initial: 1,
-			final:   1,
-			ret:     false,
-		},
-		{
-			initial: 2,
-			final:   1,
-			ret:     true,
-		},
-	} {
-		val := test.initial
-		if got, want := DecUnlessOneInt32(&val), test.ret; got != want {
-			t.Errorf("For initial value of %d: incorrect return value: got %v, wanted %v", test.initial, got, want)
-		}
-		if got, want := val, test.final; got != want {
-			t.Errorf("For initial value of %d: incorrect final value: got %d, wanted %d", test.initial, got, want)
-		}
-	}
-}
diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go
index 93a435b80..998645388 100644
--- a/pkg/bits/bits_template.go
+++ b/pkg/bits/bits_template.go
@@ -42,3 +42,11 @@ func Mask(is ...int) T {
 func MaskOf(i int) T {
 	return T(1) << T(i)
 }
+
+// IsPowerOfTwo returns true if v is power of 2.
+func IsPowerOfTwo(v T) bool {
+	if v == 0 {
+		return false
+	}
+	return v&(v-1) == 0
+}
diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go
index 1b018d808..193d1ebcd 100644
--- a/pkg/bits/uint64_test.go
+++ b/pkg/bits/uint64_test.go
@@ -114,3 +114,21 @@ func TestIsOn(t *testing.T) {
 		}
 	}
 }
+
+func TestIsPowerOfTwo(t *testing.T) {
+	for _, tc := range []struct {
+		v    uint64
+		want bool
+	}{
+		{v: 0, want: false},
+		{v: 1, want: true},
+		{v: 2, want: true},
+		{v: 3, want: false},
+		{v: 4, want: true},
+		{v: 5, want: false},
+	} {
+		if got := IsPowerOfTwo64(tc.v); got != tc.want {
+			t.Errorf("IsPowerOfTwo(%d) = %t, want: %t", tc.v, got, tc.want)
+		}
+	}
+}
diff --git a/pkg/buffer/BUILD b/pkg/buffer/BUILD
new file mode 100644
index 000000000..dcd086298
--- /dev/null
+++ b/pkg/buffer/BUILD
@@ -0,0 +1,43 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "buffer_list",
+    out = "buffer_list.go",
+    package = "buffer",
+    prefix = "buffer",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*buffer",
+        "Linker": "*buffer",
+    },
+)
+
+go_library(
+    name = "buffer",
+    srcs = [
+        "buffer.go",
+        "buffer_list.go",
+        "safemem.go",
+        "view.go",
+        "view_unsafe.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/safemem",
+    ],
+)
+
+go_test(
+    name = "buffer_test",
+    size = "small",
+    srcs = [
+        "safemem_test.go",
+        "view_test.go",
+    ],
+    library = ":buffer",
+    deps = ["//pkg/safemem"],
+)
diff --git a/pkg/buffer/buffer.go b/pkg/buffer/buffer.go
new file mode 100644
index 000000000..c6d089fd9
--- /dev/null
+++ b/pkg/buffer/buffer.go
@@ -0,0 +1,94 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package buffer provides the implementation of a buffer view.
+//
+// A view is an flexible buffer, backed by a pool, supporting the safecopy
+// operations natively as well as the ability to grow via either prepend or
+// append, as well as shrink.
+package buffer
+
+import (
+	"sync"
+)
+
+const bufferSize = 8144 // See below.
+
+// buffer encapsulates a queueable byte buffer.
+//
+// Note that the total size is slightly less than two pages. This is done
+// intentionally to ensure that the buffer object aligns with runtime
+// internals. We have no hard size or alignment requirements. This two page
+// size will effectively minimize internal fragmentation, but still have a
+// large enough chunk to limit excessive segmentation.
+//
+// +stateify savable
+type buffer struct {
+	data  [bufferSize]byte
+	read  int
+	write int
+	bufferEntry
+}
+
+// reset resets internal data.
+//
+// This must be called before returning the buffer to the pool.
+func (b *buffer) Reset() {
+	b.read = 0
+	b.write = 0
+}
+
+// Full indicates the buffer is full.
+//
+// This indicates there is no capacity left to write.
+func (b *buffer) Full() bool {
+	return b.write == len(b.data)
+}
+
+// ReadSize returns the number of bytes available for reading.
+func (b *buffer) ReadSize() int {
+	return b.write - b.read
+}
+
+// ReadMove advances the read index by the given amount.
+func (b *buffer) ReadMove(n int) {
+	b.read += n
+}
+
+// ReadSlice returns the read slice for this buffer.
+func (b *buffer) ReadSlice() []byte {
+	return b.data[b.read:b.write]
+}
+
+// WriteSize returns the number of bytes available for writing.
+func (b *buffer) WriteSize() int {
+	return len(b.data) - b.write
+}
+
+// WriteMove advances the write index by the given amount.
+func (b *buffer) WriteMove(n int) {
+	b.write += n
+}
+
+// WriteSlice returns the write slice for this buffer.
+func (b *buffer) WriteSlice() []byte {
+	return b.data[b.write:]
+}
+
+// bufferPool is a pool for buffers.
+var bufferPool = sync.Pool{
+	New: func() interface{} {
+		return new(buffer)
+	},
+}
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
new file mode 100644
index 000000000..b789e56e9
--- /dev/null
+++ b/pkg/buffer/safemem.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// WriteBlock returns this buffer as a write Block.
+func (b *buffer) WriteBlock() safemem.Block {
+	return safemem.BlockFromSafeSlice(b.WriteSlice())
+}
+
+// ReadBlock returns this buffer as a read Block.
+func (b *buffer) ReadBlock() safemem.Block {
+	return safemem.BlockFromSafeSlice(b.ReadSlice())
+}
+
+// WriteFromSafememReader writes up to count bytes from r to v and advances the
+// write index by the number of bytes written. It calls r.ReadToBlocks() at
+// most once.
+func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) {
+	if count == 0 {
+		return 0, nil
+	}
+
+	var (
+		dst    safemem.BlockSeq
+		blocks []safemem.Block
+	)
+
+	// Need at least one buffer.
+	firstBuf := v.data.Back()
+	if firstBuf == nil {
+		firstBuf = bufferPool.Get().(*buffer)
+		v.data.PushBack(firstBuf)
+	}
+
+	// Does the last block have sufficient capacity alone?
+	if l := uint64(firstBuf.WriteSize()); l >= count {
+		dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count))
+	} else {
+		// Append blocks until sufficient.
+		count -= l
+		blocks = append(blocks, firstBuf.WriteBlock())
+		for count > 0 {
+			emptyBuf := bufferPool.Get().(*buffer)
+			v.data.PushBack(emptyBuf)
+			block := emptyBuf.WriteBlock().TakeFirst64(count)
+			count -= uint64(block.Len())
+			blocks = append(blocks, block)
+		}
+		dst = safemem.BlockSeqFromSlice(blocks)
+	}
+
+	// Perform I/O.
+	n, err := r.ReadToBlocks(dst)
+	v.size += int64(n)
+
+	// Update all indices.
+	for left := n; left > 0; firstBuf = firstBuf.Next() {
+		if l := firstBuf.WriteSize(); left >= uint64(l) {
+			firstBuf.WriteMove(l) // Whole block.
+			left -= uint64(l)
+		} else {
+			firstBuf.WriteMove(int(left)) // Partial block.
+			left = 0
+		}
+	}
+
+	return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the
+// write index by the number of bytes written.
+func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes())
+}
+
+// ReadToSafememWriter reads up to count bytes from v to w. It does not advance
+// the read index. It calls w.WriteFromBlocks() at most once.
+func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) {
+	if count == 0 {
+		return 0, nil
+	}
+
+	var (
+		src    safemem.BlockSeq
+		blocks []safemem.Block
+	)
+
+	firstBuf := v.data.Front()
+	if firstBuf == nil {
+		return 0, nil // No EOF.
+	}
+
+	// Is all the data in a single block?
+	if l := uint64(firstBuf.ReadSize()); l >= count {
+		src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count))
+	} else {
+		// Build a list of all the buffers.
+		count -= l
+		blocks = append(blocks, firstBuf.ReadBlock())
+		for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() {
+			block := buf.ReadBlock().TakeFirst64(count)
+			count -= uint64(block.Len())
+			blocks = append(blocks, block)
+		}
+		src = safemem.BlockSeqFromSlice(blocks)
+	}
+
+	// Perform I/O. As documented, we don't advance the read index.
+	return w.WriteFromBlocks(src)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the
+// read index by the number of bytes read, such that it's only safe to call if
+// the caller guarantees that ReadToBlocks will only be called once.
+func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes())
+}
diff --git a/pkg/buffer/safemem_test.go b/pkg/buffer/safemem_test.go
new file mode 100644
index 000000000..47f357e0c
--- /dev/null
+++ b/pkg/buffer/safemem_test.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+func TestSafemem(t *testing.T) {
+	testCases := []struct {
+		name    string
+		input   string
+		output  string
+		readLen int
+		op      func(*View)
+	}{
+		// Basic coverage.
+		{
+			name:   "short",
+			input:  "010",
+			output: "010",
+		},
+		{
+			name:   "long",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "0" + strings.Repeat("1", bufferSize) + "0",
+		},
+		{
+			name:    "short-read",
+			input:   "0",
+			readLen: 100, // > size.
+			output:  "0",
+		},
+		{
+			name:   "zero-read",
+			input:  "0",
+			output: "",
+		},
+		{
+			name:    "read-empty",
+			input:   "",
+			readLen: 1, // > size.
+			output:  "",
+		},
+
+		// Ensure offsets work.
+		{
+			name:   "offsets-short",
+			input:  "012",
+			output: "2",
+			op: func(v *View) {
+				v.TrimFront(2)
+			},
+		},
+		{
+			name:   "offsets-long0",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: strings.Repeat("1", bufferSize) + "0",
+			op: func(v *View) {
+				v.TrimFront(1)
+			},
+		},
+		{
+			name:   "offsets-long1",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: strings.Repeat("1", bufferSize-1) + "0",
+			op: func(v *View) {
+				v.TrimFront(2)
+			},
+		},
+		{
+			name:   "offsets-long2",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "10",
+			op: func(v *View) {
+				v.TrimFront(bufferSize)
+			},
+		},
+
+		// Ensure truncation works.
+		{
+			name:   "truncate-short",
+			input:  "012",
+			output: "01",
+			op: func(v *View) {
+				v.Truncate(2)
+			},
+		},
+		{
+			name:   "truncate-long0",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "0" + strings.Repeat("1", bufferSize),
+			op: func(v *View) {
+				v.Truncate(bufferSize + 1)
+			},
+		},
+		{
+			name:   "truncate-long1",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "0" + strings.Repeat("1", bufferSize-1),
+			op: func(v *View) {
+				v.Truncate(bufferSize)
+			},
+		},
+		{
+			name:   "truncate-long2",
+			input:  "0" + strings.Repeat("1", bufferSize) + "0",
+			output: "01",
+			op: func(v *View) {
+				v.Truncate(2)
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Construct the new view.
+			var view View
+			bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice([]byte(tc.input)))
+			n, err := view.WriteFromBlocks(bs)
+			if err != nil {
+				t.Errorf("expected err nil, got %v", err)
+			}
+			if n != uint64(len(tc.input)) {
+				t.Errorf("expected %d bytes, got %d", len(tc.input), n)
+			}
+
+			// Run the operation.
+			if tc.op != nil {
+				tc.op(&view)
+			}
+
+			// Read and validate.
+			readLen := tc.readLen
+			if readLen == 0 {
+				readLen = len(tc.output) // Default.
+			}
+			out := make([]byte, readLen)
+			bs = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(out))
+			n, err = view.ReadToBlocks(bs)
+			if err != nil {
+				t.Errorf("expected nil, got %v", err)
+			}
+			if n != uint64(len(tc.output)) {
+				t.Errorf("expected %d bytes, got %d", len(tc.output), n)
+			}
+
+			// Ensure the contents are correct.
+			if !bytes.Equal(out[:n], []byte(tc.output[:n])) {
+				t.Errorf("contents are wrong: expected %q, got %q", tc.output, string(out))
+			}
+		})
+	}
+}
diff --git a/pkg/buffer/view.go b/pkg/buffer/view.go
new file mode 100644
index 000000000..e6901eadb
--- /dev/null
+++ b/pkg/buffer/view.go
@@ -0,0 +1,390 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"fmt"
+	"io"
+)
+
+// View is a non-linear buffer.
+//
+// All methods are thread compatible.
+//
+// +stateify savable
+type View struct {
+	data bufferList
+	size int64
+}
+
+// TrimFront removes the first count bytes from the buffer.
+func (v *View) TrimFront(count int64) {
+	if count >= v.size {
+		v.advanceRead(v.size)
+	} else {
+		v.advanceRead(count)
+	}
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (v *View) ReadAt(p []byte, offset int64) (int, error) {
+	var (
+		skipped int64
+		done    int64
+	)
+	for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() {
+		needToSkip := int(offset - skipped)
+		if sz := buf.ReadSize(); sz <= needToSkip {
+			skipped += int64(sz)
+			continue
+		}
+
+		// Actually read data.
+		n := copy(p[done:], buf.ReadSlice()[needToSkip:])
+		skipped += int64(needToSkip)
+		done += int64(n)
+	}
+	if int(done) < len(p) || offset+done == v.size {
+		return int(done), io.EOF
+	}
+	return int(done), nil
+}
+
+// advanceRead advances the view's read index.
+//
+// Precondition: there must be sufficient bytes in the buffer.
+func (v *View) advanceRead(count int64) {
+	for buf := v.data.Front(); buf != nil && count > 0; {
+		sz := int64(buf.ReadSize())
+		if sz > count {
+			// There is still data for reading.
+			buf.ReadMove(int(count))
+			v.size -= count
+			count = 0
+			break
+		}
+
+		// Consume the whole buffer.
+		oldBuf := buf
+		buf = buf.Next() // Iterate.
+		v.data.Remove(oldBuf)
+		oldBuf.Reset()
+		bufferPool.Put(oldBuf)
+
+		// Update counts.
+		count -= sz
+		v.size -= sz
+	}
+	if count > 0 {
+		panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count))
+	}
+}
+
+// Truncate truncates the view to the given bytes.
+//
+// This will not grow the view, only shrink it. If a length is passed that is
+// greater than the current size of the view, then nothing will happen.
+//
+// Precondition: length must be >= 0.
+func (v *View) Truncate(length int64) {
+	if length < 0 {
+		panic("negative length provided")
+	}
+	if length >= v.size {
+		return // Nothing to do.
+	}
+	for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() {
+		sz := int64(buf.ReadSize())
+		if after := v.size - sz; after < length {
+			// Truncate the buffer locally.
+			left := (length - after)
+			buf.write = buf.read + int(left)
+			v.size = length
+			break
+		}
+
+		// Drop the buffer completely; see above.
+		v.data.Remove(buf)
+		buf.Reset()
+		bufferPool.Put(buf)
+		v.size -= sz
+	}
+}
+
+// Grow grows the given view to the number of bytes, which will be appended. If
+// zero is true, all these bytes will be zero. If zero is false, then this is
+// the caller's responsibility.
+//
+// Precondition: length must be >= 0.
+func (v *View) Grow(length int64, zero bool) {
+	if length < 0 {
+		panic("negative length provided")
+	}
+	for v.size < length {
+		buf := v.data.Back()
+
+		// Is there some space in the last buffer?
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Write up to length bytes.
+		sz := buf.WriteSize()
+		if int64(sz) > length-v.size {
+			sz = int(length - v.size)
+		}
+
+		// Zero the written section; note that this pattern is
+		// specifically recognized and optimized by the compiler.
+		if zero {
+			for i := buf.write; i < buf.write+sz; i++ {
+				buf.data[i] = 0
+			}
+		}
+
+		// Advance the index.
+		buf.WriteMove(sz)
+		v.size += int64(sz)
+	}
+}
+
+// Prepend prepends the given data.
+func (v *View) Prepend(data []byte) {
+	// Is there any space in the first buffer?
+	if buf := v.data.Front(); buf != nil && buf.read > 0 {
+		// Fill up before the first write.
+		avail := buf.read
+		bStart := 0
+		dStart := len(data) - avail
+		if avail > len(data) {
+			bStart = avail - len(data)
+			dStart = 0
+		}
+		n := copy(buf.data[bStart:], data[dStart:])
+		data = data[:dStart]
+		v.size += int64(n)
+		buf.read -= n
+	}
+
+	for len(data) > 0 {
+		// Do we need an empty buffer?
+		buf := bufferPool.Get().(*buffer)
+		v.data.PushFront(buf)
+
+		// The buffer is empty; copy last chunk.
+		avail := len(buf.data)
+		bStart := 0
+		dStart := len(data) - avail
+		if avail > len(data) {
+			bStart = avail - len(data)
+			dStart = 0
+		}
+
+		// We have to put the data at the end of the current
+		// buffer in order to ensure that the next prepend will
+		// correctly fill up the beginning of this buffer.
+		n := copy(buf.data[bStart:], data[dStart:])
+		data = data[:dStart]
+		v.size += int64(n)
+		buf.read = len(buf.data) - n
+		buf.write = len(buf.data)
+	}
+}
+
+// Append appends the given data.
+func (v *View) Append(data []byte) {
+	for done := 0; done < len(data); {
+		buf := v.data.Back()
+
+		// Ensure there's a buffer with space.
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Copy in to the given buffer.
+		n := copy(buf.WriteSlice(), data[done:])
+		done += n
+		buf.WriteMove(n)
+		v.size += int64(n)
+	}
+}
+
+// Flatten returns a flattened copy of this data.
+//
+// This method should not be used in any performance-sensitive paths. It may
+// allocate a fresh byte slice sufficiently large to contain all the data in
+// the buffer. This is principally for debugging.
+//
+// N.B. Tee data still belongs to this view, as if there is a single buffer
+// present, then it will be returned directly. This should be used for
+// temporary use only, and a reference to the given slice should not be held.
+func (v *View) Flatten() []byte {
+	if buf := v.data.Front(); buf == nil {
+		return nil // No data at all.
+	} else if buf.Next() == nil {
+		return buf.ReadSlice() // Only one buffer.
+	}
+	data := make([]byte, 0, v.size) // Need to flatten.
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		// Copy to the allocated slice.
+		data = append(data, buf.ReadSlice()...)
+	}
+	return data
+}
+
+// Size indicates the total amount of data available in this view.
+func (v *View) Size() int64 {
+	return v.size
+}
+
+// Copy makes a strict copy of this view.
+func (v *View) Copy() (other View) {
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		other.Append(buf.ReadSlice())
+	}
+	return
+}
+
+// Apply applies the given function across all valid data.
+func (v *View) Apply(fn func([]byte)) {
+	for buf := v.data.Front(); buf != nil; buf = buf.Next() {
+		fn(buf.ReadSlice())
+	}
+}
+
+// Merge merges the provided View with this one.
+//
+// The other view will be appended to v, and other will be empty after this
+// operation completes.
+func (v *View) Merge(other *View) {
+	// Copy over all buffers.
+	for buf := other.data.Front(); buf != nil; buf = other.data.Front() {
+		other.data.Remove(buf)
+		v.data.PushBack(buf)
+	}
+
+	// Adjust sizes.
+	v.size += other.size
+	other.size = 0
+}
+
+// WriteFromReader writes to the buffer from an io.Reader.
+//
+// A minimum read size equal to unsafe.Sizeof(unintptr) is enforced,
+// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
+func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) {
+	var (
+		done int64
+		n    int
+		err  error
+	)
+	for done < count {
+		buf := v.data.Back()
+
+		// Ensure we have an empty buffer.
+		if buf == nil || buf.Full() {
+			buf = bufferPool.Get().(*buffer)
+			v.data.PushBack(buf)
+		}
+
+		// Is this less than the minimum batch?
+		if buf.WriteSize() < minBatch && (count-done) >= int64(minBatch) {
+			tmp := make([]byte, minBatch)
+			n, err = r.Read(tmp)
+			v.Append(tmp[:n])
+			done += int64(n)
+			if err != nil {
+				break
+			}
+			continue
+		}
+
+		// Limit the read, if necessary.
+		sz := buf.WriteSize()
+		if left := count - done; int64(sz) > left {
+			sz = int(left)
+		}
+
+		// Pass the relevant portion of the buffer.
+		n, err = r.Read(buf.WriteSlice()[:sz])
+		buf.WriteMove(n)
+		done += int64(n)
+		v.size += int64(n)
+		if err == io.EOF {
+			err = nil // Short write allowed.
+			break
+		} else if err != nil {
+			break
+		}
+	}
+	return done, err
+}
+
+// ReadToWriter reads from the buffer into an io.Writer.
+//
+// N.B. This does not consume the bytes read. TrimFront should
+// be called appropriately after this call in order to do so.
+//
+// A minimum write size equal to unsafe.Sizeof(unintptr) is enforced,
+// provided that count is greater than or equal to unsafe.Sizeof(uintptr).
+func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) {
+	var (
+		done int64
+		n    int
+		err  error
+	)
+	offset := 0 // Spill-over for batching.
+	for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() {
+		// Has this been consumed? Skip it.
+		sz := buf.ReadSize()
+		if sz <= offset {
+			offset -= sz
+			continue
+		}
+		sz -= offset
+
+		// Is this less than the minimum batch?
+		left := count - done
+		if sz < minBatch && left >= int64(minBatch) && (v.size-done) >= int64(minBatch) {
+			tmp := make([]byte, minBatch)
+			n, err = v.ReadAt(tmp, done)
+			w.Write(tmp[:n])
+			done += int64(n)
+			offset = n - sz // Reset below.
+			if err != nil {
+				break
+			}
+			continue
+		}
+
+		// Limit the write if necessary.
+		if int64(sz) >= left {
+			sz = int(left)
+		}
+
+		// Perform the actual write.
+		n, err = w.Write(buf.ReadSlice()[offset : offset+sz])
+		done += int64(n)
+		if err != nil {
+			break
+		}
+
+		// Reset spill-over.
+		offset = 0
+	}
+	return done, err
+}
diff --git a/pkg/buffer/view_test.go b/pkg/buffer/view_test.go
new file mode 100644
index 000000000..3db1bc6ee
--- /dev/null
+++ b/pkg/buffer/view_test.go
@@ -0,0 +1,467 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+import (
+	"bytes"
+	"io"
+	"strings"
+	"testing"
+)
+
+func fillAppend(v *View, data []byte) {
+	v.Append(data)
+}
+
+func fillAppendEnd(v *View, data []byte) {
+	v.Grow(bufferSize-1, false)
+	v.Append(data)
+	v.TrimFront(bufferSize - 1)
+}
+
+func fillWriteFromReader(v *View, data []byte) {
+	b := bytes.NewBuffer(data)
+	v.WriteFromReader(b, int64(len(data)))
+}
+
+func fillWriteFromReaderEnd(v *View, data []byte) {
+	v.Grow(bufferSize-1, false)
+	b := bytes.NewBuffer(data)
+	v.WriteFromReader(b, int64(len(data)))
+	v.TrimFront(bufferSize - 1)
+}
+
+var fillFuncs = map[string]func(*View, []byte){
+	"append":             fillAppend,
+	"appendEnd":          fillAppendEnd,
+	"writeFromReader":    fillWriteFromReader,
+	"writeFromReaderEnd": fillWriteFromReaderEnd,
+}
+
+func testReadAt(t *testing.T, v *View, offset int64, n int, wantStr string, wantErr error) {
+	t.Helper()
+	d := make([]byte, n)
+	n, err := v.ReadAt(d, offset)
+	if n != len(wantStr) {
+		t.Errorf("got %d, want %d", n, len(wantStr))
+	}
+	if err != wantErr {
+		t.Errorf("got err %v, want %v", err, wantErr)
+	}
+	if !bytes.Equal(d[:n], []byte(wantStr)) {
+		t.Errorf("got %q, want %q", string(d[:n]), wantStr)
+	}
+}
+
+func TestView(t *testing.T) {
+	testCases := []struct {
+		name   string
+		input  string
+		output string
+		op     func(*testing.T, *View)
+	}{
+		// Preconditions.
+		{
+			name:   "truncate-check",
+			input:  "hello",
+			output: "hello", // Not touched.
+			op: func(t *testing.T, v *View) {
+				defer func() {
+					if r := recover(); r == nil {
+						t.Errorf("Truncate(-1) did not panic")
+					}
+				}()
+				v.Truncate(-1)
+			},
+		},
+		{
+			name:   "grow-check",
+			input:  "hello",
+			output: "hello", // Not touched.
+			op: func(t *testing.T, v *View) {
+				defer func() {
+					if r := recover(); r == nil {
+						t.Errorf("Grow(-1) did not panic")
+					}
+				}()
+				v.Grow(-1, false)
+			},
+		},
+		{
+			name:   "advance-check",
+			input:  "hello",
+			output: "", // Consumed.
+			op: func(t *testing.T, v *View) {
+				defer func() {
+					if r := recover(); r == nil {
+						t.Errorf("advanceRead(Size()+1) did not panic")
+					}
+				}()
+				v.advanceRead(v.Size() + 1)
+			},
+		},
+
+		// Prepend.
+		{
+			name:   "prepend",
+			input:  "world",
+			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				v.Prepend([]byte("hello "))
+			},
+		},
+		{
+			name:   "prepend-backfill-full",
+			input:  "hello world",
+			output: "jello world",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(1)
+				v.Prepend([]byte("j"))
+			},
+		},
+		{
+			name:   "prepend-backfill-under",
+			input:  "hello world",
+			output: "hola world",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(5)
+				v.Prepend([]byte("hola"))
+			},
+		},
+		{
+			name:   "prepend-backfill-over",
+			input:  "hello world",
+			output: "smello world",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(1)
+				v.Prepend([]byte("sm"))
+			},
+		},
+		{
+			name:   "prepend-fill",
+			input:  strings.Repeat("1", bufferSize-1),
+			output: "0" + strings.Repeat("1", bufferSize-1),
+			op: func(t *testing.T, v *View) {
+				v.Prepend([]byte("0"))
+			},
+		},
+		{
+			name:   "prepend-overflow",
+			input:  strings.Repeat("1", bufferSize),
+			output: "0" + strings.Repeat("1", bufferSize),
+			op: func(t *testing.T, v *View) {
+				v.Prepend([]byte("0"))
+			},
+		},
+		{
+			name:   "prepend-multiple-buffers",
+			input:  strings.Repeat("1", bufferSize-1),
+			output: strings.Repeat("0", bufferSize*3) + strings.Repeat("1", bufferSize-1),
+			op: func(t *testing.T, v *View) {
+				v.Prepend([]byte(strings.Repeat("0", bufferSize*3)))
+			},
+		},
+
+		// Append and write.
+		{
+			name:   "append",
+			input:  "hello",
+			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				v.Append([]byte(" world"))
+			},
+		},
+		{
+			name:   "append-fill",
+			input:  strings.Repeat("1", bufferSize-1),
+			output: strings.Repeat("1", bufferSize-1) + "0",
+			op: func(t *testing.T, v *View) {
+				v.Append([]byte("0"))
+			},
+		},
+		{
+			name:   "append-overflow",
+			input:  strings.Repeat("1", bufferSize),
+			output: strings.Repeat("1", bufferSize) + "0",
+			op: func(t *testing.T, v *View) {
+				v.Append([]byte("0"))
+			},
+		},
+		{
+			name:   "append-multiple-buffers",
+			input:  strings.Repeat("1", bufferSize-1),
+			output: strings.Repeat("1", bufferSize-1) + strings.Repeat("0", bufferSize*3),
+			op: func(t *testing.T, v *View) {
+				v.Append([]byte(strings.Repeat("0", bufferSize*3)))
+			},
+		},
+
+		// Truncate.
+		{
+			name:   "truncate",
+			input:  "hello world",
+			output: "hello",
+			op: func(t *testing.T, v *View) {
+				v.Truncate(5)
+			},
+		},
+		{
+			name:   "truncate-noop",
+			input:  "hello world",
+			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				v.Truncate(v.Size() + 1)
+			},
+		},
+		{
+			name:   "truncate-multiple-buffers",
+			input:  strings.Repeat("1", bufferSize*2),
+			output: strings.Repeat("1", bufferSize*2-1),
+			op: func(t *testing.T, v *View) {
+				v.Truncate(bufferSize*2 - 1)
+			},
+		},
+		{
+			name:   "truncate-multiple-buffers-to-one",
+			input:  strings.Repeat("1", bufferSize*2),
+			output: "11111",
+			op: func(t *testing.T, v *View) {
+				v.Truncate(5)
+			},
+		},
+
+		// TrimFront.
+		{
+			name:   "trim",
+			input:  "hello world",
+			output: "world",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(6)
+			},
+		},
+		{
+			name:   "trim-too-large",
+			input:  "hello world",
+			output: "",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(v.Size() + 1)
+			},
+		},
+		{
+			name:   "trim-multiple-buffers",
+			input:  strings.Repeat("1", bufferSize*2),
+			output: strings.Repeat("1", bufferSize*2-1),
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(1)
+			},
+		},
+		{
+			name:   "trim-multiple-buffers-to-one-buffer",
+			input:  strings.Repeat("1", bufferSize*2),
+			output: "1",
+			op: func(t *testing.T, v *View) {
+				v.TrimFront(bufferSize*2 - 1)
+			},
+		},
+
+		// Grow.
+		{
+			name:   "grow",
+			input:  "hello world",
+			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				v.Grow(1, true)
+			},
+		},
+		{
+			name:   "grow-from-zero",
+			output: strings.Repeat("\x00", 1024),
+			op: func(t *testing.T, v *View) {
+				v.Grow(1024, true)
+			},
+		},
+		{
+			name:   "grow-from-non-zero",
+			input:  strings.Repeat("1", bufferSize),
+			output: strings.Repeat("1", bufferSize) + strings.Repeat("\x00", bufferSize),
+			op: func(t *testing.T, v *View) {
+				v.Grow(bufferSize*2, true)
+			},
+		},
+
+		// Copy.
+		{
+			name:   "copy",
+			input:  "hello",
+			output: "hello",
+			op: func(t *testing.T, v *View) {
+				other := v.Copy()
+				bs := other.Flatten()
+				want := []byte("hello")
+				if !bytes.Equal(bs, want) {
+					t.Errorf("expected %v, got %v", want, bs)
+				}
+			},
+		},
+		{
+			name:   "copy-large",
+			input:  strings.Repeat("1", bufferSize+1),
+			output: strings.Repeat("1", bufferSize+1),
+			op: func(t *testing.T, v *View) {
+				other := v.Copy()
+				bs := other.Flatten()
+				want := []byte(strings.Repeat("1", bufferSize+1))
+				if !bytes.Equal(bs, want) {
+					t.Errorf("expected %v, got %v", want, bs)
+				}
+			},
+		},
+
+		// Merge.
+		{
+			name:   "merge",
+			input:  "hello",
+			output: "hello world",
+			op: func(t *testing.T, v *View) {
+				var other View
+				other.Append([]byte(" world"))
+				v.Merge(&other)
+				if sz := other.Size(); sz != 0 {
+					t.Errorf("expected 0, got %d", sz)
+				}
+			},
+		},
+		{
+			name:   "merge-large",
+			input:  strings.Repeat("1", bufferSize+1),
+			output: strings.Repeat("1", bufferSize+1) + strings.Repeat("0", bufferSize+1),
+			op: func(t *testing.T, v *View) {
+				var other View
+				other.Append([]byte(strings.Repeat("0", bufferSize+1)))
+				v.Merge(&other)
+				if sz := other.Size(); sz != 0 {
+					t.Errorf("expected 0, got %d", sz)
+				}
+			},
+		},
+
+		// ReadAt.
+		{
+			name:   "readat",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 0, 6, "hello", io.EOF) },
+		},
+		{
+			name:   "readat-long",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 0, 8, "hello", io.EOF) },
+		},
+		{
+			name:   "readat-short",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 0, 3, "hel", nil) },
+		},
+		{
+			name:   "readat-offset",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 2, 3, "llo", io.EOF) },
+		},
+		{
+			name:   "readat-long-offset",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 2, 8, "llo", io.EOF) },
+		},
+		{
+			name:   "readat-short-offset",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, 2, 2, "ll", nil) },
+		},
+		{
+			name:   "readat-skip-all",
+			input:  "hello",
+			output: "hello",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, bufferSize+1, 1, "", io.EOF) },
+		},
+		{
+			name:   "readat-second-buffer",
+			input:  strings.Repeat("0", bufferSize+1) + "12",
+			output: strings.Repeat("0", bufferSize+1) + "12",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, bufferSize+1, 1, "1", nil) },
+		},
+		{
+			name:   "readat-second-buffer-end",
+			input:  strings.Repeat("0", bufferSize+1) + "12",
+			output: strings.Repeat("0", bufferSize+1) + "12",
+			op:     func(t *testing.T, v *View) { testReadAt(t, v, bufferSize+1, 2, "12", io.EOF) },
+		},
+	}
+
+	for _, tc := range testCases {
+		for fillName, fn := range fillFuncs {
+			t.Run(fillName+"/"+tc.name, func(t *testing.T) {
+				// Construct & fill the view.
+				var view View
+				fn(&view, []byte(tc.input))
+
+				// Run the operation.
+				if tc.op != nil {
+					tc.op(t, &view)
+				}
+
+				// Flatten and validate.
+				out := view.Flatten()
+				if !bytes.Equal([]byte(tc.output), out) {
+					t.Errorf("expected %q, got %q", tc.output, string(out))
+				}
+
+				// Ensure the size is correct.
+				if len(out) != int(view.Size()) {
+					t.Errorf("size is wrong: expected %d, got %d", len(out), view.Size())
+				}
+
+				// Calculate contents via apply.
+				var appliedOut []byte
+				view.Apply(func(b []byte) {
+					appliedOut = append(appliedOut, b...)
+				})
+				if len(appliedOut) != len(out) {
+					t.Errorf("expected %d, got %d", len(out), len(appliedOut))
+				}
+				if !bytes.Equal(appliedOut, out) {
+					t.Errorf("expected %v, got %v", out, appliedOut)
+				}
+
+				// Calculate contents via ReadToWriter.
+				var b bytes.Buffer
+				n, err := view.ReadToWriter(&b, int64(len(out)))
+				if n != int64(len(out)) {
+					t.Errorf("expected %d, got %d", len(out), n)
+				}
+				if err != nil {
+					t.Errorf("expected nil, got %v", err)
+				}
+				if !bytes.Equal(b.Bytes(), out) {
+					t.Errorf("expected %v, got %v", out, b.Bytes())
+				}
+			})
+		}
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/buffer/view_unsafe.go
index c134714ee..d1ef39b26 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/buffer/view_unsafe.go
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package vfs2
+package buffer
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+	"unsafe"
 )
 
-// Override syscall table to add syscalls implementations from this package.
-func Override(table map[uintptr]kernel.Syscall) {
-	table[0] = syscalls.Supported("read", Read)
-}
+// minBatch is the smallest Read or Write operation that the
+// WriteFromReader and ReadToWriter functions will use.
+//
+// This is defined as the size of a native pointer.
+const minBatch = int(unsafe.Sizeof(uintptr(0)))
diff --git a/pkg/cleanup/BUILD b/pkg/cleanup/BUILD
new file mode 100644
index 000000000..5c34b9872
--- /dev/null
+++ b/pkg/cleanup/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "cleanup",
+    srcs = ["cleanup.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+    ],
+)
+
+go_test(
+    name = "cleanup_test",
+    srcs = ["cleanup_test.go"],
+    library = ":cleanup",
+)
diff --git a/pkg/cleanup/cleanup.go b/pkg/cleanup/cleanup.go
new file mode 100644
index 000000000..14a05f076
--- /dev/null
+++ b/pkg/cleanup/cleanup.go
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cleanup provides utilities to clean "stuff" on defers.
+package cleanup
+
+// Cleanup allows defers to be aborted when cleanup needs to happen
+// conditionally. Usage:
+// 	 cu := cleanup.Make(func() { f.Close() })
+// 	 defer cu.Clean() // failure before release is called will close the file.
+// 	 ...
+//   cu.Add(func() { f2.Close() })  // Adds another cleanup function
+//   ...
+// 	 cu.Release() // on success, aborts closing the file.
+// 	 return f
+type Cleanup struct {
+	cleaners []func()
+}
+
+// Make creates a new Cleanup object.
+func Make(f func()) Cleanup {
+	return Cleanup{cleaners: []func(){f}}
+}
+
+// Add adds a new function to be called on Clean().
+func (c *Cleanup) Add(f func()) {
+	c.cleaners = append(c.cleaners, f)
+}
+
+// Clean calls all cleanup functions in reverse order.
+func (c *Cleanup) Clean() {
+	clean(c.cleaners)
+	c.cleaners = nil
+}
+
+// Release releases the cleanup from its duties, i.e. cleanup functions are not
+// called after this point. Returns a function that calls all registered
+// functions in case the caller has use for them.
+func (c *Cleanup) Release() func() {
+	old := c.cleaners
+	c.cleaners = nil
+	return func() { clean(old) }
+}
+
+func clean(cleaners []func()) {
+	for i := len(cleaners) - 1; i >= 0; i-- {
+		cleaners[i]()
+	}
+}
diff --git a/pkg/cleanup/cleanup_test.go b/pkg/cleanup/cleanup_test.go
new file mode 100644
index 000000000..ab3d9ed95
--- /dev/null
+++ b/pkg/cleanup/cleanup_test.go
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cleanup
+
+import "testing"
+
+func testCleanupHelper(clean, cleanAdd *bool, release bool) func() {
+	cu := Make(func() {
+		*clean = true
+	})
+	cu.Add(func() {
+		*cleanAdd = true
+	})
+	defer cu.Clean()
+	if release {
+		return cu.Release()
+	}
+	return nil
+}
+
+func TestCleanup(t *testing.T) {
+	clean := false
+	cleanAdd := false
+	testCleanupHelper(&clean, &cleanAdd, false)
+	if !clean {
+		t.Fatalf("cleanup function was not called.")
+	}
+	if !cleanAdd {
+		t.Fatalf("added cleanup function was not called.")
+	}
+}
+
+func TestRelease(t *testing.T) {
+	clean := false
+	cleanAdd := false
+	cleaner := testCleanupHelper(&clean, &cleanAdd, true)
+
+	// Check that clean was not called after release.
+	if clean {
+		t.Fatalf("cleanup function was called.")
+	}
+	if cleanAdd {
+		t.Fatalf("added cleanup function was called.")
+	}
+
+	// Call the cleaner function and check that both cleanup functions are called.
+	cleaner()
+	if !clean {
+		t.Fatalf("cleanup function was not called.")
+	}
+	if !cleanAdd {
+		t.Fatalf("added cleanup function was not called.")
+	}
+}
diff --git a/pkg/context/context.go b/pkg/context/context.go
index 23e009ef3..5319b6d8d 100644
--- a/pkg/context/context.go
+++ b/pkg/context/context.go
@@ -127,10 +127,6 @@ func (logContext) Value(key interface{}) interface{} {
 var bgContext = &logContext{Logger: log.Log()}
 
 // Background returns an empty context using the default logger.
-//
-// Users should be wary of using a Background context. Please tag any use with
-// FIXME(b/38173783) and a note to remove this use.
-//
 // Generally, one should use the Task as their context when available, or avoid
 // having to use a context in places where a Task is unavailable.
 //
diff --git a/pkg/cpuid/cpuid_parse_x86_test.go b/pkg/cpuid/cpuid_parse_x86_test.go
index d48418e69..c9bd40e1b 100644
--- a/pkg/cpuid/cpuid_parse_x86_test.go
+++ b/pkg/cpuid/cpuid_parse_x86_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package cpuid
 
diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go
index 333ca0a04..562f8f405 100644
--- a/pkg/cpuid/cpuid_x86.go
+++ b/pkg/cpuid/cpuid_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package cpuid
 
@@ -235,7 +235,9 @@ const (
 	X86FeaturePERFCTR_TSC
 	X86FeaturePERFCTR_LLC
 	X86FeatureMWAITX
-	// ECX[31:30] are reserved.
+	// TODO(b/152776797): Some CPUs set this but it is not documented anywhere.
+	X86FeatureBlock5Bit30
+	_ // ecx bit 31 is reserved.
 )
 
 // Block 6 constants are the extended feature bits in
@@ -438,6 +440,9 @@ var x86FeatureParseOnlyStrings = map[Feature]string{
 
 	// Block 3.
 	X86FeaturePREFETCHWT1: "prefetchwt1",
+
+	// Block 5.
+	X86FeatureBlock5Bit30: "block5_bit30",
 }
 
 // intelCacheDescriptors describe the caches and TLBs on the system. They are
@@ -725,6 +730,18 @@ func vendorIDFromRegs(bx, cx, dx uint32) string {
 	return string(bytes)
 }
 
+var maxXsaveSize = func() uint32 {
+	// Leaf 0 of xsaveinfo function returns the size for currently
+	// enabled xsave features in ebx, the maximum size if all valid
+	// features are saved with xsave in ecx, and valid XCR0 bits in
+	// edx:eax.
+	//
+	// If xSaveInfo isn't supported, cpuid will not fault but will
+	// return bogus values.
+	_, _, maxXsaveSize, _ := HostID(uint32(xSaveInfo), 0)
+	return maxXsaveSize
+}()
+
 // ExtendedStateSize returns the number of bytes needed to save the "extended
 // state" for this processor and the boundary it must be aligned to. Extended
 // state includes floating point registers, and other cpu state that's not
@@ -736,12 +753,7 @@ func vendorIDFromRegs(bx, cx, dx uint32) string {
 // about 2.5K worst case, with avx512).
 func (fs *FeatureSet) ExtendedStateSize() (size, align uint) {
 	if fs.UseXsave() {
-		// Leaf 0 of xsaveinfo function returns the size for currently
-		// enabled xsave features in ebx, the maximum size if all valid
-		// features are saved with xsave in ecx, and valid XCR0 bits in
-		// edx:eax.
-		_, _, maxSize, _ := HostID(uint32(xSaveInfo), 0)
-		return uint(maxSize), 64
+		return uint(maxXsaveSize), 64
 	}
 
 	// If we don't support xsave, we fall back to fxsave, which requires
diff --git a/pkg/cpuid/cpuid_x86_test.go b/pkg/cpuid/cpuid_x86_test.go
index 0fe20c213..bacf345c8 100644
--- a/pkg/cpuid/cpuid_x86_test.go
+++ b/pkg/cpuid/cpuid_x86_test.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package cpuid
 
diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go
index 7f41b4a27..43750360b 100644
--- a/pkg/eventchannel/event_test.go
+++ b/pkg/eventchannel/event_test.go
@@ -78,7 +78,7 @@ func TestMultiEmitter(t *testing.T) {
 	for _, name := range names {
 		m := testMessage{name: name}
 		if _, err := me.Emit(m); err != nil {
-			t.Fatal("me.Emit(%v) failed: %v", m, err)
+			t.Fatalf("me.Emit(%v) failed: %v", m, err)
 		}
 	}
 
@@ -96,7 +96,7 @@ func TestMultiEmitter(t *testing.T) {
 
 	// Close multiEmitter.
 	if err := me.Close(); err != nil {
-		t.Fatal("me.Close() failed: %v", err)
+		t.Fatalf("me.Close() failed: %v", err)
 	}
 
 	// All testEmitters should be closed.
diff --git a/pkg/flipcall/packet_window_allocator.go b/pkg/flipcall/packet_window_allocator.go
index ccb918fab..af9cc3d21 100644
--- a/pkg/flipcall/packet_window_allocator.go
+++ b/pkg/flipcall/packet_window_allocator.go
@@ -134,7 +134,7 @@ func (pwa *PacketWindowAllocator) Allocate(size int) (PacketWindowDescriptor, er
 	start := pwa.nextAlloc
 	pwa.nextAlloc = end
 	return PacketWindowDescriptor{
-		FD:     pwa.fd,
+		FD:     pwa.FD(),
 		Offset: start,
 		Length: size,
 	}, nil
@@ -158,7 +158,7 @@ func (pwa *PacketWindowAllocator) ensureFileSize(min int64) error {
 		}
 		newSize = newNewSize
 	}
-	if err := syscall.Ftruncate(pwa.fd, newSize); err != nil {
+	if err := syscall.Ftruncate(pwa.FD(), newSize); err != nil {
 		return fmt.Errorf("ftruncate failed: %v", err)
 	}
 	pwa.fileSize = newSize
diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index ee84471b2..67dd1e225 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -8,9 +8,11 @@ go_library(
     name = "fspath",
     srcs = [
         "builder.go",
-        "builder_unsafe.go",
         "fspath.go",
     ],
+    deps = [
+        "//pkg/gohacks",
+    ],
 )
 
 go_test(
diff --git a/pkg/fspath/builder.go b/pkg/fspath/builder.go
index 7ddb36826..6318d3874 100644
--- a/pkg/fspath/builder.go
+++ b/pkg/fspath/builder.go
@@ -16,6 +16,8 @@ package fspath
 
 import (
 	"fmt"
+
+	"gvisor.dev/gvisor/pkg/gohacks"
 )
 
 // Builder is similar to strings.Builder, but is used to produce pathnames
@@ -102,3 +104,9 @@ func (b *Builder) AppendString(str string) {
 	copy(b.buf[b.start:], b.buf[oldStart:])
 	copy(b.buf[len(b.buf)-len(str):], str)
 }
+
+// String returns the accumulated string. No other methods should be called
+// after String.
+func (b *Builder) String() string {
+	return gohacks.StringFromImmutableBytes(b.buf[b.start:])
+}
diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go
index 9fb3fee24..4c983d5fd 100644
--- a/pkg/fspath/fspath.go
+++ b/pkg/fspath/fspath.go
@@ -67,7 +67,8 @@ func Parse(pathname string) Path {
 
 // Path contains the information contained in a pathname string.
 //
-// Path is copyable by value.
+// Path is copyable by value. The zero value for Path is equivalent to
+// fspath.Parse(""), i.e. the empty path.
 type Path struct {
 	// Begin is an iterator to the first path component in the relative part of
 	// the path.
diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go
index 850693df8..316015e06 100644
--- a/pkg/gate/gate_test.go
+++ b/pkg/gate/gate_test.go
@@ -15,6 +15,7 @@
 package gate_test
 
 import (
+	"runtime"
 	"testing"
 	"time"
 
@@ -165,6 +166,8 @@ func worker(g *gate.Gate, done *sync.WaitGroup) {
 		if !g.Enter() {
 			break
 		}
+		// Golang before v1.14 doesn't preempt busyloops.
+		runtime.Gosched()
 		g.Leave()
 	}
 	done.Done()
diff --git a/pkg/gohacks/BUILD b/pkg/gohacks/BUILD
new file mode 100644
index 000000000..798a65eca
--- /dev/null
+++ b/pkg/gohacks/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "gohacks",
+    srcs = [
+        "gohacks_unsafe.go",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/pkg/gohacks/gohacks_unsafe.go b/pkg/gohacks/gohacks_unsafe.go
new file mode 100644
index 000000000..aad675172
--- /dev/null
+++ b/pkg/gohacks/gohacks_unsafe.go
@@ -0,0 +1,57 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gohacks contains utilities for subverting the Go compiler.
+package gohacks
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+// Noescape hides a pointer from escape analysis. Noescape is the identity
+// function but escape analysis doesn't think the output depends on the input.
+// Noescape is inlined and currently compiles down to zero instructions.
+// USE CAREFULLY!
+//
+// (Noescape is copy/pasted from Go's runtime/stubs.go:noescape().)
+//
+//go:nosplit
+func Noescape(p unsafe.Pointer) unsafe.Pointer {
+	x := uintptr(p)
+	return unsafe.Pointer(x ^ 0)
+}
+
+// ImmutableBytesFromString is equivalent to []byte(s), except that it uses the
+// same memory backing s instead of making a heap-allocated copy. This is only
+// valid if the returned slice is never mutated.
+func ImmutableBytesFromString(s string) []byte {
+	shdr := (*reflect.StringHeader)(unsafe.Pointer(&s))
+	var bs []byte
+	bshdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
+	bshdr.Data = shdr.Data
+	bshdr.Len = shdr.Len
+	bshdr.Cap = shdr.Len
+	return bs
+}
+
+// StringFromImmutableBytes is equivalent to string(bs), except that it uses
+// the same memory backing bs instead of making a heap-allocated copy. This is
+// only valid if bs is never mutated after StringFromImmutableBytes returns.
+func StringFromImmutableBytes(bs []byte) string {
+	// This is cheaper than messing with reflect.StringHeader and
+	// reflect.SliceHeader, which as of this writing produces many dead stores
+	// of zeroes. Compare strings.Builder.String().
+	return *(*string)(unsafe.Pointer(&bs))
+}
diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD
index ea8d2422c..7a82631c5 100644
--- a/pkg/goid/BUILD
+++ b/pkg/goid/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "goid.go",
         "goid_amd64.s",
+        "goid_arm64.s",
         "goid_race.go",
         "goid_unsafe.go",
     ],
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/goid/goid_arm64.s
index 19ee36081..a7465b75d 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64.go
+++ b/pkg/goid/goid_arm64.s
@@ -12,5 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package vfs2 provides syscall implementations that use VFS2.
-package vfs2
+#include "textflag.h"
+
+// func getg() *g
+TEXT ·getg(SB),NOSPLIT,$0-8
+        MOVD g, R0      // g
+        MOVD R0, ret+0(FP)
+        RET
diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go
index 019caadca..0d07da3b1 100644
--- a/pkg/ilist/list.go
+++ b/pkg/ilist/list.go
@@ -86,11 +86,21 @@ func (l *List) Back() Element {
 	return l.tail
 }
 
+// Len returns the number of elements in the list.
+//
+// NOTE: This is an O(n) operation.
+func (l *List) Len() (count int) {
+	for e := l.Front(); e != nil; e = e.Next() {
+		count++
+	}
+	return count
+}
+
 // PushFront inserts the element e at the front of list l.
 func (l *List) PushFront(e Element) {
-	ElementMapper{}.linkerFor(e).SetNext(l.head)
-	ElementMapper{}.linkerFor(e).SetPrev(nil)
-
+	linker := ElementMapper{}.linkerFor(e)
+	linker.SetNext(l.head)
+	linker.SetPrev(nil)
 	if l.head != nil {
 		ElementMapper{}.linkerFor(l.head).SetPrev(e)
 	} else {
@@ -102,9 +112,9 @@ func (l *List) PushFront(e Element) {
 
 // PushBack inserts the element e at the back of list l.
 func (l *List) PushBack(e Element) {
-	ElementMapper{}.linkerFor(e).SetNext(nil)
-	ElementMapper{}.linkerFor(e).SetPrev(l.tail)
-
+	linker := ElementMapper{}.linkerFor(e)
+	linker.SetNext(nil)
+	linker.SetPrev(l.tail)
 	if l.tail != nil {
 		ElementMapper{}.linkerFor(l.tail).SetNext(e)
 	} else {
@@ -125,17 +135,20 @@ func (l *List) PushBackList(m *List) {
 
 		l.tail = m.tail
 	}
-
 	m.head = nil
 	m.tail = nil
 }
 
 // InsertAfter inserts e after b.
 func (l *List) InsertAfter(b, e Element) {
-	a := ElementMapper{}.linkerFor(b).Next()
-	ElementMapper{}.linkerFor(e).SetNext(a)
-	ElementMapper{}.linkerFor(e).SetPrev(b)
-	ElementMapper{}.linkerFor(b).SetNext(e)
+	bLinker := ElementMapper{}.linkerFor(b)
+	eLinker := ElementMapper{}.linkerFor(e)
+
+	a := bLinker.Next()
+
+	eLinker.SetNext(a)
+	eLinker.SetPrev(b)
+	bLinker.SetNext(e)
 
 	if a != nil {
 		ElementMapper{}.linkerFor(a).SetPrev(e)
@@ -146,10 +159,13 @@ func (l *List) InsertAfter(b, e Element) {
 
 // InsertBefore inserts e before a.
 func (l *List) InsertBefore(a, e Element) {
-	b := ElementMapper{}.linkerFor(a).Prev()
-	ElementMapper{}.linkerFor(e).SetNext(a)
-	ElementMapper{}.linkerFor(e).SetPrev(b)
-	ElementMapper{}.linkerFor(a).SetPrev(e)
+	aLinker := ElementMapper{}.linkerFor(a)
+	eLinker := ElementMapper{}.linkerFor(e)
+
+	b := aLinker.Prev()
+	eLinker.SetNext(a)
+	eLinker.SetPrev(b)
+	aLinker.SetPrev(e)
 
 	if b != nil {
 		ElementMapper{}.linkerFor(b).SetNext(e)
@@ -160,8 +176,9 @@ func (l *List) InsertBefore(a, e Element) {
 
 // Remove removes e from l.
 func (l *List) Remove(e Element) {
-	prev := ElementMapper{}.linkerFor(e).Prev()
-	next := ElementMapper{}.linkerFor(e).Next()
+	linker := ElementMapper{}.linkerFor(e)
+	prev := linker.Prev()
+	next := linker.Next()
 
 	if prev != nil {
 		ElementMapper{}.linkerFor(prev).SetNext(next)
@@ -174,6 +191,9 @@ func (l *List) Remove(e Element) {
 	} else {
 		l.tail = prev
 	}
+
+	linker.SetNext(nil)
+	linker.SetPrev(nil)
 }
 
 // Entry is a default implementation of Linker. Users can add anonymous fields
diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD
index 41bf104d0..f84d03700 100644
--- a/pkg/linewriter/BUILD
+++ b/pkg/linewriter/BUILD
@@ -5,6 +5,8 @@ package(licenses = ["notice"])
 go_library(
     name = "linewriter",
     srcs = ["linewriter.go"],
+    marshal = False,
+    stateify = False,
     visibility = ["//visibility:public"],
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/log/BUILD b/pkg/log/BUILD
index a7c8f7bef..3ed6aba5c 100644
--- a/pkg/log/BUILD
+++ b/pkg/log/BUILD
@@ -10,6 +10,8 @@ go_library(
         "json_k8s.go",
         "log.go",
     ],
+    marshal = False,
+    stateify = False,
     visibility = [
         "//visibility:public",
     ],
diff --git a/pkg/log/glog.go b/pkg/log/glog.go
index cab5fae55..f57c4427b 100644
--- a/pkg/log/glog.go
+++ b/pkg/log/glog.go
@@ -25,7 +25,7 @@ import (
 // GoogleEmitter is a wrapper that emits logs in a format compatible with
 // package github.com/golang/glog.
 type GoogleEmitter struct {
-	Writer
+	*Writer
 }
 
 // pid is used for the threadid component of the header.
@@ -46,7 +46,7 @@ var pid = os.Getpid()
 //   line             The line number
 //   msg              The user-supplied message
 //
-func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, args ...interface{}) {
+func (g GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) {
 	// Log level.
 	prefix := byte('?')
 	switch level {
@@ -64,9 +64,7 @@ func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, ar
 	microsecond := int(timestamp.Nanosecond() / 1000)
 
 	// 0 = this frame.
-	// 1 = Debugf, etc.
-	// 2 = Caller.
-	_, file, line, ok := runtime.Caller(2)
+	_, file, line, ok := runtime.Caller(depth + 1)
 	if ok {
 		// Trim any directory path from the file.
 		slash := strings.LastIndexByte(file, byte('/'))
@@ -83,5 +81,5 @@ func (g *GoogleEmitter) Emit(level Level, timestamp time.Time, format string, ar
 	message := fmt.Sprintf(format, args...)
 
 	// Emit the formatted result.
-	fmt.Fprintf(&g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message)
+	fmt.Fprintf(g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message)
 }
diff --git a/pkg/log/json.go b/pkg/log/json.go
index a278c8fc8..bdf9d691e 100644
--- a/pkg/log/json.go
+++ b/pkg/log/json.go
@@ -58,11 +58,11 @@ func (lv *Level) UnmarshalJSON(b []byte) error {
 
 // JSONEmitter logs messages in json format.
 type JSONEmitter struct {
-	Writer
+	*Writer
 }
 
 // Emit implements Emitter.Emit.
-func (e JSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (e JSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	j := jsonLog{
 		Msg:   fmt.Sprintf(format, v...),
 		Level: level,
diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go
index cee6eb514..5883e95e1 100644
--- a/pkg/log/json_k8s.go
+++ b/pkg/log/json_k8s.go
@@ -29,11 +29,11 @@ type k8sJSONLog struct {
 // K8sJSONEmitter logs messages in json format that is compatible with
 // Kubernetes fluent configuration.
 type K8sJSONEmitter struct {
-	Writer
+	*Writer
 }
 
 // Emit implements Emitter.Emit.
-func (e *K8sJSONEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (e K8sJSONEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	j := k8sJSONLog{
 		Log:   fmt.Sprintf(format, v...),
 		Level: level,
diff --git a/pkg/log/log.go b/pkg/log/log.go
index 5056f17e6..37e0605ad 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -79,7 +79,7 @@ func (l Level) String() string {
 type Emitter interface {
 	// Emit emits the given log statement. This allows for control over the
 	// timestamp used for logging.
-	Emit(level Level, timestamp time.Time, format string, v ...interface{})
+	Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{})
 }
 
 // Writer writes the output to the given writer.
@@ -142,7 +142,7 @@ func (l *Writer) Write(data []byte) (int, error) {
 }
 
 // Emit emits the message.
-func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...interface{}) {
+func (l *Writer) Emit(_ int, _ Level, _ time.Time, format string, args ...interface{}) {
 	fmt.Fprintf(l, format, args...)
 }
 
@@ -150,9 +150,9 @@ func (l *Writer) Emit(level Level, timestamp time.Time, format string, args ...i
 type MultiEmitter []Emitter
 
 // Emit emits to all emitters.
-func (m *MultiEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (m *MultiEmitter) Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	for _, e := range *m {
-		e.Emit(level, timestamp, format, v...)
+		e.Emit(1+depth, level, timestamp, format, v...)
 	}
 }
 
@@ -167,7 +167,7 @@ type TestEmitter struct {
 }
 
 // Emit emits to the TestLogger.
-func (t *TestEmitter) Emit(level Level, timestamp time.Time, format string, v ...interface{}) {
+func (t *TestEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
 	t.Logf(format, v...)
 }
 
@@ -198,22 +198,37 @@ type BasicLogger struct {
 
 // Debugf implements logger.Debugf.
 func (l *BasicLogger) Debugf(format string, v ...interface{}) {
-	if l.IsLogging(Debug) {
-		l.Emit(Debug, time.Now(), format, v...)
-	}
+	l.DebugfAtDepth(1, format, v...)
 }
 
 // Infof implements logger.Infof.
 func (l *BasicLogger) Infof(format string, v ...interface{}) {
-	if l.IsLogging(Info) {
-		l.Emit(Info, time.Now(), format, v...)
-	}
+	l.InfofAtDepth(1, format, v...)
 }
 
 // Warningf implements logger.Warningf.
 func (l *BasicLogger) Warningf(format string, v ...interface{}) {
+	l.WarningfAtDepth(1, format, v...)
+}
+
+// DebugfAtDepth logs at a specific depth.
+func (l *BasicLogger) DebugfAtDepth(depth int, format string, v ...interface{}) {
+	if l.IsLogging(Debug) {
+		l.Emit(1+depth, Debug, time.Now(), format, v...)
+	}
+}
+
+// InfofAtDepth logs at a specific depth.
+func (l *BasicLogger) InfofAtDepth(depth int, format string, v ...interface{}) {
+	if l.IsLogging(Info) {
+		l.Emit(1+depth, Info, time.Now(), format, v...)
+	}
+}
+
+// WarningfAtDepth logs at a specific depth.
+func (l *BasicLogger) WarningfAtDepth(depth int, format string, v ...interface{}) {
 	if l.IsLogging(Warning) {
-		l.Emit(Warning, time.Now(), format, v...)
+		l.Emit(1+depth, Warning, time.Now(), format, v...)
 	}
 }
 
@@ -257,17 +272,32 @@ func SetLevel(newLevel Level) {
 
 // Debugf logs to the global logger.
 func Debugf(format string, v ...interface{}) {
-	Log().Debugf(format, v...)
+	Log().DebugfAtDepth(1, format, v...)
 }
 
 // Infof logs to the global logger.
 func Infof(format string, v ...interface{}) {
-	Log().Infof(format, v...)
+	Log().InfofAtDepth(1, format, v...)
 }
 
 // Warningf logs to the global logger.
 func Warningf(format string, v ...interface{}) {
-	Log().Warningf(format, v...)
+	Log().WarningfAtDepth(1, format, v...)
+}
+
+// DebugfAtDepth logs to the global logger.
+func DebugfAtDepth(depth int, format string, v ...interface{}) {
+	Log().DebugfAtDepth(1+depth, format, v...)
+}
+
+// InfofAtDepth logs to the global logger.
+func InfofAtDepth(depth int, format string, v ...interface{}) {
+	Log().InfofAtDepth(1+depth, format, v...)
+}
+
+// WarningfAtDepth logs to the global logger.
+func WarningfAtDepth(depth int, format string, v ...interface{}) {
+	Log().WarningfAtDepth(1+depth, format, v...)
 }
 
 // defaultStackSize is the default buffer size to allocate for stack traces.
@@ -344,5 +374,5 @@ func CopyStandardLogTo(l Level) error {
 
 func init() {
 	// Store the initial value for the log.
-	log.Store(&BasicLogger{Level: Info, Emitter: &GoogleEmitter{Writer{Next: os.Stderr}}})
+	log.Store(&BasicLogger{Level: Info, Emitter: GoogleEmitter{&Writer{Next: os.Stderr}}})
 }
diff --git a/pkg/log/log_test.go b/pkg/log/log_test.go
index 402cc29ae..9ff18559b 100644
--- a/pkg/log/log_test.go
+++ b/pkg/log/log_test.go
@@ -52,7 +52,7 @@ func TestDropMessages(t *testing.T) {
 		t.Fatalf("Write should have failed")
 	}
 
-	fmt.Printf("writer: %+v\n", w)
+	fmt.Printf("writer: %#v\n", &w)
 
 	tw.fail = false
 	if _, err := w.Write([]byte("line 2\n")); err != nil {
@@ -76,7 +76,7 @@ func TestDropMessages(t *testing.T) {
 
 func TestCaller(t *testing.T) {
 	tw := &testWriter{}
-	e := &GoogleEmitter{Writer: Writer{Next: tw}}
+	e := GoogleEmitter{Writer: &Writer{Next: tw}}
 	bl := &BasicLogger{
 		Emitter: e,
 		Level:   Debug,
@@ -94,7 +94,7 @@ func BenchmarkGoogleLogging(b *testing.B) {
 	tw := &testWriter{
 		limit: 1, // Only record one message.
 	}
-	e := &GoogleEmitter{Writer: Writer{Next: tw}}
+	e := GoogleEmitter{Writer: &Writer{Next: tw}}
 	bl := &BasicLogger{
 		Emitter: e,
 		Level:   Debug,
diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 006fcd9ab..64aa365ce 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -39,16 +39,11 @@ var (
 // Uint64Metric encapsulates a uint64 that represents some kind of metric to be
 // monitored.
 //
-// All metrics must be cumulative, meaning that their values will only increase
-// over time.
-//
 // Metrics are not saved across save/restore and thus reset to zero on restore.
 //
-// TODO(b/67298402): Support non-cumulative metrics.
 // TODO(b/67298427): Support metric fields.
 type Uint64Metric struct {
-	// value is the actual value of the metric. It must be accessed
-	// atomically.
+	// value is the actual value of the metric. It must be accessed atomically.
 	value uint64
 }
 
@@ -110,13 +105,10 @@ type customUint64Metric struct {
 // Register must only be called at init and will return and error if called
 // after Initialized.
 //
-// All metrics must be cumulative, meaning that the return values of value must
-// only increase over time.
-//
 // Preconditions:
 //  * name must be globally unique.
 //  * Initialize/Disable have not been called.
-func RegisterCustomUint64Metric(name string, sync bool, description string, value func() uint64) error {
+func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func() uint64) error {
 	if initialized {
 		return ErrInitializationDone
 	}
@@ -129,9 +121,10 @@ func RegisterCustomUint64Metric(name string, sync bool, description string, valu
 		metadata: &pb.MetricMetadata{
 			Name:        name,
 			Description: description,
-			Cumulative:  true,
+			Cumulative:  cumulative,
 			Sync:        sync,
-			Type:        pb.MetricMetadata_UINT64,
+			Type:        pb.MetricMetadata_TYPE_UINT64,
+			Units:       units,
 		},
 		value: value,
 	}
@@ -140,24 +133,32 @@ func RegisterCustomUint64Metric(name string, sync bool, description string, valu
 
 // MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric and panics
 // if it returns an error.
-func MustRegisterCustomUint64Metric(name string, sync bool, description string, value func() uint64) {
-	if err := RegisterCustomUint64Metric(name, sync, description, value); err != nil {
+func MustRegisterCustomUint64Metric(name string, cumulative, sync bool, description string, value func() uint64) {
+	if err := RegisterCustomUint64Metric(name, cumulative, sync, pb.MetricMetadata_UNITS_NONE, description, value); err != nil {
 		panic(fmt.Sprintf("Unable to register metric %q: %v", name, err))
 	}
 }
 
-// NewUint64Metric creates and registers a new metric with the given name.
+// NewUint64Metric creates and registers a new cumulative metric with the given name.
 //
 // Metrics must be statically defined (i.e., at init).
-func NewUint64Metric(name string, sync bool, description string) (*Uint64Metric, error) {
+func NewUint64Metric(name string, sync bool, units pb.MetricMetadata_Units, description string) (*Uint64Metric, error) {
 	var m Uint64Metric
-	return &m, RegisterCustomUint64Metric(name, sync, description, m.Value)
+	return &m, RegisterCustomUint64Metric(name, true /* cumulative */, sync, units, description, m.Value)
 }
 
-// MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns an
-// error.
+// MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns an error.
 func MustCreateNewUint64Metric(name string, sync bool, description string) *Uint64Metric {
-	m, err := NewUint64Metric(name, sync, description)
+	m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NONE, description)
+	if err != nil {
+		panic(fmt.Sprintf("Unable to create metric %q: %v", name, err))
+	}
+	return m
+}
+
+// MustCreateNewUint64NanosecondsMetric calls NewUint64Metric and panics if it returns an error.
+func MustCreateNewUint64NanosecondsMetric(name string, sync bool, description string) *Uint64Metric {
+	m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NANOSECONDS, description)
 	if err != nil {
 		panic(fmt.Sprintf("Unable to create metric %q: %v", name, err))
 	}
@@ -244,6 +245,6 @@ func EmitMetricUpdate() {
 		return
 	}
 
-	log.Debugf("Emitting metrics: %v", m)
+	log.Debugf("Emitting metrics: %v", &m)
 	eventchannel.Emit(&m)
 }
diff --git a/pkg/metric/metric.proto b/pkg/metric/metric.proto
index a2c2bd1ba..3cc89047d 100644
--- a/pkg/metric/metric.proto
+++ b/pkg/metric/metric.proto
@@ -36,10 +36,18 @@ message MetricMetadata {
   // the monitoring system.
   bool sync = 4;
 
-  enum Type { UINT64 = 0; }
+  enum Type { TYPE_UINT64 = 0; }
 
   // type is the type of the metric value.
   Type type = 5;
+
+  enum Units {
+    UNITS_NONE = 0;
+    UNITS_NANOSECONDS = 1;
+  }
+
+  // units is the units of the metric value.
+  Units units = 6;
 }
 
 // MetricRegistration contains the metadata for all metrics that will be in
diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go
index 34969385a..c425ea532 100644
--- a/pkg/metric/metric_test.go
+++ b/pkg/metric/metric_test.go
@@ -66,12 +66,12 @@ const (
 func TestInitialize(t *testing.T) {
 	defer reset()
 
-	_, err := NewUint64Metric("/foo", false, fooDescription)
+	_, err := NewUint64Metric("/foo", false, pb.MetricMetadata_UNITS_NONE, fooDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
 
-	_, err = NewUint64Metric("/bar", true, barDescription)
+	_, err = NewUint64Metric("/bar", true, pb.MetricMetadata_UNITS_NANOSECONDS, barDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
@@ -94,8 +94,8 @@ func TestInitialize(t *testing.T) {
 	foundFoo := false
 	foundBar := false
 	for _, m := range mr.Metrics {
-		if m.Type != pb.MetricMetadata_UINT64 {
-			t.Errorf("Metadata %+v Type got %v want %v", m, m.Type, pb.MetricMetadata_UINT64)
+		if m.Type != pb.MetricMetadata_TYPE_UINT64 {
+			t.Errorf("Metadata %+v Type got %v want %v", m, m.Type, pb.MetricMetadata_TYPE_UINT64)
 		}
 		if !m.Cumulative {
 			t.Errorf("Metadata %+v Cumulative got false want true", m)
@@ -110,6 +110,9 @@ func TestInitialize(t *testing.T) {
 			if m.Sync {
 				t.Errorf("/foo %+v Sync got true want false", m)
 			}
+			if m.Units != pb.MetricMetadata_UNITS_NONE {
+				t.Errorf("/foo %+v Units got %v want %v", m, m.Units, pb.MetricMetadata_UNITS_NONE)
+			}
 		case "/bar":
 			foundBar = true
 			if m.Description != barDescription {
@@ -118,6 +121,9 @@ func TestInitialize(t *testing.T) {
 			if !m.Sync {
 				t.Errorf("/bar %+v Sync got true want false", m)
 			}
+			if m.Units != pb.MetricMetadata_UNITS_NANOSECONDS {
+				t.Errorf("/bar %+v Units got %v want %v", m, m.Units, pb.MetricMetadata_UNITS_NANOSECONDS)
+			}
 		}
 	}
 
@@ -132,12 +138,12 @@ func TestInitialize(t *testing.T) {
 func TestDisable(t *testing.T) {
 	defer reset()
 
-	_, err := NewUint64Metric("/foo", false, fooDescription)
+	_, err := NewUint64Metric("/foo", false, pb.MetricMetadata_UNITS_NONE, fooDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
 
-	_, err = NewUint64Metric("/bar", true, barDescription)
+	_, err = NewUint64Metric("/bar", true, pb.MetricMetadata_UNITS_NONE, barDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
@@ -161,12 +167,12 @@ func TestDisable(t *testing.T) {
 func TestEmitMetricUpdate(t *testing.T) {
 	defer reset()
 
-	foo, err := NewUint64Metric("/foo", false, fooDescription)
+	foo, err := NewUint64Metric("/foo", false, pb.MetricMetadata_UNITS_NONE, fooDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
 
-	_, err = NewUint64Metric("/bar", true, barDescription)
+	_, err = NewUint64Metric("/bar", true, pb.MetricMetadata_UNITS_NONE, barDescription)
 	if err != nil {
 		t.Fatalf("NewUint64Metric got err %v want nil", err)
 	}
diff --git a/pkg/p9/buffer.go b/pkg/p9/buffer.go
index 249536d8a..6a4951821 100644
--- a/pkg/p9/buffer.go
+++ b/pkg/p9/buffer.go
@@ -20,16 +20,16 @@ import (
 
 // encoder is used for messages and 9P primitives.
 type encoder interface {
-	// Decode decodes from the given buffer. Decode may be called more than once
+	// decode decodes from the given buffer. decode may be called more than once
 	// to reuse the instance. It must clear any previous state.
 	//
 	// This may not fail, exhaustion will be recorded in the buffer.
-	Decode(b *buffer)
+	decode(b *buffer)
 
-	// Encode encodes to the given buffer.
+	// encode encodes to the given buffer.
 	//
 	// This may not fail.
-	Encode(b *buffer)
+	encode(b *buffer)
 }
 
 // order is the byte order used for encoding.
@@ -39,7 +39,7 @@ var order = binary.LittleEndian
 //
 // This is passed to the encoder methods.
 type buffer struct {
-	// data is the underlying data. This may grow during Encode.
+	// data is the underlying data. This may grow during encode.
 	data []byte
 
 	// overflow indicates whether an overflow has occurred.
diff --git a/pkg/p9/client.go b/pkg/p9/client.go
index a6f493b82..71e944c30 100644
--- a/pkg/p9/client.go
+++ b/pkg/p9/client.go
@@ -174,7 +174,7 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client
 		// our sendRecv function to use that functionality.  Otherwise,
 		// we stick to sendRecvLegacy.
 		rversion := Rversion{}
-		err := c.sendRecvLegacy(&Tversion{
+		_, err := c.sendRecvLegacy(&Tversion{
 			Version: versionString(requested),
 			MSize:   messageSize,
 		}, &rversion)
@@ -219,11 +219,11 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client
 			c.sendRecv = c.sendRecvChannel
 		} else {
 			// Channel setup failed; fallback.
-			c.sendRecv = c.sendRecvLegacy
+			c.sendRecv = c.sendRecvLegacySyscallErr
 		}
 	} else {
 		// No channels available: use the legacy mechanism.
-		c.sendRecv = c.sendRecvLegacy
+		c.sendRecv = c.sendRecvLegacySyscallErr
 	}
 
 	// Ensure that the socket and channels are closed when the socket is shut
@@ -305,7 +305,7 @@ func (c *Client) openChannel(id int) error {
 	)
 
 	// Open the data channel.
-	if err := c.sendRecvLegacy(&Tchannel{
+	if _, err := c.sendRecvLegacy(&Tchannel{
 		ID:      uint32(id),
 		Control: 0,
 	}, &rchannel0); err != nil {
@@ -319,7 +319,7 @@ func (c *Client) openChannel(id int) error {
 	defer rchannel0.FilePayload().Close()
 
 	// Open the channel for file descriptors.
-	if err := c.sendRecvLegacy(&Tchannel{
+	if _, err := c.sendRecvLegacy(&Tchannel{
 		ID:      uint32(id),
 		Control: 1,
 	}, &rchannel1); err != nil {
@@ -431,13 +431,28 @@ func (c *Client) waitAndRecv(done chan error) error {
 	}
 }
 
+// sendRecvLegacySyscallErr is a wrapper for sendRecvLegacy that converts all
+// non-syscall errors to EIO.
+func (c *Client) sendRecvLegacySyscallErr(t message, r message) error {
+	received, err := c.sendRecvLegacy(t, r)
+	if !received {
+		log.Warningf("p9.Client.sendRecvChannel: %v", err)
+		return syscall.EIO
+	}
+	return err
+}
+
 // sendRecvLegacy performs a roundtrip message exchange.
 //
+// sendRecvLegacy returns true if a message was received. This allows us to
+// differentiate between failed receives and successful receives where the
+// response was an error message.
+//
 // This is called by internal functions.
-func (c *Client) sendRecvLegacy(t message, r message) error {
+func (c *Client) sendRecvLegacy(t message, r message) (bool, error) {
 	tag, ok := c.tagPool.Get()
 	if !ok {
-		return ErrOutOfTags
+		return false, ErrOutOfTags
 	}
 	defer c.tagPool.Put(tag)
 
@@ -457,12 +472,12 @@ func (c *Client) sendRecvLegacy(t message, r message) error {
 	err := send(c.socket, Tag(tag), t)
 	c.sendMu.Unlock()
 	if err != nil {
-		return err
+		return false, err
 	}
 
 	// Co-ordinate with other receivers.
 	if err := c.waitAndRecv(resp.done); err != nil {
-		return err
+		return false, err
 	}
 
 	// Is it an error message?
@@ -470,14 +485,14 @@ func (c *Client) sendRecvLegacy(t message, r message) error {
 	// For convenience, we transform these directly
 	// into errors. Handlers need not handle this case.
 	if rlerr, ok := resp.r.(*Rlerror); ok {
-		return syscall.Errno(rlerr.Error)
+		return true, syscall.Errno(rlerr.Error)
 	}
 
 	// At this point, we know it matches.
 	//
 	// Per recv call above, we will only allow a type
 	// match (and give our r) or an instance of Rlerror.
-	return nil
+	return true, nil
 }
 
 // sendRecvChannel uses channels to send a message.
@@ -486,7 +501,7 @@ func (c *Client) sendRecvChannel(t message, r message) error {
 	c.channelsMu.Lock()
 	if len(c.availableChannels) == 0 {
 		c.channelsMu.Unlock()
-		return c.sendRecvLegacy(t, r)
+		return c.sendRecvLegacySyscallErr(t, r)
 	}
 	idx := len(c.availableChannels) - 1
 	ch := c.availableChannels[idx]
@@ -526,7 +541,11 @@ func (c *Client) sendRecvChannel(t message, r message) error {
 	}
 
 	// Parse the server's response.
-	_, retErr := ch.recv(r, rsz)
+	resp, retErr := ch.recv(r, rsz)
+	if resp == nil {
+		log.Warningf("p9.Client.sendRecvChannel: p9.channel.recv: %v", retErr)
+		retErr = syscall.EIO
+	}
 
 	// Release the channel.
 	c.channelsMu.Lock()
diff --git a/pkg/p9/client_test.go b/pkg/p9/client_test.go
index 29a0afadf..c757583e0 100644
--- a/pkg/p9/client_test.go
+++ b/pkg/p9/client_test.go
@@ -96,7 +96,12 @@ func benchmarkSendRecv(b *testing.B, fn func(c *Client) func(message, message) e
 }
 
 func BenchmarkSendRecvLegacy(b *testing.B) {
-	benchmarkSendRecv(b, func(c *Client) func(message, message) error { return c.sendRecvLegacy })
+	benchmarkSendRecv(b, func(c *Client) func(message, message) error {
+		return func(t message, r message) error {
+			_, err := c.sendRecvLegacy(t, r)
+			return err
+		}
+	})
 }
 
 func BenchmarkSendRecvChannel(b *testing.B) {
diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index d4ffbc8e3..cab35896f 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -97,12 +97,12 @@ type File interface {
 	// free to ignore the hint entirely (i.e. the value returned may be larger
 	// than size). All size checking is done independently at the syscall layer.
 	//
-	// TODO(b/127675828): Determine concurrency guarantees once implemented.
+	// On the server, GetXattr has a read concurrency guarantee.
 	GetXattr(name string, size uint64) (string, error)
 
 	// SetXattr sets extended attributes on this node.
 	//
-	// TODO(b/127675828): Determine concurrency guarantees once implemented.
+	// On the server, SetXattr has a write concurrency guarantee.
 	SetXattr(name, value string, flags uint32) error
 
 	// ListXattr lists the names of the extended attributes on this node.
@@ -113,12 +113,12 @@ type File interface {
 	// free to ignore the hint entirely (i.e. the value returned may be larger
 	// than size). All size checking is done independently at the syscall layer.
 	//
-	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	// On the server, ListXattr has a read concurrency guarantee.
 	ListXattr(size uint64) (map[string]struct{}, error)
 
 	// RemoveXattr removes extended attributes on this node.
 	//
-	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	// On the server, RemoveXattr has a write concurrency guarantee.
 	RemoveXattr(name string) error
 
 	// Allocate allows the caller to directly manipulate the allocated disk space
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 2ac45eb80..1db5797dd 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -48,6 +48,8 @@ func ExtractErrno(err error) syscall.Errno {
 		return ExtractErrno(e.Err)
 	case *os.SyscallError:
 		return ExtractErrno(e.Err)
+	case *os.LinkError:
+		return ExtractErrno(e.Err)
 	}
 
 	// Default case.
@@ -920,8 +922,15 @@ func (t *Tgetxattr) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	val, err := ref.file.GetXattr(t.Name, t.Size)
-	if err != nil {
+	var val string
+	if err := ref.safelyRead(func() (err error) {
+		// Don't allow getxattr on files that have been deleted.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+		val, err = ref.file.GetXattr(t.Name, t.Size)
+		return err
+	}); err != nil {
 		return newErr(err)
 	}
 	return &Rgetxattr{Value: val}
@@ -935,7 +944,13 @@ func (t *Tsetxattr) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	if err := ref.file.SetXattr(t.Name, t.Value, t.Flags); err != nil {
+	if err := ref.safelyWrite(func() error {
+		// Don't allow setxattr on files that have been deleted.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+		return ref.file.SetXattr(t.Name, t.Value, t.Flags)
+	}); err != nil {
 		return newErr(err)
 	}
 	return &Rsetxattr{}
@@ -949,10 +964,18 @@ func (t *Tlistxattr) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	xattrs, err := ref.file.ListXattr(t.Size)
-	if err != nil {
+	var xattrs map[string]struct{}
+	if err := ref.safelyRead(func() (err error) {
+		// Don't allow listxattr on files that have been deleted.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+		xattrs, err = ref.file.ListXattr(t.Size)
+		return err
+	}); err != nil {
 		return newErr(err)
 	}
+
 	xattrList := make([]string, 0, len(xattrs))
 	for x := range xattrs {
 		xattrList = append(xattrList, x)
@@ -968,7 +991,13 @@ func (t *Tremovexattr) handle(cs *connState) message {
 	}
 	defer ref.DecRef()
 
-	if err := ref.file.RemoveXattr(t.Name); err != nil {
+	if err := ref.safelyWrite(func() error {
+		// Don't allow removexattr on files that have been deleted.
+		if ref.isDeleted() {
+			return syscall.EINVAL
+		}
+		return ref.file.RemoveXattr(t.Name)
+	}); err != nil {
 		return newErr(err)
 	}
 	return &Rremovexattr{}
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index b1cede5f5..57b89ad7d 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -51,7 +51,7 @@ type payloader interface {
 	// SetPayload returns the decoded message.
 	//
 	// This is going to be total message size - FixedSize. But this should
-	// be validated during Decode, which will be called after SetPayload.
+	// be validated during decode, which will be called after SetPayload.
 	SetPayload([]byte)
 }
 
@@ -90,14 +90,14 @@ type Tversion struct {
 	Version string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tversion) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tversion) decode(b *buffer) {
 	t.MSize = b.Read32()
 	t.Version = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tversion) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tversion) encode(b *buffer) {
 	b.Write32(t.MSize)
 	b.WriteString(t.Version)
 }
@@ -121,14 +121,14 @@ type Rversion struct {
 	Version string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rversion) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rversion) decode(b *buffer) {
 	r.MSize = b.Read32()
 	r.Version = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rversion) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rversion) encode(b *buffer) {
 	b.Write32(r.MSize)
 	b.WriteString(r.Version)
 }
@@ -149,13 +149,13 @@ type Tflush struct {
 	OldTag Tag
 }
 
-// Decode implements encoder.Decode.
-func (t *Tflush) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tflush) decode(b *buffer) {
 	t.OldTag = b.ReadTag()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tflush) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tflush) encode(b *buffer) {
 	b.WriteTag(t.OldTag)
 }
 
@@ -173,12 +173,12 @@ func (t *Tflush) String() string {
 type Rflush struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rflush) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rflush) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rflush) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rflush) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -203,8 +203,8 @@ type Twalk struct {
 	Names []string
 }
 
-// Decode implements encoder.Decode.
-func (t *Twalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twalk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	n := b.Read16()
@@ -214,8 +214,8 @@ func (t *Twalk) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (t *Twalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Twalk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.Write16(uint16(len(t.Names)))
@@ -240,22 +240,22 @@ type Rwalk struct {
 	QIDs []QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rwalk) decode(b *buffer) {
 	n := b.Read16()
 	r.QIDs = r.QIDs[:0]
 	for i := 0; i < int(n); i++ {
 		var q QID
-		q.Decode(b)
+		q.decode(b)
 		r.QIDs = append(r.QIDs, q)
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rwalk) encode(b *buffer) {
 	b.Write16(uint16(len(r.QIDs)))
 	for _, q := range r.QIDs {
-		q.Encode(b)
+		q.encode(b)
 	}
 }
 
@@ -275,13 +275,13 @@ type Tclunk struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tclunk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tclunk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tclunk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tclunk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -299,12 +299,12 @@ func (t *Tclunk) String() string {
 type Rclunk struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rclunk) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rclunk) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rclunk) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rclunk) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -325,13 +325,13 @@ type Tremove struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tremove) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tremove) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tremove) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tremove) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -349,12 +349,12 @@ func (t *Tremove) String() string {
 type Rremove struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rremove) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rremove) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rremove) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rremove) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -374,13 +374,13 @@ type Rlerror struct {
 	Error uint32
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlerror) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rlerror) decode(b *buffer) {
 	r.Error = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlerror) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rlerror) encode(b *buffer) {
 	b.Write32(r.Error)
 }
 
@@ -409,16 +409,16 @@ type Tauth struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tauth) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tauth) decode(b *buffer) {
 	t.AuthenticationFID = b.ReadFID()
 	t.UserName = b.ReadString()
 	t.AttachName = b.ReadString()
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tauth) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tauth) encode(b *buffer) {
 	b.WriteFID(t.AuthenticationFID)
 	b.WriteString(t.UserName)
 	b.WriteString(t.AttachName)
@@ -437,7 +437,7 @@ func (t *Tauth) String() string {
 
 // Rauth is an authentication response.
 //
-// Encode, Decode and Length are inherited directly from QID.
+// encode and decode are inherited directly from QID.
 type Rauth struct {
 	QID
 }
@@ -463,16 +463,16 @@ type Tattach struct {
 	Auth Tauth
 }
 
-// Decode implements encoder.Decode.
-func (t *Tattach) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tattach) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Auth.Decode(b)
+	t.Auth.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tattach) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tattach) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Auth.Encode(b)
+	t.Auth.encode(b)
 }
 
 // Type implements message.Type.
@@ -509,14 +509,14 @@ type Tlopen struct {
 	Flags OpenFlags
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlopen) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlopen) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Flags = b.ReadOpenFlags()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlopen) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlopen) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteOpenFlags(t.Flags)
 }
@@ -542,15 +542,15 @@ type Rlopen struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlopen) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rlopen) decode(b *buffer) {
+	r.QID.decode(b)
 	r.IoUnit = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlopen) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rlopen) encode(b *buffer) {
+	r.QID.encode(b)
 	b.Write32(r.IoUnit)
 }
 
@@ -587,8 +587,8 @@ type Tlcreate struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlcreate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlcreate) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.OpenFlags = b.ReadOpenFlags()
@@ -596,8 +596,8 @@ func (t *Tlcreate) Decode(b *buffer) {
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlcreate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlcreate) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.WriteOpenFlags(t.OpenFlags)
@@ -617,7 +617,7 @@ func (t *Tlcreate) String() string {
 
 // Rlcreate is a create response.
 //
-// The Encode, Decode, etc. methods are inherited from Rlopen.
+// The encode, decode, etc. methods are inherited from Rlopen.
 type Rlcreate struct {
 	Rlopen
 }
@@ -647,16 +647,16 @@ type Tsymlink struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsymlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsymlink) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Target = b.ReadString()
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsymlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsymlink) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WriteString(t.Target)
@@ -679,14 +679,14 @@ type Rsymlink struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rsymlink) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rsymlink) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rsymlink) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rsymlink) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -711,15 +711,15 @@ type Tlink struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlink) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Target = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlink) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteFID(t.Target)
 	b.WriteString(t.Name)
@@ -744,12 +744,12 @@ func (*Rlink) Type() MsgType {
 	return MsgRlink
 }
 
-// Decode implements encoder.Decode.
-func (*Rlink) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rlink) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rlink) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rlink) encode(*buffer) {
 }
 
 // String implements fmt.Stringer.
@@ -772,16 +772,16 @@ type Trenameat struct {
 	NewName string
 }
 
-// Decode implements encoder.Decode.
-func (t *Trenameat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Trenameat) decode(b *buffer) {
 	t.OldDirectory = b.ReadFID()
 	t.OldName = b.ReadString()
 	t.NewDirectory = b.ReadFID()
 	t.NewName = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Trenameat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Trenameat) encode(b *buffer) {
 	b.WriteFID(t.OldDirectory)
 	b.WriteString(t.OldName)
 	b.WriteFID(t.NewDirectory)
@@ -802,12 +802,12 @@ func (t *Trenameat) String() string {
 type Rrenameat struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rrenameat) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rrenameat) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rrenameat) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rrenameat) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -832,15 +832,15 @@ type Tunlinkat struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tunlinkat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tunlinkat) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tunlinkat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tunlinkat) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.Write32(t.Flags)
@@ -860,12 +860,12 @@ func (t *Tunlinkat) String() string {
 type Runlinkat struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Runlinkat) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Runlinkat) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Runlinkat) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Runlinkat) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -893,15 +893,15 @@ type Trename struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Trename) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Trename) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Trename) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Trename) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
@@ -921,12 +921,12 @@ func (t *Trename) String() string {
 type Rrename struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rrename) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rrename) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rrename) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rrename) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -945,13 +945,13 @@ type Treadlink struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Treadlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Treadlink) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Treadlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Treadlink) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -971,13 +971,13 @@ type Rreadlink struct {
 	Target string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rreadlink) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rreadlink) decode(b *buffer) {
 	r.Target = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rreadlink) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rreadlink) encode(b *buffer) {
 	b.WriteString(r.Target)
 }
 
@@ -1003,15 +1003,15 @@ type Tread struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tread) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tread) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Offset = b.Read64()
 	t.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tread) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tread) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Offset)
 	b.Write32(t.Count)
@@ -1033,20 +1033,20 @@ type Rread struct {
 	Data []byte
 }
 
-// Decode implements encoder.Decode.
+// decode implements encoder.decode.
 //
 // Data is automatically decoded via Payload.
-func (r *Rread) Decode(b *buffer) {
+func (r *Rread) decode(b *buffer) {
 	count := b.Read32()
 	if count != uint32(len(r.Data)) {
 		b.markOverrun()
 	}
 }
 
-// Encode implements encoder.Encode.
+// encode implements encoder.encode.
 //
 // Data is automatically encoded via Payload.
-func (r *Rread) Encode(b *buffer) {
+func (r *Rread) encode(b *buffer) {
 	b.Write32(uint32(len(r.Data)))
 }
 
@@ -1087,8 +1087,8 @@ type Twrite struct {
 	Data []byte
 }
 
-// Decode implements encoder.Decode.
-func (t *Twrite) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twrite) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Offset = b.Read64()
 	count := b.Read32()
@@ -1097,10 +1097,10 @@ func (t *Twrite) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
+// encode implements encoder.encode.
 //
 // This uses the buffer payload to avoid a copy.
-func (t *Twrite) Encode(b *buffer) {
+func (t *Twrite) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Offset)
 	b.Write32(uint32(len(t.Data)))
@@ -1137,13 +1137,13 @@ type Rwrite struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwrite) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rwrite) decode(b *buffer) {
 	r.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwrite) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rwrite) encode(b *buffer) {
 	b.Write32(r.Count)
 }
 
@@ -1178,8 +1178,8 @@ type Tmknod struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tmknod) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tmknod) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Mode = b.ReadFileMode()
@@ -1188,8 +1188,8 @@ func (t *Tmknod) Decode(b *buffer) {
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tmknod) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tmknod) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WriteFileMode(t.Mode)
@@ -1214,14 +1214,14 @@ type Rmknod struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rmknod) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rmknod) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rmknod) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rmknod) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -1249,16 +1249,16 @@ type Tmkdir struct {
 	GID GID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tmkdir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tmkdir) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Permissions = b.ReadPermissions()
 	t.GID = b.ReadGID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tmkdir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tmkdir) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.WriteString(t.Name)
 	b.WritePermissions(t.Permissions)
@@ -1281,14 +1281,14 @@ type Rmkdir struct {
 	QID QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rmkdir) Decode(b *buffer) {
-	r.QID.Decode(b)
+// decode implements encoder.decode.
+func (r *Rmkdir) decode(b *buffer) {
+	r.QID.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rmkdir) Encode(b *buffer) {
-	r.QID.Encode(b)
+// encode implements encoder.encode.
+func (r *Rmkdir) encode(b *buffer) {
+	r.QID.encode(b)
 }
 
 // Type implements message.Type.
@@ -1310,16 +1310,16 @@ type Tgetattr struct {
 	AttrMask AttrMask
 }
 
-// Decode implements encoder.Decode.
-func (t *Tgetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tgetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.AttrMask.Decode(b)
+	t.AttrMask.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tgetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tgetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.AttrMask.Encode(b)
+	t.AttrMask.encode(b)
 }
 
 // Type implements message.Type.
@@ -1344,18 +1344,18 @@ type Rgetattr struct {
 	Attr Attr
 }
 
-// Decode implements encoder.Decode.
-func (r *Rgetattr) Decode(b *buffer) {
-	r.Valid.Decode(b)
-	r.QID.Decode(b)
-	r.Attr.Decode(b)
+// decode implements encoder.decode.
+func (r *Rgetattr) decode(b *buffer) {
+	r.Valid.decode(b)
+	r.QID.decode(b)
+	r.Attr.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rgetattr) Encode(b *buffer) {
-	r.Valid.Encode(b)
-	r.QID.Encode(b)
-	r.Attr.Encode(b)
+// encode implements encoder.encode.
+func (r *Rgetattr) encode(b *buffer) {
+	r.Valid.encode(b)
+	r.QID.encode(b)
+	r.Attr.encode(b)
 }
 
 // Type implements message.Type.
@@ -1380,18 +1380,18 @@ type Tsetattr struct {
 	SetAttr SetAttr
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Valid.Decode(b)
-	t.SetAttr.Decode(b)
+	t.Valid.decode(b)
+	t.SetAttr.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Valid.Encode(b)
-	t.SetAttr.Encode(b)
+	t.Valid.encode(b)
+	t.SetAttr.encode(b)
 }
 
 // Type implements message.Type.
@@ -1408,12 +1408,12 @@ func (t *Tsetattr) String() string {
 type Rsetattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rsetattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rsetattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rsetattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rsetattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1435,18 +1435,18 @@ type Tallocate struct {
 	Length uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tallocate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tallocate) decode(b *buffer) {
 	t.FID = b.ReadFID()
-	t.Mode.Decode(b)
+	t.Mode.decode(b)
 	t.Offset = b.Read64()
 	t.Length = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tallocate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tallocate) encode(b *buffer) {
 	b.WriteFID(t.FID)
-	t.Mode.Encode(b)
+	t.Mode.encode(b)
 	b.Write64(t.Offset)
 	b.Write64(t.Length)
 }
@@ -1465,12 +1465,12 @@ func (t *Tallocate) String() string {
 type Rallocate struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rallocate) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rallocate) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rallocate) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rallocate) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1492,14 +1492,14 @@ type Tlistxattr struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlistxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlistxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlistxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlistxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.Write64(t.Size)
 }
@@ -1520,8 +1520,8 @@ type Rlistxattr struct {
 	Xattrs []string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlistxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rlistxattr) decode(b *buffer) {
 	n := b.Read16()
 	r.Xattrs = r.Xattrs[:0]
 	for i := 0; i < int(n); i++ {
@@ -1529,8 +1529,8 @@ func (r *Rlistxattr) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rlistxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rlistxattr) encode(b *buffer) {
 	b.Write16(uint16(len(r.Xattrs)))
 	for _, x := range r.Xattrs {
 		b.WriteString(x)
@@ -1559,15 +1559,15 @@ type Txattrwalk struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Txattrwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Txattrwalk) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Txattrwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Txattrwalk) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.WriteString(t.Name)
@@ -1589,13 +1589,13 @@ type Rxattrwalk struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (r *Rxattrwalk) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rxattrwalk) decode(b *buffer) {
 	r.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rxattrwalk) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rxattrwalk) encode(b *buffer) {
 	b.Write64(r.Size)
 }
 
@@ -1627,16 +1627,16 @@ type Txattrcreate struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Txattrcreate) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Txattrcreate) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.AttrSize = b.Read64()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Txattrcreate) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Txattrcreate) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.Write64(t.AttrSize)
@@ -1657,12 +1657,12 @@ func (t *Txattrcreate) String() string {
 type Rxattrcreate struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rxattrcreate) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rxattrcreate) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rxattrcreate) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rxattrcreate) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1687,15 +1687,15 @@ type Tgetxattr struct {
 	Size uint64
 }
 
-// Decode implements encoder.Decode.
-func (t *Tgetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tgetxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Size = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tgetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tgetxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.Write64(t.Size)
@@ -1717,13 +1717,13 @@ type Rgetxattr struct {
 	Value string
 }
 
-// Decode implements encoder.Decode.
-func (r *Rgetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rgetxattr) decode(b *buffer) {
 	r.Value = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rgetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rgetxattr) encode(b *buffer) {
 	b.WriteString(r.Value)
 }
 
@@ -1752,16 +1752,16 @@ type Tsetxattr struct {
 	Flags uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tsetxattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tsetxattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 	t.Value = b.ReadString()
 	t.Flags = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tsetxattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tsetxattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 	b.WriteString(t.Value)
@@ -1782,12 +1782,12 @@ func (t *Tsetxattr) String() string {
 type Rsetxattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rsetxattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rsetxattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rsetxattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rsetxattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1809,14 +1809,14 @@ type Tremovexattr struct {
 	Name string
 }
 
-// Decode implements encoder.Decode.
-func (t *Tremovexattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tremovexattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tremovexattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tremovexattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteString(t.Name)
 }
@@ -1835,12 +1835,12 @@ func (t *Tremovexattr) String() string {
 type Rremovexattr struct {
 }
 
-// Decode implements encoder.Decode.
-func (r *Rremovexattr) Decode(*buffer) {
+// decode implements encoder.decode.
+func (r *Rremovexattr) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (r *Rremovexattr) Encode(*buffer) {
+// encode implements encoder.encode.
+func (r *Rremovexattr) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1865,15 +1865,15 @@ type Treaddir struct {
 	Count uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Treaddir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Treaddir) decode(b *buffer) {
 	t.Directory = b.ReadFID()
 	t.Offset = b.Read64()
 	t.Count = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Treaddir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Treaddir) encode(b *buffer) {
 	b.WriteFID(t.Directory)
 	b.Write64(t.Offset)
 	b.Write32(t.Count)
@@ -1907,14 +1907,14 @@ type Rreaddir struct {
 	payload []byte
 }
 
-// Decode implements encoder.Decode.
-func (r *Rreaddir) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rreaddir) decode(b *buffer) {
 	r.Count = b.Read32()
 	entriesBuf := buffer{data: r.payload}
 	r.Entries = r.Entries[:0]
 	for {
 		var d Dirent
-		d.Decode(&entriesBuf)
+		d.decode(&entriesBuf)
 		if entriesBuf.isOverrun() {
 			// Couldn't decode a complete entry.
 			break
@@ -1923,22 +1923,20 @@ func (r *Rreaddir) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rreaddir) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rreaddir) encode(b *buffer) {
 	entriesBuf := buffer{}
+	payloadSize := 0
 	for _, d := range r.Entries {
-		d.Encode(&entriesBuf)
-		if len(entriesBuf.data) >= int(r.Count) {
+		d.encode(&entriesBuf)
+		if len(entriesBuf.data) > int(r.Count) {
 			break
 		}
+		payloadSize = len(entriesBuf.data)
 	}
-	if len(entriesBuf.data) < int(r.Count) {
-		r.Count = uint32(len(entriesBuf.data))
-		r.payload = entriesBuf.data
-	} else {
-		r.payload = entriesBuf.data[:r.Count]
-	}
-	b.Write32(uint32(r.Count))
+	r.Count = uint32(payloadSize)
+	r.payload = entriesBuf.data[:payloadSize]
+	b.Write32(r.Count)
 }
 
 // Type implements message.Type.
@@ -1972,13 +1970,13 @@ type Tfsync struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tfsync) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tfsync) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tfsync) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tfsync) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -1996,12 +1994,12 @@ func (t *Tfsync) String() string {
 type Rfsync struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rfsync) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rfsync) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rfsync) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rfsync) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -2020,13 +2018,13 @@ type Tstatfs struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tstatfs) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tstatfs) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tstatfs) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tstatfs) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -2046,14 +2044,14 @@ type Rstatfs struct {
 	FSStat FSStat
 }
 
-// Decode implements encoder.Decode.
-func (r *Rstatfs) Decode(b *buffer) {
-	r.FSStat.Decode(b)
+// decode implements encoder.decode.
+func (r *Rstatfs) decode(b *buffer) {
+	r.FSStat.decode(b)
 }
 
-// Encode implements encoder.Encode.
-func (r *Rstatfs) Encode(b *buffer) {
-	r.FSStat.Encode(b)
+// encode implements encoder.encode.
+func (r *Rstatfs) encode(b *buffer) {
+	r.FSStat.encode(b)
 }
 
 // Type implements message.Type.
@@ -2072,13 +2070,13 @@ type Tflushf struct {
 	FID FID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tflushf) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tflushf) decode(b *buffer) {
 	t.FID = b.ReadFID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tflushf) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tflushf) encode(b *buffer) {
 	b.WriteFID(t.FID)
 }
 
@@ -2096,12 +2094,12 @@ func (t *Tflushf) String() string {
 type Rflushf struct {
 }
 
-// Decode implements encoder.Decode.
-func (*Rflushf) Decode(*buffer) {
+// decode implements encoder.decode.
+func (*Rflushf) decode(*buffer) {
 }
 
-// Encode implements encoder.Encode.
-func (*Rflushf) Encode(*buffer) {
+// encode implements encoder.encode.
+func (*Rflushf) encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -2126,8 +2124,8 @@ type Twalkgetattr struct {
 	Names []string
 }
 
-// Decode implements encoder.Decode.
-func (t *Twalkgetattr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Twalkgetattr) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.NewFID = b.ReadFID()
 	n := b.Read16()
@@ -2137,8 +2135,8 @@ func (t *Twalkgetattr) Decode(b *buffer) {
 	}
 }
 
-// Encode implements encoder.Encode.
-func (t *Twalkgetattr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Twalkgetattr) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteFID(t.NewFID)
 	b.Write16(uint16(len(t.Names)))
@@ -2169,26 +2167,26 @@ type Rwalkgetattr struct {
 	QIDs []QID
 }
 
-// Decode implements encoder.Decode.
-func (r *Rwalkgetattr) Decode(b *buffer) {
-	r.Valid.Decode(b)
-	r.Attr.Decode(b)
+// decode implements encoder.decode.
+func (r *Rwalkgetattr) decode(b *buffer) {
+	r.Valid.decode(b)
+	r.Attr.decode(b)
 	n := b.Read16()
 	r.QIDs = r.QIDs[:0]
 	for i := 0; i < int(n); i++ {
 		var q QID
-		q.Decode(b)
+		q.decode(b)
 		r.QIDs = append(r.QIDs, q)
 	}
 }
 
-// Encode implements encoder.Encode.
-func (r *Rwalkgetattr) Encode(b *buffer) {
-	r.Valid.Encode(b)
-	r.Attr.Encode(b)
+// encode implements encoder.encode.
+func (r *Rwalkgetattr) encode(b *buffer) {
+	r.Valid.encode(b)
+	r.Attr.encode(b)
 	b.Write16(uint16(len(r.QIDs)))
 	for _, q := range r.QIDs {
-		q.Encode(b)
+		q.encode(b)
 	}
 }
 
@@ -2210,15 +2208,15 @@ type Tucreate struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tucreate) Decode(b *buffer) {
-	t.Tlcreate.Decode(b)
+// decode implements encoder.decode.
+func (t *Tucreate) decode(b *buffer) {
+	t.Tlcreate.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tucreate) Encode(b *buffer) {
-	t.Tlcreate.Encode(b)
+// encode implements encoder.encode.
+func (t *Tucreate) encode(b *buffer) {
+	t.Tlcreate.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2255,15 +2253,15 @@ type Tumkdir struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tumkdir) Decode(b *buffer) {
-	t.Tmkdir.Decode(b)
+// decode implements encoder.decode.
+func (t *Tumkdir) decode(b *buffer) {
+	t.Tmkdir.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tumkdir) Encode(b *buffer) {
-	t.Tmkdir.Encode(b)
+// encode implements encoder.encode.
+func (t *Tumkdir) encode(b *buffer) {
+	t.Tmkdir.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2300,15 +2298,15 @@ type Tumknod struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tumknod) Decode(b *buffer) {
-	t.Tmknod.Decode(b)
+// decode implements encoder.decode.
+func (t *Tumknod) decode(b *buffer) {
+	t.Tmknod.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tumknod) Encode(b *buffer) {
-	t.Tmknod.Encode(b)
+// encode implements encoder.encode.
+func (t *Tumknod) encode(b *buffer) {
+	t.Tmknod.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2345,15 +2343,15 @@ type Tusymlink struct {
 	UID UID
 }
 
-// Decode implements encoder.Decode.
-func (t *Tusymlink) Decode(b *buffer) {
-	t.Tsymlink.Decode(b)
+// decode implements encoder.decode.
+func (t *Tusymlink) decode(b *buffer) {
+	t.Tsymlink.decode(b)
 	t.UID = b.ReadUID()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tusymlink) Encode(b *buffer) {
-	t.Tsymlink.Encode(b)
+// encode implements encoder.encode.
+func (t *Tusymlink) encode(b *buffer) {
+	t.Tsymlink.encode(b)
 	b.WriteUID(t.UID)
 }
 
@@ -2391,14 +2389,14 @@ type Tlconnect struct {
 	Flags ConnectFlags
 }
 
-// Decode implements encoder.Decode.
-func (t *Tlconnect) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tlconnect) decode(b *buffer) {
 	t.FID = b.ReadFID()
 	t.Flags = b.ReadConnectFlags()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tlconnect) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tlconnect) encode(b *buffer) {
 	b.WriteFID(t.FID)
 	b.WriteConnectFlags(t.Flags)
 }
@@ -2418,11 +2416,11 @@ type Rlconnect struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rlconnect) Decode(*buffer) {}
+// decode implements encoder.decode.
+func (r *Rlconnect) decode(*buffer) {}
 
-// Encode implements encoder.Encode.
-func (r *Rlconnect) Encode(*buffer) {}
+// encode implements encoder.encode.
+func (r *Rlconnect) encode(*buffer) {}
 
 // Type implements message.Type.
 func (*Rlconnect) Type() MsgType {
@@ -2445,14 +2443,14 @@ type Tchannel struct {
 	Control uint32
 }
 
-// Decode implements encoder.Decode.
-func (t *Tchannel) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (t *Tchannel) decode(b *buffer) {
 	t.ID = b.Read32()
 	t.Control = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (t *Tchannel) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (t *Tchannel) encode(b *buffer) {
 	b.Write32(t.ID)
 	b.Write32(t.Control)
 }
@@ -2474,14 +2472,14 @@ type Rchannel struct {
 	filePayload
 }
 
-// Decode implements encoder.Decode.
-func (r *Rchannel) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (r *Rchannel) decode(b *buffer) {
 	r.Offset = b.Read64()
 	r.Length = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (r *Rchannel) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (r *Rchannel) encode(b *buffer) {
 	b.Write64(r.Offset)
 	b.Write64(r.Length)
 }
@@ -2577,7 +2575,7 @@ func calculateSize(m message) uint32 {
 		return p.FixedSize()
 	}
 	var dataBuf buffer
-	m.Encode(&dataBuf)
+	m.encode(&dataBuf)
 	return uint32(len(dataBuf.data))
 }
 
diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go
index 825c939da..7facc9f5e 100644
--- a/pkg/p9/messages_test.go
+++ b/pkg/p9/messages_test.go
@@ -216,7 +216,7 @@ func TestEncodeDecode(t *testing.T) {
 		},
 		&Rreaddir{
 			// Count must be sufficient to encode a dirent.
-			Count:   0x18,
+			Count:   0x1a,
 			Entries: []Dirent{{QID: QID{Type: 2}}},
 		},
 		&Tfsync{
@@ -382,7 +382,7 @@ func TestEncodeDecode(t *testing.T) {
 		// Encode the original.
 		data := make([]byte, initialBufferLength)
 		buf := buffer{data: data[:0]}
-		enc.Encode(&buf)
+		enc.encode(&buf)
 
 		// Create a new object, same as the first.
 		enc2 := reflect.New(reflect.ValueOf(enc).Elem().Type()).Interface().(encoder)
@@ -399,7 +399,7 @@ func TestEncodeDecode(t *testing.T) {
 		}
 
 		// Mark sure it was okay.
-		enc2.Decode(&buf2)
+		enc2.decode(&buf2)
 		if buf2.isOverrun() {
 			t.Errorf("object %#v->%#v got overrun on decode", enc, enc2)
 			continue
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 20ab31f7a..28d851ff5 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -450,15 +450,15 @@ func (q QID) String() string {
 	return fmt.Sprintf("QID{Type: %d, Version: %d, Path: %d}", q.Type, q.Version, q.Path)
 }
 
-// Decode implements encoder.Decode.
-func (q *QID) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (q *QID) decode(b *buffer) {
 	q.Type = b.ReadQIDType()
 	q.Version = b.Read32()
 	q.Path = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (q *QID) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (q *QID) encode(b *buffer) {
 	b.WriteQIDType(q.Type)
 	b.Write32(q.Version)
 	b.Write64(q.Path)
@@ -515,8 +515,8 @@ type FSStat struct {
 	NameLength uint32
 }
 
-// Decode implements encoder.Decode.
-func (f *FSStat) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (f *FSStat) decode(b *buffer) {
 	f.Type = b.Read32()
 	f.BlockSize = b.Read32()
 	f.Blocks = b.Read64()
@@ -528,8 +528,8 @@ func (f *FSStat) Decode(b *buffer) {
 	f.NameLength = b.Read32()
 }
 
-// Encode implements encoder.Encode.
-func (f *FSStat) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (f *FSStat) encode(b *buffer) {
 	b.Write32(f.Type)
 	b.Write32(f.BlockSize)
 	b.Write64(f.Blocks)
@@ -679,8 +679,8 @@ func (a AttrMask) String() string {
 	return fmt.Sprintf("AttrMask{with: %s}", strings.Join(masks, " "))
 }
 
-// Decode implements encoder.Decode.
-func (a *AttrMask) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *AttrMask) decode(b *buffer) {
 	mask := b.Read64()
 	a.Mode = mask&0x00000001 != 0
 	a.NLink = mask&0x00000002 != 0
@@ -698,8 +698,8 @@ func (a *AttrMask) Decode(b *buffer) {
 	a.DataVersion = mask&0x00002000 != 0
 }
 
-// Encode implements encoder.Encode.
-func (a *AttrMask) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *AttrMask) encode(b *buffer) {
 	var mask uint64
 	if a.Mode {
 		mask |= 0x00000001
@@ -774,8 +774,8 @@ func (a Attr) String() string {
 		a.Mode, a.UID, a.GID, a.NLink, a.RDev, a.Size, a.BlockSize, a.Blocks, a.ATimeSeconds, a.ATimeNanoSeconds, a.MTimeSeconds, a.MTimeNanoSeconds, a.CTimeSeconds, a.CTimeNanoSeconds, a.BTimeSeconds, a.BTimeNanoSeconds, a.Gen, a.DataVersion)
 }
 
-// Encode implements encoder.Encode.
-func (a *Attr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *Attr) encode(b *buffer) {
 	b.WriteFileMode(a.Mode)
 	b.WriteUID(a.UID)
 	b.WriteGID(a.GID)
@@ -796,8 +796,8 @@ func (a *Attr) Encode(b *buffer) {
 	b.Write64(a.DataVersion)
 }
 
-// Decode implements encoder.Decode.
-func (a *Attr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *Attr) decode(b *buffer) {
 	a.Mode = b.ReadFileMode()
 	a.UID = b.ReadUID()
 	a.GID = b.ReadGID()
@@ -926,8 +926,8 @@ func (s SetAttrMask) Empty() bool {
 	return !s.Permissions && !s.UID && !s.GID && !s.Size && !s.ATime && !s.MTime && !s.CTime && !s.ATimeNotSystemTime && !s.MTimeNotSystemTime
 }
 
-// Decode implements encoder.Decode.
-func (s *SetAttrMask) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (s *SetAttrMask) decode(b *buffer) {
 	mask := b.Read32()
 	s.Permissions = mask&0x00000001 != 0
 	s.UID = mask&0x00000002 != 0
@@ -972,8 +972,8 @@ func (s SetAttrMask) bitmask() uint32 {
 	return mask
 }
 
-// Encode implements encoder.Encode.
-func (s *SetAttrMask) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (s *SetAttrMask) encode(b *buffer) {
 	b.Write32(s.bitmask())
 }
 
@@ -994,8 +994,8 @@ func (s SetAttr) String() string {
 	return fmt.Sprintf("SetAttr{Permissions: 0o%o, UID: %d, GID: %d, Size: %d, ATime: {Sec: %d, NanoSec: %d}, MTime: {Sec: %d, NanoSec: %d}}", s.Permissions, s.UID, s.GID, s.Size, s.ATimeSeconds, s.ATimeNanoSeconds, s.MTimeSeconds, s.MTimeNanoSeconds)
 }
 
-// Decode implements encoder.Decode.
-func (s *SetAttr) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (s *SetAttr) decode(b *buffer) {
 	s.Permissions = b.ReadPermissions()
 	s.UID = b.ReadUID()
 	s.GID = b.ReadGID()
@@ -1006,8 +1006,8 @@ func (s *SetAttr) Decode(b *buffer) {
 	s.MTimeNanoSeconds = b.Read64()
 }
 
-// Encode implements encoder.Encode.
-func (s *SetAttr) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (s *SetAttr) encode(b *buffer) {
 	b.WritePermissions(s.Permissions)
 	b.WriteUID(s.UID)
 	b.WriteGID(s.GID)
@@ -1064,17 +1064,17 @@ func (d Dirent) String() string {
 	return fmt.Sprintf("Dirent{QID: %d, Offset: %d, Type: 0x%X, Name: %s}", d.QID, d.Offset, d.Type, d.Name)
 }
 
-// Decode implements encoder.Decode.
-func (d *Dirent) Decode(b *buffer) {
-	d.QID.Decode(b)
+// decode implements encoder.decode.
+func (d *Dirent) decode(b *buffer) {
+	d.QID.decode(b)
 	d.Offset = b.Read64()
 	d.Type = b.ReadQIDType()
 	d.Name = b.ReadString()
 }
 
-// Encode implements encoder.Encode.
-func (d *Dirent) Encode(b *buffer) {
-	d.QID.Encode(b)
+// encode implements encoder.encode.
+func (d *Dirent) encode(b *buffer) {
+	d.QID.encode(b)
 	b.Write64(d.Offset)
 	b.WriteQIDType(d.Type)
 	b.WriteString(d.Name)
@@ -1118,8 +1118,8 @@ func (a *AllocateMode) ToLinux() uint32 {
 	return rv
 }
 
-// Decode implements encoder.Decode.
-func (a *AllocateMode) Decode(b *buffer) {
+// decode implements encoder.decode.
+func (a *AllocateMode) decode(b *buffer) {
 	mask := b.Read32()
 	a.KeepSize = mask&0x01 != 0
 	a.PunchHole = mask&0x02 != 0
@@ -1130,8 +1130,8 @@ func (a *AllocateMode) Decode(b *buffer) {
 	a.Unshare = mask&0x40 != 0
 }
 
-// Encode implements encoder.Encode.
-func (a *AllocateMode) Encode(b *buffer) {
+// encode implements encoder.encode.
+func (a *AllocateMode) encode(b *buffer) {
 	mask := uint32(0)
 	if a.KeepSize {
 		mask |= 0x01
diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go
index 9c11e28ce..7cec0e86d 100644
--- a/pkg/p9/transport.go
+++ b/pkg/p9/transport.go
@@ -80,7 +80,7 @@ func send(s *unet.Socket, tag Tag, m message) error {
 	}
 
 	// Encode the message. The buffer will grow automatically.
-	m.Encode(&dataBuf)
+	m.encode(&dataBuf)
 
 	// Get our vectors to send.
 	var hdr [headerLength]byte
@@ -316,7 +316,7 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message,
 	}
 
 	// Decode the message data.
-	m.Decode(&dataBuf)
+	m.decode(&dataBuf)
 	if dataBuf.isOverrun() {
 		// No need to drain the socket.
 		return NoTag, nil, ErrNoValidMessage
diff --git a/pkg/p9/transport_flipcall.go b/pkg/p9/transport_flipcall.go
index 233f825e3..38038abdf 100644
--- a/pkg/p9/transport_flipcall.go
+++ b/pkg/p9/transport_flipcall.go
@@ -151,7 +151,7 @@ func (ch *channel) send(m message) (uint32, error) {
 	} else {
 		ch.buf.Write8(0) // No incoming FD.
 	}
-	m.Encode(&ch.buf)
+	m.encode(&ch.buf)
 	ssz := uint32(len(ch.buf.data)) // Updated below.
 
 	// Is there a payload?
@@ -205,7 +205,7 @@ func (ch *channel) recv(r message, rsz uint32) (message, error) {
 		ch.buf.data = ch.buf.data[:fs]
 	}
 
-	r.Decode(&ch.buf)
+	r.decode(&ch.buf)
 	if ch.buf.isOverrun() {
 		// Nothing valid was available.
 		log.Debugf("recv [got %d bytes, needed more]", rsz)
@@ -236,7 +236,7 @@ func (ch *channel) recv(r message, rsz uint32) (message, error) {
 
 	// Convert errors appropriately; see above.
 	if rlerr, ok := r.(*Rlerror); ok {
-		return nil, syscall.Errno(rlerr.Error)
+		return r, syscall.Errno(rlerr.Error)
 	}
 
 	return r, nil
diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go
index 2f50ff3ea..3668fcad7 100644
--- a/pkg/p9/transport_test.go
+++ b/pkg/p9/transport_test.go
@@ -56,8 +56,8 @@ func TestSendRecv(t *testing.T) {
 // badDecode overruns on decode.
 type badDecode struct{}
 
-func (*badDecode) Decode(b *buffer) { b.markOverrun() }
-func (*badDecode) Encode(b *buffer) {}
+func (*badDecode) decode(b *buffer) { b.markOverrun() }
+func (*badDecode) encode(b *buffer) {}
 func (*badDecode) Type() MsgType    { return MsgTypeBadDecode }
 func (*badDecode) String() string   { return "badDecode{}" }
 
@@ -81,8 +81,8 @@ func TestRecvOverrun(t *testing.T) {
 // unregistered is not registered on decode.
 type unregistered struct{}
 
-func (*unregistered) Decode(b *buffer) {}
-func (*unregistered) Encode(b *buffer) {}
+func (*unregistered) decode(b *buffer) {}
+func (*unregistered) encode(b *buffer) {}
 func (*unregistered) Type() MsgType    { return MsgTypeUnregistered }
 func (*unregistered) String() string   { return "unregistered{}" }
 
diff --git a/pkg/procid/procid_amd64.s b/pkg/procid/procid_amd64.s
index 38cea9be3..7c622e5d7 100644
--- a/pkg/procid/procid_amd64.s
+++ b/pkg/procid/procid_amd64.s
@@ -14,7 +14,7 @@
 
 // +build amd64
 // +build go1.8
-// +build !go1.15
+// +build !go1.16
 
 #include "textflag.h"
 
diff --git a/pkg/procid/procid_arm64.s b/pkg/procid/procid_arm64.s
index 4f4b70fef..48ebb5fd1 100644
--- a/pkg/procid/procid_arm64.s
+++ b/pkg/procid/procid_arm64.s
@@ -14,7 +14,7 @@
 
 // +build arm64
 // +build go1.8
-// +build !go1.15
+// +build !go1.16
 
 #include "textflag.h"
 
diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go
index 0bdad5fad..fa6a21026 100644
--- a/pkg/rand/rand_linux.go
+++ b/pkg/rand/rand_linux.go
@@ -17,6 +17,7 @@
 package rand
 
 import (
+	"bufio"
 	"crypto/rand"
 	"io"
 
@@ -45,8 +46,22 @@ func (r *reader) Read(p []byte) (int, error) {
 	return rand.Read(p)
 }
 
+// bufferedReader implements a threadsafe buffered io.Reader.
+type bufferedReader struct {
+	mu sync.Mutex
+	r  *bufio.Reader
+}
+
+// Read implements io.Reader.Read.
+func (b *bufferedReader) Read(p []byte) (int, error) {
+	b.mu.Lock()
+	n, err := b.r.Read(p)
+	b.mu.Unlock()
+	return n, err
+}
+
 // Reader is the default reader.
-var Reader io.Reader = &reader{}
+var Reader io.Reader = &bufferedReader{r: bufio.NewReader(&reader{})}
 
 // Read reads from the default reader.
 func Read(b []byte) (int, error) {
diff --git a/pkg/safecopy/memcpy_amd64.s b/pkg/safecopy/memcpy_amd64.s
index 129691d68..00b46c18f 100644
--- a/pkg/safecopy/memcpy_amd64.s
+++ b/pkg/safecopy/memcpy_amd64.s
@@ -55,15 +55,9 @@ TEXT ·memcpy(SB), NOSPLIT, $0-36
 	MOVQ	from+8(FP), SI
 	MOVQ	n+16(FP), BX
 
-	// REP instructions have a high startup cost, so we handle small sizes
-	// with some straightline code. The REP MOVSQ instruction is really fast
-	// for large sizes. The cutover is approximately 2K.
 tail:
-	// move_129through256 or smaller work whether or not the source and the
-	// destination memory regions overlap because they load all data into
-	// registers before writing it back.  move_256through2048 on the other
-	// hand can be used only when the memory regions don't overlap or the copy
-	// direction is forward.
+	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not
+	// worth doing.
 	TESTQ	BX, BX
 	JEQ	move_0
 	CMPQ	BX, $2
@@ -83,31 +77,45 @@ tail:
 	JBE	move_65through128
 	CMPQ	BX, $256
 	JBE	move_129through256
-	// TODO: use branch table and BSR to make this just a single dispatch
 
-/*
- * forward copy loop
- */
-	CMPQ	BX, $2048
-	JLS	move_256through2048
-
-	// Check alignment
-	MOVL	SI, AX
-	ORL	DI, AX
-	TESTL	$7, AX
-	JEQ	fwdBy8
-
-	// Do 1 byte at a time
-	MOVQ	BX, CX
-	REP;	MOVSB
-	RET
-
-fwdBy8:
-	// Do 8 bytes at a time
-	MOVQ	BX, CX
-	SHRQ	$3, CX
-	ANDQ	$7, BX
-	REP;	MOVSQ
+move_257plus:
+	SUBQ	$256, BX
+	MOVOU	(SI), X0
+	MOVOU	X0, (DI)
+	MOVOU	16(SI), X1
+	MOVOU	X1, 16(DI)
+	MOVOU	32(SI), X2
+	MOVOU	X2, 32(DI)
+	MOVOU	48(SI), X3
+	MOVOU	X3, 48(DI)
+	MOVOU	64(SI), X4
+	MOVOU	X4, 64(DI)
+	MOVOU	80(SI), X5
+	MOVOU	X5, 80(DI)
+	MOVOU	96(SI), X6
+	MOVOU	X6, 96(DI)
+	MOVOU	112(SI), X7
+	MOVOU	X7, 112(DI)
+	MOVOU	128(SI), X8
+	MOVOU	X8, 128(DI)
+	MOVOU	144(SI), X9
+	MOVOU	X9, 144(DI)
+	MOVOU	160(SI), X10
+	MOVOU	X10, 160(DI)
+	MOVOU	176(SI), X11
+	MOVOU	X11, 176(DI)
+	MOVOU	192(SI), X12
+	MOVOU	X12, 192(DI)
+	MOVOU	208(SI), X13
+	MOVOU	X13, 208(DI)
+	MOVOU	224(SI), X14
+	MOVOU	X14, 224(DI)
+	MOVOU	240(SI), X15
+	MOVOU	X15, 240(DI)
+	CMPQ	BX, $256
+	LEAQ	256(SI), SI
+	LEAQ	256(DI), DI
+	JGE	move_257plus
 	JMP	tail
 
 move_1or2:
@@ -209,42 +217,3 @@ move_129through256:
 	MOVOU	-16(SI)(BX*1), X15
 	MOVOU	X15, -16(DI)(BX*1)
 	RET
-move_256through2048:
-	SUBQ	$256, BX
-	MOVOU	(SI), X0
-	MOVOU	X0, (DI)
-	MOVOU	16(SI), X1
-	MOVOU	X1, 16(DI)
-	MOVOU	32(SI), X2
-	MOVOU	X2, 32(DI)
-	MOVOU	48(SI), X3
-	MOVOU	X3, 48(DI)
-	MOVOU	64(SI), X4
-	MOVOU	X4, 64(DI)
-	MOVOU	80(SI), X5
-	MOVOU	X5, 80(DI)
-	MOVOU	96(SI), X6
-	MOVOU	X6, 96(DI)
-	MOVOU	112(SI), X7
-	MOVOU	X7, 112(DI)
-	MOVOU	128(SI), X8
-	MOVOU	X8, 128(DI)
-	MOVOU	144(SI), X9
-	MOVOU	X9, 144(DI)
-	MOVOU	160(SI), X10
-	MOVOU	X10, 160(DI)
-	MOVOU	176(SI), X11
-	MOVOU	X11, 176(DI)
-	MOVOU	192(SI), X12
-	MOVOU	X12, 192(DI)
-	MOVOU	208(SI), X13
-	MOVOU	X13, 208(DI)
-	MOVOU	224(SI), X14
-	MOVOU	X14, 224(DI)
-	MOVOU	240(SI), X15
-	MOVOU	X15, 240(DI)
-	CMPQ	BX, $256
-	LEAQ	256(SI), SI
-	LEAQ	256(DI), DI
-	JGE	move_256through2048
-	JMP	tail
diff --git a/pkg/safemem/seq_test.go b/pkg/safemem/seq_test.go
index eba4bb535..de34005e9 100644
--- a/pkg/safemem/seq_test.go
+++ b/pkg/safemem/seq_test.go
@@ -20,6 +20,27 @@ import (
 	"testing"
 )
 
+func TestBlockSeqOfEmptyBlock(t *testing.T) {
+	bs := BlockSeqOf(Block{})
+	if !bs.IsEmpty() {
+		t.Errorf("BlockSeqOf(Block{}).IsEmpty(): got false, wanted true; BlockSeq is %v", bs)
+	}
+}
+
+func TestBlockSeqOfNonemptyBlock(t *testing.T) {
+	b := BlockFromSafeSlice(make([]byte, 1))
+	bs := BlockSeqOf(b)
+	if bs.IsEmpty() {
+		t.Fatalf("BlockSeqOf(non-empty Block).IsEmpty(): got true, wanted false; BlockSeq is %v", bs)
+	}
+	if head := bs.Head(); head != b {
+		t.Fatalf("BlockSeqOf(non-empty Block).Head(): got %v, wanted %v", head, b)
+	}
+	if tail := bs.Tail(); !tail.IsEmpty() {
+		t.Fatalf("BlockSeqOf(non-empty Block).Tail().IsEmpty(): got false, wanted true: tail is %v", tail)
+	}
+}
+
 type blockSeqTest struct {
 	desc string
 
diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go
index dcdfc9600..f5f0574f8 100644
--- a/pkg/safemem/seq_unsafe.go
+++ b/pkg/safemem/seq_unsafe.go
@@ -56,6 +56,9 @@ type BlockSeq struct {
 
 // BlockSeqOf returns a BlockSeq representing the single Block b.
 func BlockSeqOf(b Block) BlockSeq {
+	if b.length == 0 {
+		return BlockSeq{}
+	}
 	bs := BlockSeq{
 		data:   b.start,
 		length: -1,
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index da5a5e4b2..88766f33b 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -451,7 +451,7 @@ func TestRandom(t *testing.T) {
 		}
 	}
 
-	fmt.Printf("Testing filters: %v", syscallRules)
+	t.Logf("Testing filters: %v", syscallRules)
 	instrs, err := BuildProgram([]RuleSet{
 		RuleSet{
 			Rules:  syscallRules,
diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go
index be328db12..f7e986589 100644
--- a/pkg/seccomp/seccomp_unsafe.go
+++ b/pkg/seccomp/seccomp_unsafe.go
@@ -21,13 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 )
 
-// sockFprog is sock_fprog taken from <linux/filter.h>.
-type sockFprog struct {
-	Len    uint16
-	pad    [6]byte
-	Filter *linux.BPFInstruction
-}
-
 // SetFilter installs the given BPF program.
 //
 // This is safe to call from an afterFork context.
@@ -39,7 +32,7 @@ func SetFilter(instrs []linux.BPFInstruction) syscall.Errno {
 		return errno
 	}
 
-	sockProg := sockFprog{
+	sockProg := linux.SockFprog{
 		Len:    uint16(len(instrs)),
 		Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])),
 	}
diff --git a/pkg/segment/BUILD b/pkg/segment/BUILD
index 1b487b887..f57ccc170 100644
--- a/pkg/segment/BUILD
+++ b/pkg/segment/BUILD
@@ -21,6 +21,8 @@ go_template(
     ],
     opt_consts = [
         "minDegree",
+        # trackGaps must either be 0 or 1.
+        "trackGaps",
     ],
     types = [
         "Key",
diff --git a/pkg/segment/set.go b/pkg/segment/set.go
index 03e4f258f..1a17ad9cb 100644
--- a/pkg/segment/set.go
+++ b/pkg/segment/set.go
@@ -36,6 +36,34 @@ type Range interface{}
 // Value is a required type parameter.
 type Value interface{}
 
+// trackGaps is an optional parameter.
+//
+// If trackGaps is 1, the Set will track maximum gap size recursively,
+// enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this
+// case, Key must be an unsigned integer.
+//
+// trackGaps must be 0 or 1.
+const trackGaps = 0
+
+var _ = uint8(trackGaps << 7) // Will fail if not zero or one.
+
+// dynamicGap is a type that disappears if trackGaps is 0.
+type dynamicGap [trackGaps]Key
+
+// Get returns the value of the gap.
+//
+// Precondition: trackGaps must be non-zero.
+func (d *dynamicGap) Get() Key {
+	return d[:][0]
+}
+
+// Set sets the value of the gap.
+//
+// Precondition: trackGaps must be non-zero.
+func (d *dynamicGap) Set(v Key) {
+	d[:][0] = v
+}
+
 // Functions is a required type parameter that must be a struct implementing
 // the methods defined by Functions.
 type Functions interface {
@@ -327,8 +355,12 @@ func (s *Set) Insert(gap GapIterator, r Range, val Value) Iterator {
 	}
 	if prev.Ok() && prev.End() == r.Start {
 		if mval, ok := (Functions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
+			shrinkMaxGap := trackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get()
 			prev.SetEndUnchecked(r.End)
 			prev.SetValue(mval)
+			if shrinkMaxGap {
+				gap.node.updateMaxGapLeaf()
+			}
 			if next.Ok() && next.Start() == r.End {
 				val = mval
 				if mval, ok := (Functions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
@@ -342,11 +374,16 @@ func (s *Set) Insert(gap GapIterator, r Range, val Value) Iterator {
 	}
 	if next.Ok() && next.Start() == r.End {
 		if mval, ok := (Functions{}).Merge(r, val, next.Range(), next.Value()); ok {
+			shrinkMaxGap := trackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get()
 			next.SetStartUnchecked(r.Start)
 			next.SetValue(mval)
+			if shrinkMaxGap {
+				gap.node.updateMaxGapLeaf()
+			}
 			return next
 		}
 	}
+	// InsertWithoutMergingUnchecked will maintain maxGap if necessary.
 	return s.InsertWithoutMergingUnchecked(gap, r, val)
 }
 
@@ -373,11 +410,15 @@ func (s *Set) InsertWithoutMerging(gap GapIterator, r Range, val Value) Iterator
 // Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
 func (s *Set) InsertWithoutMergingUnchecked(gap GapIterator, r Range, val Value) Iterator {
 	gap = gap.node.rebalanceBeforeInsert(gap)
+	splitMaxGap := trackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get())
 	copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments])
 	copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments])
 	gap.node.keys[gap.index] = r
 	gap.node.values[gap.index] = val
 	gap.node.nrSegments++
+	if splitMaxGap {
+		gap.node.updateMaxGapLeaf()
+	}
 	return Iterator{gap.node, gap.index}
 }
 
@@ -399,12 +440,23 @@ func (s *Set) Remove(seg Iterator) GapIterator {
 		// overlap.
 		seg.SetRangeUnchecked(victim.Range())
 		seg.SetValue(victim.Value())
+		// Need to update the nextAdjacentNode's maxGap because the gap in between
+		// must have been modified by updating seg.Range() to victim.Range().
+		// seg.NextSegment() must exist since the last segment can't be in a
+		// non-leaf node.
+		nextAdjacentNode := seg.NextSegment().node
+		if trackGaps != 0 {
+			nextAdjacentNode.updateMaxGapLeaf()
+		}
 		return s.Remove(victim).NextGap()
 	}
 	copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments])
 	copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments])
 	Functions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1])
 	seg.node.nrSegments--
+	if trackGaps != 0 {
+		seg.node.updateMaxGapLeaf()
+	}
 	return seg.node.rebalanceAfterRemove(GapIterator{seg.node, seg.index})
 }
 
@@ -455,6 +507,7 @@ func (s *Set) MergeUnchecked(first, second Iterator) Iterator {
 			// overlaps second.
 			first.SetEndUnchecked(second.End())
 			first.SetValue(mval)
+			// Remove will handle the maxGap update if necessary.
 			return s.Remove(second).PrevSegment()
 		}
 	}
@@ -631,6 +684,12 @@ type node struct {
 	// than "isLeaf" because false must be the correct value for an empty root.
 	hasChildren bool
 
+	// The longest gap within this node. If the node is a leaf, it's simply the
+	// maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys
+	// including the 0th and nrSegments-th gap possibly shared with its upper-level
+	// nodes; if it's a non-leaf node, it's the max of all children's maxGap.
+	maxGap dynamicGap
+
 	// Nodes store keys and values in separate arrays to maximize locality in
 	// the common case (scanning keys for lookup).
 	keys     [maxDegree - 1]Range
@@ -676,12 +735,12 @@ func (n *node) nextSibling() *node {
 // required for insertion, and returns an updated iterator to the position
 // represented by gap.
 func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator {
-	if n.parent != nil {
-		gap = n.parent.rebalanceBeforeInsert(gap)
-	}
 	if n.nrSegments < maxDegree-1 {
 		return gap
 	}
+	if n.parent != nil {
+		gap = n.parent.rebalanceBeforeInsert(gap)
+	}
 	if n.parent == nil {
 		// n is root. Move all segments before and after n's median segment
 		// into new child nodes adjacent to the median segment, which is now
@@ -719,6 +778,13 @@ func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator {
 		n.hasChildren = true
 		n.children[0] = left
 		n.children[1] = right
+		// In this case, n's maxGap won't violated as it's still the root,
+		// but the left and right children should be updated locally as they
+		// are newly split from n.
+		if trackGaps != 0 {
+			left.updateMaxGapLocal()
+			right.updateMaxGapLocal()
+		}
 		if gap.node != n {
 			return gap
 		}
@@ -758,6 +824,12 @@ func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator {
 		}
 	}
 	n.nrSegments = minDegree - 1
+	// MaxGap of n's parent is not violated because the segments within is not changed.
+	// n and its sibling's maxGap need to be updated locally as they are two new nodes split from old n.
+	if trackGaps != 0 {
+		n.updateMaxGapLocal()
+		sibling.updateMaxGapLocal()
+	}
 	// gap.node can't be n.parent because gaps are always in leaf nodes.
 	if gap.node != n {
 		return gap
@@ -821,6 +893,12 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
 			}
 			n.nrSegments++
 			sibling.nrSegments--
+			// n's parent's maxGap does not need to be updated as its content is unmodified.
+			// n and its sibling must be updated with (new) maxGap because of the shift of keys.
+			if trackGaps != 0 {
+				n.updateMaxGapLocal()
+				sibling.updateMaxGapLocal()
+			}
 			if gap.node == sibling && gap.index == sibling.nrSegments {
 				return GapIterator{n, 0}
 			}
@@ -849,6 +927,12 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
 			}
 			n.nrSegments++
 			sibling.nrSegments--
+			// n's parent's maxGap does not need to be updated as its content is unmodified.
+			// n and its sibling must be updated with (new) maxGap because of the shift of keys.
+			if trackGaps != 0 {
+				n.updateMaxGapLocal()
+				sibling.updateMaxGapLocal()
+			}
 			if gap.node == sibling {
 				if gap.index == 0 {
 					return GapIterator{n, n.nrSegments}
@@ -886,6 +970,7 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
 				p.children[0] = nil
 				p.children[1] = nil
 			}
+			// No need to update maxGap of p as its content is not changed.
 			if gap.node == left {
 				return GapIterator{p, gap.index}
 			}
@@ -932,11 +1017,152 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
 		}
 		p.children[p.nrSegments] = nil
 		p.nrSegments--
+		// Update maxGap of left locally, no need to change p and right because
+		// p's contents is not changed and right is already invalid.
+		if trackGaps != 0 {
+			left.updateMaxGapLocal()
+		}
 		// This process robs p of one segment, so recurse into rebalancing p.
 		n = p
 	}
 }
 
+// updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no
+// necessary update.
+//
+// Preconditions: n must be a leaf node, trackGaps must be 1.
+func (n *node) updateMaxGapLeaf() {
+	if n.hasChildren {
+		panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n))
+	}
+	max := n.calculateMaxGapLeaf()
+	if max == n.maxGap.Get() {
+		// If new max equals the old maxGap, no update is needed.
+		return
+	}
+	oldMax := n.maxGap.Get()
+	n.maxGap.Set(max)
+	if max > oldMax {
+		// Grow ancestor maxGaps.
+		for p := n.parent; p != nil; p = p.parent {
+			if p.maxGap.Get() >= max {
+				// p and its ancestors already contain an equal or larger gap.
+				break
+			}
+			// Only if new maxGap is larger than parent's
+			// old maxGap, propagate this update to parent.
+			p.maxGap.Set(max)
+		}
+		return
+	}
+	// Shrink ancestor maxGaps.
+	for p := n.parent; p != nil; p = p.parent {
+		if p.maxGap.Get() > oldMax {
+			// p and its ancestors still contain a larger gap.
+			break
+		}
+		// If new max is smaller than the old maxGap, and this gap used
+		// to be the maxGap of its parent, iterate parent's children
+		// and calculate parent's new maxGap.(It's probable that parent
+		// has two children with the old maxGap, but we need to check it anyway.)
+		parentNewMax := p.calculateMaxGapInternal()
+		if p.maxGap.Get() == parentNewMax {
+			// p and its ancestors still contain a gap of at least equal size.
+			break
+		}
+		// If p's new maxGap differs from the old one, propagate this update.
+		p.maxGap.Set(parentNewMax)
+	}
+}
+
+// updateMaxGapLocal updates maxGap of the calling node solely with no
+// propagation to ancestor nodes.
+//
+// Precondition: trackGaps must be 1.
+func (n *node) updateMaxGapLocal() {
+	if !n.hasChildren {
+		// Leaf node iterates its gaps.
+		n.maxGap.Set(n.calculateMaxGapLeaf())
+	} else {
+		// Non-leaf node iterates its children.
+		n.maxGap.Set(n.calculateMaxGapInternal())
+	}
+}
+
+// calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the
+// max.
+//
+// Preconditions: n must be a leaf node.
+func (n *node) calculateMaxGapLeaf() Key {
+	max := GapIterator{n, 0}.Range().Length()
+	for i := 1; i <= n.nrSegments; i++ {
+		if current := (GapIterator{n, i}).Range().Length(); current > max {
+			max = current
+		}
+	}
+	return max
+}
+
+// calculateMaxGapInternal iterates children's maxGap within an internal node n
+// and calculate the max.
+//
+// Preconditions: n must be a non-leaf node.
+func (n *node) calculateMaxGapInternal() Key {
+	max := n.children[0].maxGap.Get()
+	for i := 1; i <= n.nrSegments; i++ {
+		if current := n.children[i].maxGap.Get(); current > max {
+			max = current
+		}
+	}
+	return max
+}
+
+// searchFirstLargeEnoughGap returns the first gap having at least minSize length
+// in the subtree rooted by n. If not found, return a terminal gap iterator.
+func (n *node) searchFirstLargeEnoughGap(minSize Key) GapIterator {
+	if n.maxGap.Get() < minSize {
+		return GapIterator{}
+	}
+	if n.hasChildren {
+		for i := 0; i <= n.nrSegments; i++ {
+			if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() {
+				return largeEnoughGap
+			}
+		}
+	} else {
+		for i := 0; i <= n.nrSegments; i++ {
+			currentGap := GapIterator{n, i}
+			if currentGap.Range().Length() >= minSize {
+				return currentGap
+			}
+		}
+	}
+	panic(fmt.Sprintf("invalid maxGap in %v", n))
+}
+
+// searchLastLargeEnoughGap returns the last gap having at least minSize length
+// in the subtree rooted by n. If not found, return a terminal gap iterator.
+func (n *node) searchLastLargeEnoughGap(minSize Key) GapIterator {
+	if n.maxGap.Get() < minSize {
+		return GapIterator{}
+	}
+	if n.hasChildren {
+		for i := n.nrSegments; i >= 0; i-- {
+			if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() {
+				return largeEnoughGap
+			}
+		}
+	} else {
+		for i := n.nrSegments; i >= 0; i-- {
+			currentGap := GapIterator{n, i}
+			if currentGap.Range().Length() >= minSize {
+				return currentGap
+			}
+		}
+	}
+	panic(fmt.Sprintf("invalid maxGap in %v", n))
+}
+
 // A Iterator is conceptually one of:
 //
 // - A pointer to a segment in a set; or
@@ -1243,6 +1469,122 @@ func (gap GapIterator) NextGap() GapIterator {
 	return seg.NextGap()
 }
 
+// NextLargeEnoughGap returns the iterated gap's first next gap with larger
+// length than minSize.  If not found, return a terminal gap iterator (does NOT
+// include this gap itself).
+//
+// Precondition: trackGaps must be 1.
+func (gap GapIterator) NextLargeEnoughGap(minSize Key) GapIterator {
+	if trackGaps != 1 {
+		panic("set is not tracking gaps")
+	}
+	if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments {
+		// If gap is the trailing gap of an non-leaf node,
+		// translate it to the equivalent gap on leaf level.
+		gap.node = gap.NextSegment().node
+		gap.index = 0
+		return gap.nextLargeEnoughGapHelper(minSize)
+	}
+	return gap.nextLargeEnoughGapHelper(minSize)
+}
+
+// nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap
+// to do the real recursions.
+//
+// Preconditions: gap is NOT the trailing gap of a non-leaf node.
+func (gap GapIterator) nextLargeEnoughGapHelper(minSize Key) GapIterator {
+	// Crawl up the tree if no large enough gap in current node or the
+	// current gap is the trailing one on leaf level.
+	for gap.node != nil &&
+		(gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) {
+		gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	}
+	// If no large enough gap throughout the whole set, return a terminal
+	// gap iterator.
+	if gap.node == nil {
+		return GapIterator{}
+	}
+	// Iterate subsequent gaps.
+	gap.index++
+	for gap.index <= gap.node.nrSegments {
+		if gap.node.hasChildren {
+			if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() {
+				return largeEnoughGap
+			}
+		} else {
+			if gap.Range().Length() >= minSize {
+				return gap
+			}
+		}
+		gap.index++
+	}
+	gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	if gap.node != nil && gap.index == gap.node.nrSegments {
+		// If gap is the trailing gap of a non-leaf node, crawl up to
+		// parent again and do recursion.
+		gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	}
+	return gap.nextLargeEnoughGapHelper(minSize)
+}
+
+// PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or
+// equal length than minSize.  If not found, return a terminal gap iterator
+// (does NOT include this gap itself).
+//
+// Precondition: trackGaps must be 1.
+func (gap GapIterator) PrevLargeEnoughGap(minSize Key) GapIterator {
+	if trackGaps != 1 {
+		panic("set is not tracking gaps")
+	}
+	if gap.node != nil && gap.node.hasChildren && gap.index == 0 {
+		// If gap is the first gap of an non-leaf node,
+		// translate it to the equivalent gap on leaf level.
+		gap.node = gap.PrevSegment().node
+		gap.index = gap.node.nrSegments
+		return gap.prevLargeEnoughGapHelper(minSize)
+	}
+	return gap.prevLargeEnoughGapHelper(minSize)
+}
+
+// prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap
+// to do the real recursions.
+//
+// Preconditions: gap is NOT the first gap of a non-leaf node.
+func (gap GapIterator) prevLargeEnoughGapHelper(minSize Key) GapIterator {
+	// Crawl up the tree if no large enough gap in current node or the
+	// current gap is the first one on leaf level.
+	for gap.node != nil &&
+		(gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) {
+		gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	}
+	// If no large enough gap throughout the whole set, return a terminal
+	// gap iterator.
+	if gap.node == nil {
+		return GapIterator{}
+	}
+	// Iterate previous gaps.
+	gap.index--
+	for gap.index >= 0 {
+		if gap.node.hasChildren {
+			if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() {
+				return largeEnoughGap
+			}
+		} else {
+			if gap.Range().Length() >= minSize {
+				return gap
+			}
+		}
+		gap.index--
+	}
+	gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	if gap.node != nil && gap.index == 0 {
+		// If gap is the first gap of a non-leaf node, crawl up to
+		// parent again and do recursion.
+		gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	}
+	return gap.prevLargeEnoughGapHelper(minSize)
+}
+
 // segmentBeforePosition returns the predecessor segment of the position given
 // by n.children[i], which may or may not contain a child. If no such segment
 // exists, segmentBeforePosition returns a terminal iterator.
@@ -1271,7 +1613,7 @@ func segmentAfterPosition(n *node, i int) Iterator {
 
 func zeroValueSlice(slice []Value) {
 	// TODO(jamieliu): check if Go is actually smart enough to optimize a
-	// ClearValue that assigns nil to a memset here
+	// ClearValue that assigns nil to a memset here.
 	for i := range slice {
 		Functions{}.ClearValue(&slice[i])
 	}
@@ -1310,7 +1652,15 @@ func (n *node) writeDebugString(buf *bytes.Buffer, prefix string) {
 			child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i))
 		}
 		buf.WriteString(prefix)
-		buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+		if n.hasChildren {
+			if trackGaps != 0 {
+				buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get()))
+			} else {
+				buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+			}
+		} else {
+			buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+		}
 	}
 	if child := n.children[n.nrSegments]; child != nil {
 		child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments))
@@ -1362,3 +1712,43 @@ func (s *Set) ImportSortedSlices(sds *SegmentDataSlices) error {
 	}
 	return nil
 }
+
+// segmentTestCheck returns an error if s is incorrectly sorted, does not
+// contain exactly expectedSegments segments, or contains a segment which
+// fails the passed check.
+//
+// This should be used only for testing, and has been added to this package for
+// templating convenience.
+func (s *Set) segmentTestCheck(expectedSegments int, segFunc func(int, Range, Value) error) error {
+	havePrev := false
+	prev := Key(0)
+	nrSegments := 0
+	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		next := seg.Start()
+		if havePrev && prev >= next {
+			return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments)
+		}
+		if segFunc != nil {
+			if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil {
+				return err
+			}
+		}
+		prev = next
+		havePrev = true
+		nrSegments++
+	}
+	if nrSegments != expectedSegments {
+		return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments)
+	}
+	return nil
+}
+
+// countSegments counts the number of segments in the set.
+//
+// Similar to Check, this should only be used for testing.
+func (s *Set) countSegments() (segments int) {
+	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		segments++
+	}
+	return segments
+}
diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD
index f2d8462d8..131bf09b9 100644
--- a/pkg/segment/test/BUILD
+++ b/pkg/segment/test/BUILD
@@ -29,10 +29,28 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "gap_set",
+    out = "gap_set.go",
+    consts = {
+        "trackGaps": "1",
+    },
+    package = "segment",
+    prefix = "gap",
+    template = "//pkg/segment:generic_set",
+    types = {
+        "Key": "int",
+        "Range": "Range",
+        "Value": "int",
+        "Functions": "gapSetFunctions",
+    },
+)
+
 go_library(
     name = "segment",
     testonly = 1,
     srcs = [
+        "gap_set.go",
         "int_range.go",
         "int_set.go",
         "set_functions.go",
diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go
index f19a005f3..85fa19096 100644
--- a/pkg/segment/test/segment_test.go
+++ b/pkg/segment/test/segment_test.go
@@ -17,6 +17,7 @@ package segment
 import (
 	"fmt"
 	"math/rand"
+	"reflect"
 	"testing"
 )
 
@@ -32,61 +33,65 @@ const (
 	// valueOffset is the difference between the value and start of test
 	// segments.
 	valueOffset = 100000
+
+	// intervalLength is the interval used by random gap tests.
+	intervalLength = 10
 )
 
 func shuffle(xs []int) {
-	for i := range xs {
-		j := rand.Intn(i + 1)
-		xs[i], xs[j] = xs[j], xs[i]
-	}
+	rand.Shuffle(len(xs), func(i, j int) { xs[i], xs[j] = xs[j], xs[i] })
 }
 
-func randPermutation(size int) []int {
+func randIntervalPermutation(size int) []int {
 	p := make([]int, size)
 	for i := range p {
-		p[i] = i
+		p[i] = intervalLength * i
 	}
 	shuffle(p)
 	return p
 }
 
-// checkSet returns an error if s is incorrectly sorted, does not contain
-// exactly expectedSegments segments, or contains a segment for which val !=
-// key + valueOffset.
-func checkSet(s *Set, expectedSegments int) error {
-	havePrev := false
-	prev := 0
-	nrSegments := 0
-	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-		next := seg.Start()
-		if havePrev && prev >= next {
-			return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments)
-		}
-		if got, want := seg.Value(), seg.Start()+valueOffset; got != want {
-			return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nrSegments, seg.Start, got, want)
-		}
-		prev = next
-		havePrev = true
-		nrSegments++
-	}
-	if nrSegments != expectedSegments {
-		return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments)
+// validate can be passed to Check.
+func validate(nr int, r Range, v int) error {
+	if got, want := v, r.Start+valueOffset; got != want {
+		return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nr, r.Start, got, want)
 	}
 	return nil
 }
 
-// countSegmentsIn returns the number of segments in s.
-func countSegmentsIn(s *Set) int {
-	var count int
-	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-		count++
+// checkSetMaxGap returns an error if maxGap inside all nodes of s is not well
+// maintained.
+func checkSetMaxGap(s *gapSet) error {
+	n := s.root
+	return checkNodeMaxGap(&n)
+}
+
+// checkNodeMaxGap returns an error if maxGap inside the subtree rooted by n is
+// not well maintained.
+func checkNodeMaxGap(n *gapnode) error {
+	var max int
+	if !n.hasChildren {
+		max = n.calculateMaxGapLeaf()
+	} else {
+		for i := 0; i <= n.nrSegments; i++ {
+			child := n.children[i]
+			if err := checkNodeMaxGap(child); err != nil {
+				return err
+			}
+			if temp := child.maxGap.Get(); i == 0 || temp > max {
+				max = temp
+			}
+		}
+	}
+	if max != n.maxGap.Get() {
+		return fmt.Errorf("maxGap wrong in node\n%vexpected: %d got: %d", n, max, n.maxGap)
 	}
-	return count
+	return nil
 }
 
 func TestAddRandom(t *testing.T) {
 	var s Set
-	order := randPermutation(testSize)
+	order := rand.Perm(testSize)
 	var nrInsertions int
 	for i, j := range order {
 		if !s.AddWithoutMerging(Range{j, j + 1}, j+valueOffset) {
@@ -94,12 +99,12 @@ func TestAddRandom(t *testing.T) {
 			break
 		}
 		nrInsertions++
-		if err := checkSet(&s, nrInsertions); err != nil {
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
 			t.Errorf("Iteration %d: %v", i, err)
 			break
 		}
 	}
-	if got, want := countSegmentsIn(&s), nrInsertions; got != want {
+	if got, want := s.countSegments(), nrInsertions; got != want {
 		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
 	}
 	if t.Failed() {
@@ -115,7 +120,156 @@ func TestRemoveRandom(t *testing.T) {
 			t.Fatalf("Failed to insert segment %d", i)
 		}
 	}
-	order := randPermutation(testSize)
+	order := rand.Perm(testSize)
+	var nrRemovals int
+	for i, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			t.Errorf("Iteration %d: failed to find segment with key %d", i, j)
+			break
+		}
+		s.Remove(seg)
+		nrRemovals++
+		if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil {
+			t.Errorf("Iteration %d: %v", i, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), testSize-nrRemovals; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Removal order: %v", order[:nrRemovals])
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
+func TestMaxGapAddRandom(t *testing.T) {
+	var s gapSet
+	order := rand.Perm(testSize)
+	var nrInsertions int
+	for i, j := range order {
+		if !s.AddWithoutMerging(Range{j, j + 1}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		nrInsertions++
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
+			t.Errorf("Iteration %d: %v", i, err)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), nrInsertions; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Insertion order: %v", order[:nrInsertions])
+		t.Logf("Set contents:\n%v", &s)
+	}
+}
+
+func TestMaxGapAddRandomWithRandomInterval(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize)
+	var nrInsertions int
+	for i, j := range order {
+		if !s.AddWithoutMerging(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		nrInsertions++
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
+			t.Errorf("Iteration %d: %v", i, err)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), nrInsertions; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Insertion order: %v", order[:nrInsertions])
+		t.Logf("Set contents:\n%v", &s)
+	}
+}
+
+func TestMaxGapAddRandomWithMerge(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize)
+	nrInsertions := 1
+	for i, j := range order {
+		if !s.Add(Range{j, j + intervalLength}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), nrInsertions; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Insertion order: %v", order)
+		t.Logf("Set contents:\n%v", &s)
+	}
+}
+
+func TestMaxGapRemoveRandom(t *testing.T) {
+	var s gapSet
+	for i := 0; i < testSize; i++ {
+		if !s.AddWithoutMerging(Range{i, i + 1}, i+valueOffset) {
+			t.Fatalf("Failed to insert segment %d", i)
+		}
+	}
+	order := rand.Perm(testSize)
+	var nrRemovals int
+	for i, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			t.Errorf("Iteration %d: failed to find segment with key %d", i, j)
+			break
+		}
+		temprange := seg.Range()
+		s.Remove(seg)
+		nrRemovals++
+		if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil {
+			t.Errorf("Iteration %d: %v", i, err)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), testSize-nrRemovals; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Removal order: %v", order[:nrRemovals])
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
+func TestMaxGapRemoveHalfRandom(t *testing.T) {
+	var s gapSet
+	for i := 0; i < testSize; i++ {
+		if !s.AddWithoutMerging(Range{intervalLength * i, intervalLength*i + rand.Intn(intervalLength-1) + 1}, intervalLength*i+valueOffset) {
+			t.Fatalf("Failed to insert segment %d", i)
+		}
+	}
+	order := randIntervalPermutation(testSize)
+	order = order[:testSize/2]
 	var nrRemovals int
 	for i, j := range order {
 		seg := s.FindSegment(j)
@@ -123,14 +277,19 @@ func TestRemoveRandom(t *testing.T) {
 			t.Errorf("Iteration %d: failed to find segment with key %d", i, j)
 			break
 		}
+		temprange := seg.Range()
 		s.Remove(seg)
 		nrRemovals++
-		if err := checkSet(&s, testSize-nrRemovals); err != nil {
+		if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil {
 			t.Errorf("Iteration %d: %v", i, err)
 			break
 		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
 	}
-	if got, want := countSegmentsIn(&s), testSize-nrRemovals; got != want {
+	if got, want := s.countSegments(), testSize-nrRemovals; got != want {
 		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
 	}
 	if t.Failed() {
@@ -140,6 +299,148 @@ func TestRemoveRandom(t *testing.T) {
 	}
 }
 
+func TestMaxGapAddRandomRemoveRandomHalfWithMerge(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize * 2)
+	order = order[:testSize]
+	for i, j := range order {
+		if !s.Add(Range{j, j + intervalLength}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	shuffle(order)
+	var nrRemovals int
+	for _, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			continue
+		}
+		temprange := seg.Range()
+		s.Remove(seg)
+		nrRemovals++
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
+	}
+	if t.Failed() {
+		t.Logf("Removal order: %v", order[:nrRemovals])
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
+func TestNextLargeEnoughGap(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize * 2)
+	order = order[:testSize]
+	for i, j := range order {
+		if !s.Add(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	shuffle(order)
+	order = order[:testSize/2]
+	for _, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			continue
+		}
+		temprange := seg.Range()
+		s.Remove(seg)
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
+	}
+	minSize := 7
+	var gapArr1 []int
+	for gap := s.LowerBoundGap(0).NextLargeEnoughGap(minSize); gap.Ok(); gap = gap.NextLargeEnoughGap(minSize) {
+		if gap.Range().Length() < minSize {
+			t.Errorf("NextLargeEnoughGap wrong, gap %v has length %d, wanted %d", gap.Range(), gap.Range().Length(), minSize)
+		} else {
+			gapArr1 = append(gapArr1, gap.Range().Start)
+		}
+	}
+	var gapArr2 []int
+	for gap := s.LowerBoundGap(0).NextGap(); gap.Ok(); gap = gap.NextGap() {
+		if gap.Range().Length() >= minSize {
+			gapArr2 = append(gapArr2, gap.Range().Start)
+		}
+	}
+
+	if !reflect.DeepEqual(gapArr2, gapArr1) {
+		t.Errorf("Search result not correct, got: %v, wanted: %v", gapArr1, gapArr2)
+	}
+	if t.Failed() {
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
+func TestPrevLargeEnoughGap(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize * 2)
+	order = order[:testSize]
+	for i, j := range order {
+		if !s.Add(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	end := s.LastSegment().End()
+	shuffle(order)
+	order = order[:testSize/2]
+	for _, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			continue
+		}
+		temprange := seg.Range()
+		s.Remove(seg)
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
+	}
+	minSize := 7
+	var gapArr1 []int
+	for gap := s.UpperBoundGap(end + intervalLength).PrevLargeEnoughGap(minSize); gap.Ok(); gap = gap.PrevLargeEnoughGap(minSize) {
+		if gap.Range().Length() < minSize {
+			t.Errorf("PrevLargeEnoughGap wrong, gap length %d, wanted %d", gap.Range().Length(), minSize)
+		} else {
+			gapArr1 = append(gapArr1, gap.Range().Start)
+		}
+	}
+	var gapArr2 []int
+	for gap := s.UpperBoundGap(end + intervalLength).PrevGap(); gap.Ok(); gap = gap.PrevGap() {
+		if gap.Range().Length() >= minSize {
+			gapArr2 = append(gapArr2, gap.Range().Start)
+		}
+	}
+	if !reflect.DeepEqual(gapArr2, gapArr1) {
+		t.Errorf("Search result not correct, got: %v, wanted: %v", gapArr1, gapArr2)
+	}
+	if t.Failed() {
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
 func TestAddSequentialAdjacent(t *testing.T) {
 	var s Set
 	var nrInsertions int
@@ -148,12 +449,12 @@ func TestAddSequentialAdjacent(t *testing.T) {
 			t.Fatalf("Failed to insert segment %d", i)
 		}
 		nrInsertions++
-		if err := checkSet(&s, nrInsertions); err != nil {
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
 			t.Errorf("Iteration %d: %v", i, err)
 			break
 		}
 	}
-	if got, want := countSegmentsIn(&s), nrInsertions; got != want {
+	if got, want := s.countSegments(), nrInsertions; got != want {
 		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
 	}
 	if t.Failed() {
@@ -202,12 +503,12 @@ func TestAddSequentialNonAdjacent(t *testing.T) {
 			t.Fatalf("Failed to insert segment %d", i)
 		}
 		nrInsertions++
-		if err := checkSet(&s, nrInsertions); err != nil {
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
 			t.Errorf("Iteration %d: %v", i, err)
 			break
 		}
 	}
-	if got, want := countSegmentsIn(&s), nrInsertions; got != want {
+	if got, want := s.countSegments(), nrInsertions; got != want {
 		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
 	}
 	if t.Failed() {
@@ -293,7 +594,7 @@ Tests:
 		var i int
 		for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
 			if i > len(test.final) {
-				t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, countSegmentsIn(&s), len(test.final), &s)
+				t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, s.countSegments(), len(test.final), &s)
 				continue Tests
 			}
 			if got, want := seg.Range(), test.final[i]; got != want {
@@ -351,7 +652,7 @@ Tests:
 		var i int
 		for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
 			if i > len(test.final) {
-				t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, countSegmentsIn(&s), len(test.final), &s)
+				t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, s.countSegments(), len(test.final), &s)
 				continue Tests
 			}
 			if got, want := seg.Range(), test.final[i]; got != want {
@@ -378,7 +679,7 @@ func benchmarkAddSequential(b *testing.B, size int) {
 }
 
 func benchmarkAddRandom(b *testing.B, size int) {
-	order := randPermutation(size)
+	order := rand.Perm(size)
 
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
@@ -416,7 +717,7 @@ func benchmarkFindRandom(b *testing.B, size int) {
 			b.Fatalf("Failed to insert segment %d", i)
 		}
 	}
-	order := randPermutation(size)
+	order := rand.Perm(size)
 
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
@@ -470,7 +771,7 @@ func benchmarkAddFindRemoveSequential(b *testing.B, size int) {
 }
 
 func benchmarkAddFindRemoveRandom(b *testing.B, size int) {
-	order := randPermutation(size)
+	order := rand.Perm(size)
 
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go
index bcddb39bb..7cd895cc7 100644
--- a/pkg/segment/test/set_functions.go
+++ b/pkg/segment/test/set_functions.go
@@ -14,21 +14,16 @@
 
 package segment
 
-// Basic numeric constants that we define because the math package doesn't.
-// TODO(nlacasse): These should be Math.MaxInt64/MinInt64?
-const (
-	maxInt = int(^uint(0) >> 1)
-	minInt = -maxInt - 1
-)
-
 type setFunctions struct{}
 
-func (setFunctions) MinKey() int {
-	return minInt
+// MinKey returns the minimum key for the set.
+func (s setFunctions) MinKey() int {
+	return -s.MaxKey() - 1
 }
 
+// MaxKey returns the maximum key for the set.
 func (setFunctions) MaxKey() int {
-	return maxInt
+	return int(^uint(0) >> 1)
 }
 
 func (setFunctions) ClearValue(*int) {}
@@ -40,3 +35,20 @@ func (setFunctions) Merge(_ Range, val1 int, _ Range, _ int) (int, bool) {
 func (setFunctions) Split(_ Range, val int, _ int) (int, int) {
 	return val, val
 }
+
+type gapSetFunctions struct {
+	setFunctions
+}
+
+// MinKey is adjusted to make sure no add overflow would happen in test cases.
+// e.g. A gap with range {MinInt32, 2} would cause overflow in Range().Length().
+//
+// Normally Keys should be unsigned to avoid these issues.
+func (s gapSetFunctions) MinKey() int {
+	return s.setFunctions.MinKey() / 2
+}
+
+// MaxKey returns the maximum key for the set.
+func (s gapSetFunctions) MaxKey() int {
+	return s.setFunctions.MaxKey() / 2
+}
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index e27f21e5e..901e0f320 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -11,7 +11,6 @@ go_library(
         "arch_amd64.go",
         "arch_amd64.s",
         "arch_arm64.go",
-        "arch_state_aarch64.go",
         "arch_state_x86.go",
         "arch_x86.go",
         "arch_x86_impl.go",
@@ -26,11 +25,11 @@ go_library(
         "syscalls_amd64.go",
         "syscalls_arm64.go",
     ],
+    marshal = True,
     visibility = ["//:sandbox"],
     deps = [
         ":registers_go_proto",
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/cpuid",
         "//pkg/log",
@@ -38,6 +37,7 @@ go_library(
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
+        "//tools/go_marshal/marshal",
     ],
 )
 
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index 1d11cc472..a903d031c 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -88,6 +88,9 @@ type Context interface {
 	// SyscallNo returns the syscall number.
 	SyscallNo() uintptr
 
+	// SyscallSaveOrig save orignal register value.
+	SyscallSaveOrig()
+
 	// SyscallArgs returns the syscall arguments in an array.
 	SyscallArgs() SyscallArguments
 
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 3b6987665..343f81f59 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -17,44 +17,55 @@
 package arch
 
 import (
+	"encoding/binary"
 	"fmt"
 	"io"
-	"syscall"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// Registers represents the CPU registers for this architecture.
+type Registers = linux.PtraceRegs
+
 const (
 	// SyscallWidth is the width of insturctions.
 	SyscallWidth = 4
+
+	// fpsimdMagic is the magic number which is used in fpsimd_context.
+	fpsimdMagic = 0x46508001
+
+	// fpsimdContextSize is the size of fpsimd_context.
+	fpsimdContextSize = 0x210
 )
 
+// ARMTrapFlag is the mask for the trap flag.
+const ARMTrapFlag = uint64(1) << 21
+
 // aarch64FPState is aarch64 floating point state.
 type aarch64FPState []byte
 
-// initAarch64FPState (defined in asm files) sets up initial state.
-func initAarch64FPState(data *FloatingPointData) {
-	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
+// initAarch64FPState sets up initial state.
+func initAarch64FPState(data aarch64FPState) {
+	binary.LittleEndian.PutUint32(data, fpsimdMagic)
+	binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize)
 }
 
 func newAarch64FPStateSlice() []byte {
-	return alignedBytes(4096, 32)[:4096]
+	return alignedBytes(4096, 16)[:fpsimdContextSize]
 }
 
 // newAarch64FPState returns an initialized floating point state.
 //
 // The returned state is large enough to store all floating point state
 // supported by host, even if the app won't use much of it due to a restricted
-// FeatureSet. Since they may still be able to see state not advertised by
-// CPUID we must ensure it does not contain any sentry state.
+// FeatureSet.
 func newAarch64FPState() aarch64FPState {
 	f := aarch64FPState(newAarch64FPStateSlice())
-	initAarch64FPState(f.FloatingPointData())
+	initAarch64FPState(f)
 	return f
 }
 
@@ -81,13 +92,19 @@ func NewFloatingPointData() *FloatingPointData {
 // file ensures it's only built on aarch64).
 type State struct {
 	// The system registers.
-	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+	Regs Registers
 
 	// Our floating point state.
 	aarch64FPState `state:"wait"`
 
+	// TLS pointer
+	TPValue uint64
+
 	// FeatureSet is a pointer to the currently active feature set.
 	FeatureSet *cpuid.FeatureSet
+
+	// OrigR0 stores the value of register R0.
+	OrigR0 uint64
 }
 
 // Proto returns a protobuf representation of the system registers in State.
@@ -133,10 +150,12 @@ func (s State) Proto() *rpb.Registers {
 
 // Fork creates and returns an identical copy of the state.
 func (s *State) Fork() State {
-	// TODO(gvisor.dev/issue/1238): floating-point is not supported.
 	return State{
-		Regs:       s.Regs,
-		FeatureSet: s.FeatureSet,
+		Regs:           s.Regs,
+		aarch64FPState: s.aarch64FPState.fork(),
+		TPValue:        s.TPValue,
+		FeatureSet:     s.FeatureSet,
+		OrigR0:         s.OrigR0,
 	}
 }
 
@@ -209,25 +228,27 @@ func (s *State) RegisterMap() (map[string]uintptr, error) {
 
 // PtraceGetRegs implements Context.PtraceGetRegs.
 func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
-	return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs()))
+	regs := s.ptraceGetRegs()
+	n, err := regs.WriteTo(dst)
+	return int(n), err
 }
 
-func (s *State) ptraceGetRegs() syscall.PtraceRegs {
+func (s *State) ptraceGetRegs() Registers {
 	return s.Regs
 }
 
-var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{}))
+var registersSize = (*Registers)(nil).SizeBytes()
 
 // PtraceSetRegs implements Context.PtraceSetRegs.
 func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
-	var regs syscall.PtraceRegs
-	buf := make([]byte, ptraceRegsSize)
+	var regs Registers
+	buf := make([]byte, registersSize)
 	if _, err := io.ReadFull(src, buf); err != nil {
 		return 0, err
 	}
-	binary.Unmarshal(buf, usermem.ByteOrder, &regs)
+	regs.UnmarshalUnsafe(buf)
 	s.Regs = regs
-	return ptraceRegsSize, nil
+	return registersSize, nil
 }
 
 // PtraceGetFPRegs implements Context.PtraceGetFPRegs.
@@ -246,13 +267,14 @@ func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) {
 const (
 	_NT_PRSTATUS = 1
 	_NT_PRFPREG  = 2
+	_NT_ARM_TLS  = 0x401
 )
 
 // PtraceGetRegSet implements Context.PtraceGetRegSet.
 func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < ptraceRegsSize {
+		if maxlen < registersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceGetRegs(dst)
@@ -265,7 +287,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
 func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < ptraceRegsSize {
+		if maxlen < registersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceSetRegs(src)
@@ -285,8 +307,10 @@ func New(arch Arch, fs *cpuid.FeatureSet) Context {
 	case ARM64:
 		return &context64{
 			State{
-				FeatureSet: fs,
+				aarch64FPState: newAarch64FPState(),
+				FeatureSet:     fs,
 			},
+			[]aarch64FPState(nil),
 		}
 	}
 	panic(fmt.Sprintf("unknown architecture %v", arch))
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 85d6acc0f..3b3a0a272 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -22,7 +22,6 @@ import (
 	"math/rand"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -301,8 +300,10 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
 	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
 	// u_debugreg, returning 0 or silently no-oping for other fields
 	// respectively.
-	if addr < uintptr(ptraceRegsSize) {
-		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
+	if addr < uintptr(registersSize) {
+		regs := c.ptraceGetRegs()
+		buf := make([]byte, regs.SizeBytes())
+		regs.MarshalUnsafe(buf)
 		return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil
 	}
 	// Note: x86 debug registers are missing.
@@ -314,8 +315,10 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error {
 	if addr&7 != 0 || addr >= userStructSize {
 		return syscall.EIO
 	}
-	if addr < uintptr(ptraceRegsSize) {
-		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
+	if addr < uintptr(registersSize) {
+		regs := c.ptraceGetRegs()
+		buf := make([]byte, regs.SizeBytes())
+		regs.MarshalUnsafe(buf)
 		usermem.ByteOrder.PutUint64(buf[addr:], uint64(data))
 		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
 		return err
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
index ac98897b5..ada7ac7b8 100644
--- a/pkg/sentry/arch/arch_arm64.go
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package arch
 
 import (
@@ -53,6 +55,11 @@ const (
 	preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5
 )
 
+var (
+	// CPUIDInstruction doesn't exist on ARM64.
+	CPUIDInstruction = []byte{}
+)
+
 // These constants are selected as heuristics to help make the Platform's
 // potentially limited address space conform as closely to Linux as possible.
 const (
@@ -68,6 +75,7 @@ const (
 // context64 represents an ARM64 context.
 type context64 struct {
 	State
+	sigFPState []aarch64FPState // fpstate to be restored on sigreturn.
 }
 
 // Arch implements Context.Arch.
@@ -75,10 +83,19 @@ func (c *context64) Arch() Arch {
 	return ARM64
 }
 
+func (c *context64) copySigFPState() []aarch64FPState {
+	var sigfps []aarch64FPState
+	for _, s := range c.sigFPState {
+		sigfps = append(sigfps, s.fork())
+	}
+	return sigfps
+}
+
 // Fork returns an exact copy of this context.
 func (c *context64) Fork() Context {
 	return &context64{
-		State: c.State.Fork(),
+		State:      c.State.Fork(),
+		sigFPState: c.copySigFPState(),
 	}
 }
 
@@ -125,16 +142,17 @@ func (c *context64) SetStack(value uintptr) {
 
 // TLS returns the current TLS pointer.
 func (c *context64) TLS() uintptr {
-	// TODO(gvisor.dev/issue/1238): TLS is not supported.
-	// MRS_TPIDR_EL0
-	return 0
+	return uintptr(c.TPValue)
 }
 
 // SetTLS sets the current TLS pointer. Returns false if value is invalid.
 func (c *context64) SetTLS(value uintptr) bool {
-	// TODO(gvisor.dev/issue/1238): TLS is not supported.
-	// MSR_TPIDR_EL0
-	return false
+	if value >= uintptr(maxAddr64) {
+		return false
+	}
+
+	c.TPValue = uint64(value)
+	return true
 }
 
 // SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index e35c9214a..19ce99d25 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 i386
+// +build amd64 386
 
 package arch
 
 import (
 	"fmt"
-	"syscall"
 
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -90,44 +89,3 @@ func (s *State) afterLoadFPState() {
 	// Copy to the new, aligned location.
 	copy(s.x86FPState, old)
 }
-
-// +stateify savable
-type syscallPtraceRegs struct {
-	R15      uint64
-	R14      uint64
-	R13      uint64
-	R12      uint64
-	Rbp      uint64
-	Rbx      uint64
-	R11      uint64
-	R10      uint64
-	R9       uint64
-	R8       uint64
-	Rax      uint64
-	Rcx      uint64
-	Rdx      uint64
-	Rsi      uint64
-	Rdi      uint64
-	Orig_rax uint64
-	Rip      uint64
-	Cs       uint64
-	Eflags   uint64
-	Rsp      uint64
-	Ss       uint64
-	Fs_base  uint64
-	Gs_base  uint64
-	Ds       uint64
-	Es       uint64
-	Fs       uint64
-	Gs       uint64
-}
-
-// saveRegs is invoked by stateify.
-func (s *State) saveRegs() syscallPtraceRegs {
-	return syscallPtraceRegs(s.Regs)
-}
-
-// loadRegs is invoked by stateify.
-func (s *State) loadRegs(r syscallPtraceRegs) {
-	s.Regs = syscall.PtraceRegs(r)
-}
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 88b40a9d1..dc458b37f 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 i386
+// +build amd64 386
 
 package arch
 
@@ -21,7 +21,7 @@ import (
 	"io"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
@@ -30,6 +30,9 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// Registers represents the CPU registers for this architecture.
+type Registers = linux.PtraceRegs
+
 // System-related constants for x86.
 const (
 	// SyscallWidth is the width of syscall, sysenter, and int 80 insturctions.
@@ -267,10 +270,12 @@ func (s *State) RegisterMap() (map[string]uintptr, error) {
 
 // PtraceGetRegs implements Context.PtraceGetRegs.
 func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
-	return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs()))
+	regs := s.ptraceGetRegs()
+	n, err := regs.WriteTo(dst)
+	return int(n), err
 }
 
-func (s *State) ptraceGetRegs() syscall.PtraceRegs {
+func (s *State) ptraceGetRegs() Registers {
 	regs := s.Regs
 	// These may not be initialized.
 	if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 {
@@ -306,16 +311,16 @@ func (s *State) ptraceGetRegs() syscall.PtraceRegs {
 	return regs
 }
 
-var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{}))
+var registersSize = (*Registers)(nil).SizeBytes()
 
 // PtraceSetRegs implements Context.PtraceSetRegs.
 func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
-	var regs syscall.PtraceRegs
-	buf := make([]byte, ptraceRegsSize)
+	var regs Registers
+	buf := make([]byte, registersSize)
 	if _, err := io.ReadFull(src, buf); err != nil {
 		return 0, err
 	}
-	binary.Unmarshal(buf, usermem.ByteOrder, &regs)
+	regs.UnmarshalUnsafe(buf)
 	// Truncate segment registers to 16 bits.
 	regs.Cs = uint64(uint16(regs.Cs))
 	regs.Ds = uint64(uint16(regs.Ds))
@@ -369,7 +374,7 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
 	}
 	regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
 	s.Regs = regs
-	return ptraceRegsSize, nil
+	return registersSize, nil
 }
 
 // isUserSegmentSelector returns true if the given segment selector specifies a
@@ -538,7 +543,7 @@ const (
 func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < ptraceRegsSize {
+		if maxlen < registersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceGetRegs(dst)
@@ -558,7 +563,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int,
 func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
 	switch regset {
 	case _NT_PRSTATUS:
-		if maxlen < ptraceRegsSize {
+		if maxlen < registersSize {
 			return 0, syserror.EFAULT
 		}
 		return s.PtraceSetRegs(src)
diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go
index 04ac283c6..0c73fcbfb 100644
--- a/pkg/sentry/arch/arch_x86_impl.go
+++ b/pkg/sentry/arch/arch_x86_impl.go
@@ -12,13 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 i386
+// +build amd64 386
 
 package arch
 
 import (
-	"syscall"
-
 	"gvisor.dev/gvisor/pkg/cpuid"
 )
 
@@ -28,7 +26,7 @@ import (
 // +stateify savable
 type State struct {
 	// The system registers.
-	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+	Regs Registers
 
 	// Our floating point state.
 	x86FPState `state:"wait"`
diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go
index 8b03d0187..c9fb55d00 100644
--- a/pkg/sentry/arch/signal.go
+++ b/pkg/sentry/arch/signal.go
@@ -22,6 +22,7 @@ import (
 // SignalAct represents the action that should be taken when a signal is
 // delivered, and is equivalent to struct sigaction.
 //
+// +marshal
 // +stateify savable
 type SignalAct struct {
 	Handler  uint64
@@ -43,6 +44,7 @@ func (s *SignalAct) DeserializeTo(other *SignalAct) {
 // SignalStack represents information about a user stack, and is equivalent to
 // stack_t.
 //
+// +marshal
 // +stateify savable
 type SignalStack struct {
 	Addr  uint64
@@ -64,6 +66,7 @@ func (s *SignalStack) DeserializeTo(other *SignalStack) {
 // SignalInfo represents information about a signal being delivered, and is
 // equivalent to struct siginfo in linux kernel(linux/include/uapi/asm-generic/siginfo.h).
 //
+// +marshal
 // +stateify savable
 type SignalInfo struct {
 	Signo int32 // Signal number
diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go
index f9ca2e74e..32173aa20 100644
--- a/pkg/sentry/arch/signal_act.go
+++ b/pkg/sentry/arch/signal_act.go
@@ -14,6 +14,8 @@
 
 package arch
 
+import "gvisor.dev/gvisor/tools/go_marshal/marshal"
+
 // Special values for SignalAct.Handler.
 const (
 	// SignalActDefault is SIG_DFL and specifies that the default behavior for
@@ -71,6 +73,8 @@ func (s SignalAct) HasRestorer() bool {
 // NativeSignalAct is a type that is equivalent to struct sigaction in the
 // guest architecture.
 type NativeSignalAct interface {
+	marshal.Marshallable
+
 	// SerializeFrom copies the data in the host SignalAct s into this object.
 	SerializeFrom(s *SignalAct)
 
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 4f4cc46a8..642c79dda 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -19,6 +19,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -30,14 +31,29 @@ type SignalContext64 struct {
 	Sp        uint64
 	Pc        uint64
 	Pstate    uint64
-	_pad      [8]byte // __attribute__((__aligned__(16)))
-	Reserved  [4096]uint8
+	_pad      [8]byte       // __attribute__((__aligned__(16)))
+	Fpsimd64  FpsimdContext // size = 528
+	Reserved  [3568]uint8
+}
+
+type aarch64Ctx struct {
+	Magic uint32
+	Size  uint32
+}
+
+// FpsimdContext is equivalent to struct fpsimd_context on arm64
+// (arch/arm64/include/uapi/asm/sigcontext.h).
+type FpsimdContext struct {
+	Head  aarch64Ctx
+	Fpsr  uint32
+	Fpcr  uint32
+	Vregs [64]uint64 // actually [32]uint128
 }
 
 // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
 type UContext64 struct {
 	Flags  uint64
-	Link   *UContext64
+	Link   uint64
 	Stack  SignalStack
 	Sigset linux.SignalSet
 	// glibc uses a 1024-bit sigset_t
@@ -83,9 +99,12 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 	if ucSize < 0 {
 		panic("can't get size of UContext64")
 	}
-	// st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128.
-	frameSize := int(st.Arch.Width()) + ucSize + 128
-	frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8
+
+	// frameSize = ucSize + sizeof(siginfo).
+	// sizeof(siginfo) == 128.
+	// R30 stores the restorer address.
+	frameSize := ucSize + 128
+	frameBottom := (sp - usermem.Addr(frameSize)) & ^usermem.Addr(15)
 	sp = frameBottom + usermem.Addr(frameSize)
 	st.Bottom = sp
 
@@ -115,12 +134,48 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
 	c.Regs.Regs[0] = uint64(info.Signo)
 	c.Regs.Regs[1] = uint64(infoAddr)
 	c.Regs.Regs[2] = uint64(ucAddr)
+	c.Regs.Regs[30] = uint64(act.Restorer)
 
+	// Save the thread's floating point state.
+	c.sigFPState = append(c.sigFPState, c.aarch64FPState)
+	// Signal handler gets a clean floating point state.
+	c.aarch64FPState = newAarch64FPState()
 	return nil
 }
 
 // SignalRestore implements Context.SignalRestore.
-// Only used on intel.
 func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
-	return 0, SignalStack{}, nil
+	// Copy out the stack frame.
+	var uc UContext64
+	if _, err := st.Pop(&uc); err != nil {
+		return 0, SignalStack{}, err
+	}
+	var info SignalInfo
+	if _, err := st.Pop(&info); err != nil {
+		return 0, SignalStack{}, err
+	}
+
+	// Restore registers.
+	c.Regs.Regs = uc.MContext.Regs
+	c.Regs.Pc = uc.MContext.Pc
+	c.Regs.Sp = uc.MContext.Sp
+	c.Regs.Pstate = uc.MContext.Pstate
+
+	// Restore floating point state.
+	l := len(c.sigFPState)
+	if l > 0 {
+		c.aarch64FPState = c.sigFPState[l-1]
+		// NOTE(cl/133042258): State save requires that any slice
+		// elements from '[len:cap]' to be zero value.
+		c.sigFPState[l-1] = nil
+		c.sigFPState = c.sigFPState[0 : l-1]
+	} else {
+		// This might happen if sigreturn(2) calls are unbalanced with
+		// respect to signal handler entries. This is not expected so
+		// don't bother to do anything fancy with the floating point
+		// state.
+		log.Warningf("sigreturn unable to restore application fpstate")
+	}
+
+	return uc.Sigset, uc.Stack, nil
 }
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index 1a6056171..0fa738a1d 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64 arm64
+// +build 386 amd64 arm64
 
 package arch
 
 import (
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 const (
@@ -55,6 +56,8 @@ func (s *SignalStack) Contains(sp usermem.Addr) bool {
 // NativeSignalStack is a type that is equivalent to stack_t in the guest
 // architecture.
 type NativeSignalStack interface {
+	marshal.Marshallable
+
 	// SerializeFrom copies the data in the host SignalStack s into this
 	// object.
 	SerializeFrom(s *SignalStack)
diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go
index 09bceabc9..1108fa0bd 100644
--- a/pkg/sentry/arch/stack.go
+++ b/pkg/sentry/arch/stack.go
@@ -97,7 +97,6 @@ func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) {
 		if c < 0 {
 			return 0, fmt.Errorf("bad binary.Size for %T", v)
 		}
-		// TODO(b/38173783): Use a real context.Context.
 		n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{})
 		if err != nil || c != n {
 			return 0, err
@@ -121,11 +120,9 @@ func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) {
 		var err error
 		if isVaddr {
 			value := s.Arch.Native(uintptr(0))
-			// TODO(b/38173783): Use a real context.Context.
 			n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{})
 			*vaddr = usermem.Addr(s.Arch.Value(value))
 		} else {
-			// TODO(b/38173783): Use a real context.Context.
 			n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{})
 		}
 		if err != nil {
diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go
index 8b4f23007..3859f41ee 100644
--- a/pkg/sentry/arch/syscalls_amd64.go
+++ b/pkg/sentry/arch/syscalls_amd64.go
@@ -18,6 +18,13 @@ package arch
 
 const restartSyscallNr = uintptr(219)
 
+// SyscallSaveOrig save the value of the register which is clobbered in
+// syscall handler(doSyscall()).
+//
+// Noop on x86.
+func (c *context64) SyscallSaveOrig() {
+}
+
 // SyscallNo returns the syscall number according to the 64-bit convention.
 func (c *context64) SyscallNo() uintptr {
 	return uintptr(c.Regs.Orig_rax)
diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go
index 00d5ef461..95dfd1e90 100644
--- a/pkg/sentry/arch/syscalls_arm64.go
+++ b/pkg/sentry/arch/syscalls_arm64.go
@@ -18,6 +18,17 @@ package arch
 
 const restartSyscallNr = uintptr(128)
 
+// SyscallSaveOrig save the value of the register R0 which is clobbered in
+// syscall handler(doSyscall()).
+//
+// In linux, at the entry of the syscall handler(el0_svc_common()), value of R0
+// is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0
+// was not accessible to the userspace application, so we have to do the same
+// operation in the sentry code to save the R0 value into the App context.
+func (c *context64) SyscallSaveOrig() {
+	c.OrigR0 = c.Regs.Regs[0]
+}
+
 // SyscallNo returns the syscall number according to the 64-bit convention.
 func (c *context64) SyscallNo() uintptr {
 	return uintptr(c.Regs.Regs[8])
@@ -40,7 +51,7 @@ func (c *context64) SyscallNo() uintptr {
 // R30: the link register.
 func (c *context64) SyscallArgs() SyscallArguments {
 	return SyscallArguments{
-		SyscallArgument{Value: uintptr(c.Regs.Regs[0])},
+		SyscallArgument{Value: uintptr(c.OrigR0)},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[1])},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[2])},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[3])},
@@ -50,13 +61,21 @@ func (c *context64) SyscallArgs() SyscallArguments {
 }
 
 // RestartSyscall implements Context.RestartSyscall.
+// Prepare for system call restart, OrigR0 will be restored to R0.
+// Please see the linux code as reference:
+// arch/arm64/kernel/signal.c:do_signal()
 func (c *context64) RestartSyscall() {
 	c.Regs.Pc -= SyscallWidth
-	c.Regs.Regs[8] = uint64(restartSyscallNr)
+	// R0 will be backed up into OrigR0 when entering doSyscall().
+	// Please see the linux code as reference:
+	// arch/arm64/kernel/syscall.c:el0_svc_common().
+	// Here we restore it back.
+	c.Regs.Regs[0] = uint64(c.OrigR0)
 }
 
 // RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock.
 func (c *context64) RestartSyscallWithRestartBlock() {
 	c.Regs.Pc -= SyscallWidth
+	c.Regs.Regs[0] = uint64(c.OrigR0)
 	c.Regs.Regs[8] = uint64(restartSyscallNr)
 }
diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go
index 031fc64ec..8e5658c7a 100644
--- a/pkg/sentry/contexttest/contexttest.go
+++ b/pkg/sentry/contexttest/contexttest.go
@@ -97,7 +97,7 @@ type hostClock struct {
 }
 
 // Now implements ktime.Clock.Now.
-func (hostClock) Now() ktime.Time {
+func (*hostClock) Now() ktime.Time {
 	return ktime.FromNanoseconds(time.Now().UnixNano())
 }
 
@@ -127,7 +127,7 @@ func (t *TestContext) Value(key interface{}) interface{} {
 	case uniqueid.CtxInotifyCookie:
 		return atomic.AddUint32(&lastInotifyCookie, 1)
 	case ktime.CtxRealtimeClock:
-		return hostClock{}
+		return &hostClock{}
 	default:
 		if val, ok := t.otherValues[key]; ok {
 			return val
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index e69496477..0c9a62f0d 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -16,10 +16,16 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fd",
+        "//pkg/fspath",
         "//pkg/log",
+        "//pkg/sentry/fdimport",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/host",
+        "//pkg/sentry/fs/user",
+        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/host",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
@@ -27,10 +33,13 @@ go_library(
         "//pkg/sentry/state",
         "//pkg/sentry/strace",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
+        "//pkg/syserror",
         "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 151808911..663e51989 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -117,9 +117,9 @@ func (p *Profile) HeapProfile(o *ProfileOpts, _ *struct{}) error {
 	return nil
 }
 
-// Goroutine is an RPC stub which dumps out the stack trace for all running
-// goroutines.
-func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error {
+// GoroutineProfile is an RPC stub which dumps out the stack trace for all
+// running goroutines.
+func (p *Profile) GoroutineProfile(o *ProfileOpts, _ *struct{}) error {
 	if len(o.FilePayload.Files) < 1 {
 		return errNoOutput
 	}
@@ -131,6 +131,34 @@ func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error {
 	return nil
 }
 
+// BlockProfile is an RPC stub which dumps out the stack trace that led to
+// blocking on synchronization primitives.
+func (p *Profile) BlockProfile(o *ProfileOpts, _ *struct{}) error {
+	if len(o.FilePayload.Files) < 1 {
+		return errNoOutput
+	}
+	output := o.FilePayload.Files[0]
+	defer output.Close()
+	if err := pprof.Lookup("block").WriteTo(output, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// MutexProfile is an RPC stub which dumps out the stack trace of holders of
+// contended mutexes.
+func (p *Profile) MutexProfile(o *ProfileOpts, _ *struct{}) error {
+	if len(o.FilePayload.Files) < 1 {
+		return errNoOutput
+	}
+	output := o.FilePayload.Files[0]
+	defer output.Close()
+	if err := pprof.Lookup("mutex").WriteTo(output, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
 // StartTrace is an RPC stub which starts collection of an execution trace.
 func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error {
 	if len(o.FilePayload.Files) < 1 {
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index ced51c66c..8767430b7 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -23,14 +23,23 @@ import (
 	"text/tabwriter"
 	"time"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fdimport"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
@@ -60,6 +69,12 @@ type ExecArgs struct {
 	// process's MountNamespace.
 	MountNamespace *fs.MountNamespace
 
+	// MountNamespaceVFS2 is the mount namespace to execute the new process in.
+	// A reference on MountNamespace must be held for the lifetime of the
+	// ExecArgs. If MountNamespace is nil, it will default to the init
+	// process's MountNamespace.
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// WorkingDirectory defines the working directory for the new process.
 	WorkingDirectory string `json:"wd"`
 
@@ -71,15 +86,13 @@ type ExecArgs struct {
 	// the root group if not set explicitly.
 	KGID auth.KGID
 
-	// ExtraKGIDs is the list of additional groups to which the user
-	// belongs.
+	// ExtraKGIDs is the list of additional groups to which the user belongs.
 	ExtraKGIDs []auth.KGID
 
 	// Capabilities is the list of capabilities to give to the process.
 	Capabilities *auth.TaskCapabilities
 
-	// StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host
-	// pty FD.
+	// StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD.
 	StdioIsPty bool
 
 	// FilePayload determines the files to give to the new process.
@@ -104,7 +117,7 @@ func (args ExecArgs) String() string {
 
 // Exec runs a new task.
 func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error {
-	newTG, _, _, err := proc.execAsync(args)
+	newTG, _, _, _, err := proc.execAsync(args)
 	if err != nil {
 		return err
 	}
@@ -117,26 +130,18 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error {
 
 // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined
 // as a function rather than a method to avoid exposing execAsync as an RPC.
-func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) {
+func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	return proc.execAsync(args)
 }
 
 // execAsync runs a new task, but doesn't wait for it to finish. It returns the
 // newly created thread group and its PID. If the stdio FDs are TTYs, then a
 // TTYFileOperations that wraps the TTY is also returned.
-func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) {
+func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	// Import file descriptors.
 	fdTable := proc.Kernel.NewFDTable()
 	defer fdTable.DecRef()
 
-	// No matter what happens, we should close all files in the FilePayload
-	// before returning. Any files that are imported will be duped.
-	defer func() {
-		for _, f := range args.FilePayload.Files {
-			f.Close()
-		}
-	}()
-
 	creds := auth.NewUserCredentials(
 		args.KUID,
 		args.KGID,
@@ -150,6 +155,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		Envv:                    args.Envv,
 		WorkingDirectory:        args.WorkingDirectory,
 		MountNamespace:          args.MountNamespace,
+		MountNamespaceVFS2:      args.MountNamespaceVFS2,
 		Credentials:             creds,
 		FDTable:                 fdTable,
 		Umask:                   0022,
@@ -166,81 +172,86 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		// be donated to the new process in CreateProcess.
 		initArgs.MountNamespace.IncRef()
 	}
+	if initArgs.MountNamespaceVFS2 != nil {
+		// initArgs must hold a reference on MountNamespaceVFS2, which will
+		// be donated to the new process in CreateProcess.
+		initArgs.MountNamespaceVFS2.IncRef()
+	}
 	ctx := initArgs.NewContext(proc.Kernel)
 
 	if initArgs.Filename == "" {
-		// Get the full path to the filename from the PATH env variable.
-		paths := fs.GetPath(initArgs.Envv)
-		mns := initArgs.MountNamespace
-		if mns == nil {
-			mns = proc.Kernel.GlobalInit().Leader().MountNamespace()
-		}
-		f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
-		if err != nil {
-			return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
-		}
-		initArgs.Filename = f
-	}
-
-	mounter := fs.FileOwnerFromContext(ctx)
-
-	var ttyFile *fs.File
-	for appFD, hostFile := range args.FilePayload.Files {
-		var appFile *fs.File
-
-		if args.StdioIsPty && appFD < 3 {
-			// Import the file as a host TTY file.
-			if ttyFile == nil {
-				var err error
-				appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, true /* isTTY */)
-				if err != nil {
-					return nil, 0, nil, err
-				}
-				defer appFile.DecRef()
-
-				// Remember this in the TTY file, as we will
-				// use it for the other stdio FDs.
-				ttyFile = appFile
-			} else {
-				// Re-use the existing TTY file, as all three
-				// stdio FDs must point to the same fs.File in
-				// order to share TTY state, specifically the
-				// foreground process group id.
-				appFile = ttyFile
+		if kernel.VFS2Enabled {
+			// Get the full path to the filename from the PATH env variable.
+			if initArgs.MountNamespaceVFS2 == nil {
+				// Set initArgs so that 'ctx' returns the namespace.
+				//
+				// MountNamespaceVFS2 adds a reference to the namespace, which is
+				// transferred to the new process.
+				initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+			}
+			file, err := getExecutableFD(ctx, creds, proc.Kernel.VFS(), initArgs.MountNamespaceVFS2, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0])
+			if err != nil {
+				return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in environment %v: %v", initArgs.Argv[0], initArgs.Envv, err)
 			}
+			initArgs.File = fsbridge.NewVFSFile(file)
 		} else {
-			// Import the file as a regular host file.
-			var err error
-			appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, false /* isTTY */)
+			if initArgs.MountNamespace == nil {
+				// Set initArgs so that 'ctx' returns the namespace.
+				initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
+
+				// initArgs must hold a reference on MountNamespace, which will
+				// be donated to the new process in CreateProcess.
+				initArgs.MountNamespace.IncRef()
+			}
+			f, err := user.ResolveExecutablePath(ctx, creds, initArgs.MountNamespace, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0])
 			if err != nil {
-				return nil, 0, nil, err
+				return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], initArgs.Envv, err)
 			}
-			defer appFile.DecRef()
+			initArgs.Filename = f
 		}
+	}
 
-		// Add the file to the FD map.
-		if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
-			return nil, 0, nil, err
+	fds := make([]int, len(args.FilePayload.Files))
+	for i, file := range args.FilePayload.Files {
+		if kernel.VFS2Enabled {
+			// Need to dup to remove ownership from os.File.
+			dup, err := unix.Dup(int(file.Fd()))
+			if err != nil {
+				return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
+			}
+			fds[i] = dup
+		} else {
+			// VFS1 dups the file on import.
+			fds[i] = int(file.Fd())
+		}
+	}
+	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds)
+	if err != nil {
+		if kernel.VFS2Enabled {
+			for _, fd := range fds {
+				unix.Close(fd)
+			}
 		}
+		return nil, 0, nil, nil, err
 	}
 
 	tg, tid, err := proc.Kernel.CreateProcess(initArgs)
 	if err != nil {
-		return nil, 0, nil, err
+		return nil, 0, nil, nil, err
 	}
 
-	var ttyFileOps *host.TTYFileOperations
-	if ttyFile != nil {
-		// Set the foreground process group on the TTY before starting
-		// the process.
-		ttyFileOps = ttyFile.FileOperations.(*host.TTYFileOperations)
-		ttyFileOps.InitForegroundProcessGroup(tg.ProcessGroup())
+	// Set the foreground process group on the TTY before starting the process.
+	switch {
+	case ttyFile != nil:
+		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
+	case ttyFileVFS2 != nil:
+		ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
 	}
 
 	// Start the newly created process.
 	proc.Kernel.StartProcess(tg)
 
-	return tg, tid, ttyFileOps, nil
+	return tg, tid, ttyFile, ttyFileVFS2, nil
 }
 
 // PsArgs is the set of arguments to ps.
@@ -411,3 +422,31 @@ func ttyName(tty *kernel.TTY) string {
 	}
 	return fmt.Sprintf("pts/%d", tty.Index)
 }
+
+// getExecutableFD resolves the given executable name and returns a
+// vfs.FileDescription for the executable file.
+func getExecutableFD(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, mns *vfs.MountNamespace, envv []string, wd, name string) (*vfs.FileDescription, error) {
+	path, err := user.ResolveExecutablePathVFS2(ctx, creds, mns, envv, wd, name)
+	if err != nil {
+		return nil, err
+	}
+
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef()
+
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root, // binPath is absolute, Start can be anything.
+		Path:               fspath.Parse(path),
+		FollowFinalSymlink: true,
+	}
+	opts := &vfs.OpenOptions{
+		Flags:    linux.O_RDONLY,
+		FileExec: true,
+	}
+	f, err := vfsObj.OpenAt(ctx, creds, &pop, opts)
+	if err == syserror.ENOENT || err == syserror.EACCES {
+		return nil, nil
+	}
+	return f, err
+}
diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD
new file mode 100644
index 000000000..5e41ceb4e
--- /dev/null
+++ b/pkg/sentry/fdimport/BUILD
@@ -0,0 +1,19 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "fdimport",
+    srcs = [
+        "fdimport.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/host",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/vfs",
+    ],
+)
diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go
new file mode 100644
index 000000000..a4199f9e9
--- /dev/null
+++ b/pkg/sentry/fdimport/fdimport.go
@@ -0,0 +1,129 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fdimport
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Import imports a slice of FDs into the given FDTable. If console is true,
+// sets up TTY for the first 3 FDs in the slice representing stdin, stdout,
+// stderr. Upon success, Import takes ownership of all FDs.
+func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	if kernel.VFS2Enabled {
+		ttyFile, err := importVFS2(ctx, fdTable, console, fds)
+		return nil, ttyFile, err
+	}
+	ttyFile, err := importFS(ctx, fdTable, console, fds)
+	return ttyFile, nil, err
+}
+
+func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) {
+	var ttyFile *fs.File
+	for appFD, hostFD := range fds {
+		var appFile *fs.File
+
+		if console && appFD < 3 {
+			// Import the file as a host TTY file.
+			if ttyFile == nil {
+				var err error
+				appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */)
+				if err != nil {
+					return nil, err
+				}
+				defer appFile.DecRef()
+
+				// Remember this in the TTY file, as we will
+				// use it for the other stdio FDs.
+				ttyFile = appFile
+			} else {
+				// Re-use the existing TTY file, as all three
+				// stdio FDs must point to the same fs.File in
+				// order to share TTY state, specifically the
+				// foreground process group id.
+				appFile = ttyFile
+			}
+		} else {
+			// Import the file as a regular host file.
+			var err error
+			appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */)
+			if err != nil {
+				return nil, err
+			}
+			defer appFile.DecRef()
+		}
+
+		// Add the file to the FD map.
+		if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
+			return nil, err
+		}
+	}
+
+	if ttyFile == nil {
+		return nil, nil
+	}
+	return ttyFile.FileOperations.(*host.TTYFileOperations), nil
+}
+
+func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) {
+	k := kernel.KernelFromContext(ctx)
+
+	var ttyFile *vfs.FileDescription
+	for appFD, hostFD := range stdioFDs {
+		var appFile *vfs.FileDescription
+
+		if console && appFD < 3 {
+			// Import the file as a host TTY file.
+			if ttyFile == nil {
+				var err error
+				appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */)
+				if err != nil {
+					return nil, err
+				}
+				defer appFile.DecRef()
+
+				// Remember this in the TTY file, as we will use it for the other stdio
+				// FDs.
+				ttyFile = appFile
+			} else {
+				// Re-use the existing TTY file, as all three stdio FDs must point to
+				// the same fs.File in order to share TTY state, specifically the
+				// foreground process group id.
+				appFile = ttyFile
+			}
+		} else {
+			var err error
+			appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */)
+			if err != nil {
+				return nil, err
+			}
+			defer appFile.DecRef()
+		}
+
+		if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
+			return nil, err
+		}
+	}
+
+	if ttyFile == nil {
+		return nil, nil
+	}
+	return ttyFile.Impl().(*hostvfs2.TTYFileDescription), nil
+}
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index b060a12ff..ab1424c95 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -222,8 +222,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 		}
 		childUpper, err := parentUpper.Lookup(ctx, next.name)
 		if err != nil {
-			log.Warningf("copy up failed to lookup directory: %v", err)
-			cleanupUpper(ctx, parentUpper, next.name)
+			werr := fmt.Errorf("copy up failed to lookup directory: %v", err)
+			cleanupUpper(ctx, parentUpper, next.name, werr)
 			return syserror.EIO
 		}
 		defer childUpper.DecRef()
@@ -242,8 +242,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 		}
 		childUpper, err := parentUpper.Lookup(ctx, next.name)
 		if err != nil {
-			log.Warningf("copy up failed to lookup symlink: %v", err)
-			cleanupUpper(ctx, parentUpper, next.name)
+			werr := fmt.Errorf("copy up failed to lookup symlink: %v", err)
+			cleanupUpper(ctx, parentUpper, next.name, werr)
 			return syserror.EIO
 		}
 		defer childUpper.DecRef()
@@ -256,23 +256,23 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 	// Bring file attributes up to date. This does not include size, which will be
 	// brought up to date with copyContentsLocked.
 	if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil {
-		log.Warningf("copy up failed to copy up attributes: %v", err)
-		cleanupUpper(ctx, parentUpper, next.name)
+		werr := fmt.Errorf("copy up failed to copy up attributes: %v", err)
+		cleanupUpper(ctx, parentUpper, next.name, werr)
 		return syserror.EIO
 	}
 
 	// Copy the entire file.
 	if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil {
-		log.Warningf("copy up failed to copy up contents: %v", err)
-		cleanupUpper(ctx, parentUpper, next.name)
+		werr := fmt.Errorf("copy up failed to copy up contents: %v", err)
+		cleanupUpper(ctx, parentUpper, next.name, werr)
 		return syserror.EIO
 	}
 
 	lowerMappable := next.Inode.overlay.lower.Mappable()
 	upperMappable := childUpperInode.Mappable()
 	if lowerMappable != nil && upperMappable == nil {
-		log.Warningf("copy up failed: cannot ensure memory mapping coherence")
-		cleanupUpper(ctx, parentUpper, next.name)
+		werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence")
+		cleanupUpper(ctx, parentUpper, next.name, werr)
 		return syserror.EIO
 	}
 
@@ -324,12 +324,14 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
 	return nil
 }
 
-// cleanupUpper removes name from parent, and panics if it is unsuccessful.
-func cleanupUpper(ctx context.Context, parent *Inode, name string) {
+// cleanupUpper is called when copy-up fails. It logs the copy-up error and
+// attempts to remove name from parent. If that fails, then it panics.
+func cleanupUpper(ctx context.Context, parent *Inode, name string, copyUpErr error) {
+	log.Warningf(copyUpErr.Error())
 	if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil {
 		// Unfortunately we don't have much choice. We shouldn't
 		// willingly give the caller access to a nonsense filesystem.
-		panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: failed to remove %q from upper filesystem: %v", name, err))
+		panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: copyUp got error: %v; then cleanup failed to remove %q from upper filesystem: %v.", copyUpErr, name, err))
 	}
 }
 
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 4c4b7d5cc..9379a4d7b 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -9,6 +9,7 @@ go_library(
         "device.go",
         "fs.go",
         "full.go",
+        "net_tun.go",
         "null.go",
         "random.go",
         "tty.go",
@@ -19,15 +20,20 @@ go_library(
         "//pkg/context",
         "//pkg/rand",
         "//pkg/safemem",
+        "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
+        "//pkg/sentry/socket/netstack",
         "//pkg/syserror",
+        "//pkg/tcpip/link/tun",
         "//pkg/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 35bd23991..acbd401a0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -66,8 +67,8 @@ func newMemDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo
 	})
 }
 
-func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555))
+func newDirectory(ctx context.Context, contents map[string]*fs.Inode, msrc *fs.MountSource) *fs.Inode {
+	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
 	return fs.NewInode(ctx, iops, msrc, fs.StableAttr{
 		DeviceID:  devDevice.DeviceID(),
 		InodeID:   devDevice.NextIno(),
@@ -111,7 +112,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		// A devpts is typically mounted at /dev/pts to provide
 		// pseudoterminal support. Place an empty directory there for
 		// the devpts to be mounted over.
-		"pts": newDirectory(ctx, msrc),
+		"pts": newDirectory(ctx, nil, msrc),
 		// Similarly, applications expect a ptmx device at /dev/ptmx
 		// connected to the terminals provided by /dev/pts/. Rather
 		// than creating a device directly (which requires a hairy
@@ -126,6 +127,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
 	}
 
+	if isNetTunSupported(inet.StackFromContext(ctx)) {
+		contents["net"] = newDirectory(ctx, map[string]*fs.Inode{
+			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
+		}, msrc)
+	}
+
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
 	return fs.NewInode(ctx, iops, msrc, fs.StableAttr{
 		DeviceID:  devDevice.DeviceID(),
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
new file mode 100644
index 000000000..dc7ad075a
--- /dev/null
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -0,0 +1,177 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	netTunDevMajor = 10
+	netTunDevMinor = 200
+)
+
+// +stateify savable
+type netTunInodeOperations struct {
+	fsutil.InodeGenericChecker       `state:"nosave"`
+	fsutil.InodeNoExtendedAttributes `state:"nosave"`
+	fsutil.InodeNoopAllocate         `state:"nosave"`
+	fsutil.InodeNoopRelease          `state:"nosave"`
+	fsutil.InodeNoopTruncate         `state:"nosave"`
+	fsutil.InodeNoopWriteOut         `state:"nosave"`
+	fsutil.InodeNotDirectory         `state:"nosave"`
+	fsutil.InodeNotMappable          `state:"nosave"`
+	fsutil.InodeNotSocket            `state:"nosave"`
+	fsutil.InodeNotSymlink           `state:"nosave"`
+	fsutil.InodeVirtual              `state:"nosave"`
+
+	fsutil.InodeSimpleAttributes
+}
+
+var _ fs.InodeOperations = (*netTunInodeOperations)(nil)
+
+func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *netTunInodeOperations {
+	return &netTunInodeOperations{
+		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC),
+	}
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil
+}
+
+// +stateify savable
+type netTunFileOperations struct {
+	fsutil.FileNoSeek               `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	device tun.Device
+}
+
+var _ fs.FileOperations = (*netTunFileOperations)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (fops *netTunFileOperations) Release() {
+	fops.device.Release()
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	request := args[1].Uint()
+	data := args[2].Pointer()
+
+	switch request {
+	case linux.TUNSETIFF:
+		t := kernel.TaskFromContext(ctx)
+		if t == nil {
+			panic("Ioctl should be called from a task context")
+		}
+		if !t.HasCapability(linux.CAP_NET_ADMIN) {
+			return 0, syserror.EPERM
+		}
+		stack, ok := t.NetworkContext().(*netstack.Stack)
+		if !ok {
+			return 0, syserror.EINVAL
+		}
+
+		var req linux.IFReq
+		if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		flags := usermem.ByteOrder.Uint16(req.Data[:])
+		return 0, fops.device.SetIff(stack.Stack, req.Name(), flags)
+
+	case linux.TUNGETIFF:
+		var req linux.IFReq
+
+		copy(req.IFName[:], fops.device.Name())
+
+		// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
+		// there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c.
+		flags := fops.device.Flags() | linux.IFF_NOFILTER
+		usermem.ByteOrder.PutUint16(req.Data[:], flags)
+
+		_, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// Write implements fs.FileOperations.Write.
+func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	data := make([]byte, src.NumBytes())
+	if _, err := src.CopyIn(ctx, data); err != nil {
+		return 0, err
+	}
+	return fops.device.Write(data)
+}
+
+// Read implements fs.FileOperations.Read.
+func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	data, err := fops.device.Read()
+	if err != nil {
+		return 0, err
+	}
+	n, err := dst.CopyOut(ctx, data)
+	if n > 0 && n < len(data) {
+		// Not an error for partial copying. Packet truncated.
+		err = nil
+	}
+	return int64(n), err
+}
+
+// Readiness implements watier.Waitable.Readiness.
+func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fops.device.Readiness(mask)
+}
+
+// EventRegister implements watier.Waitable.EventRegister.
+func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fops.device.EventRegister(e, mask)
+}
+
+// EventUnregister implements watier.Waitable.EventUnregister.
+func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
+	fops.device.EventUnregister(e)
+}
+
+// isNetTunSupported returns whether /dev/net/tun device is supported for s.
+func isNetTunSupported(s inet.Stack) bool {
+	_, ok := s.(*netstack.Stack)
+	return ok
+}
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index acab0411a..65be12175 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -17,7 +17,6 @@ package fs
 import (
 	"fmt"
 	"path"
-	"sort"
 	"sync/atomic"
 	"syscall"
 
@@ -121,9 +120,6 @@ type Dirent struct {
 	// deleted may be set atomically when removed.
 	deleted int32
 
-	// frozen indicates this entry can't walk to unknown nodes.
-	frozen bool
-
 	// mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED.
 	mounted bool
 
@@ -253,8 +249,7 @@ func (d *Dirent) IsNegative() bool {
 	return d.Inode == nil
 }
 
-// hashChild will hash child into the children list of its new parent d, carrying over
-// any "frozen" state from d.
+// hashChild will hash child into the children list of its new parent d.
 //
 // Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must
 // validate the returned unhashed weak reference. Common cases:
@@ -282,9 +277,6 @@ func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) {
 		d.IncRef()
 	}
 
-	// Carry over parent's frozen state.
-	child.frozen = d.frozen
-
 	return d.hashChildParentSet(child)
 }
 
@@ -320,9 +312,9 @@ func (d *Dirent) SyncAll(ctx context.Context) {
 
 	// There is nothing to sync for a read-only filesystem.
 	if !d.Inode.MountSource.Flags.ReadOnly {
-		// FIXME(b/34856369): This should be a mount traversal, not a
-		// Dirent traversal, because some Inodes that need to be synced
-		// may no longer be reachable by name (after sys_unlink).
+		// NOTE(b/34856369): This should be a mount traversal, not a Dirent
+		// traversal, because some Inodes that need to be synced may no longer
+		// be reachable by name (after sys_unlink).
 		//
 		// Write out metadata, dirty page cached pages, and sync disk/remote
 		// caches.
@@ -400,38 +392,6 @@ func (d *Dirent) MountRoot() *Dirent {
 	return mountRoot
 }
 
-// Freeze prevents this dirent from walking to more nodes. Freeze is applied
-// recursively to all children.
-//
-// If this particular Dirent represents a Virtual node, then Walks and Creates
-// may proceed as before.
-//
-// Freeze can only be called before the application starts running, otherwise
-// the root it might be out of sync with the application root if modified by
-// sys_chroot.
-func (d *Dirent) Freeze() {
-	d.mu.Lock()
-	defer d.mu.Unlock()
-	if d.frozen {
-		// Already frozen.
-		return
-	}
-	d.frozen = true
-
-	// Take a reference when freezing.
-	for _, w := range d.children {
-		if child := w.Get(); child != nil {
-			// NOTE: We would normally drop the reference here. But
-			// instead we're hanging on to it.
-			ch := child.(*Dirent)
-			ch.Freeze()
-		}
-	}
-
-	// Drop all expired weak references.
-	d.flush()
-}
-
 // descendantOf returns true if the receiver dirent is equal to, or a
 // descendant of, the argument dirent.
 //
@@ -524,11 +484,6 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl
 		w.Drop()
 	}
 
-	// Are we allowed to do the lookup?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return nil, syscall.ENOENT
-	}
-
 	// Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be
 	// expensive, if possible release the lock and re-acquire it.
 	if walkMayUnlock {
@@ -659,11 +614,6 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi
 		return nil, syscall.EEXIST
 	}
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return nil, syscall.ENOENT
-	}
-
 	// Try the create. We need to trust the file system to return EEXIST (or something
 	// that will translate to EEXIST) if name already exists.
 	file, err := d.Inode.Create(ctx, d, name, flags, perms)
@@ -727,11 +677,6 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c
 		return syscall.EEXIST
 	}
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Remove any negative Dirent. We've already asserted above with d.exists
 	// that the only thing remaining here can be a negative Dirent.
 	if w, ok := d.children[name]; ok {
@@ -862,49 +807,6 @@ func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) {
 	return dot, dot
 }
 
-// readdirFrozen returns readdir results based solely on the frozen children.
-func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int64, error) {
-	// Collect attrs for "." and  "..".
-	attrs := make(map[string]DentAttr)
-	names := []string{".", ".."}
-	attrs["."], attrs[".."] = d.GetDotAttrs(root)
-
-	// Get info from all children.
-	d.mu.Lock()
-	defer d.mu.Unlock()
-	for name, w := range d.children {
-		if child := w.Get(); child != nil {
-			defer child.DecRef()
-
-			// Skip negative children.
-			if child.(*Dirent).IsNegative() {
-				continue
-			}
-
-			sattr := child.(*Dirent).Inode.StableAttr
-			attrs[name] = DentAttr{
-				Type:    sattr.Type,
-				InodeID: sattr.InodeID,
-			}
-			names = append(names, name)
-		}
-	}
-
-	sort.Strings(names)
-
-	if int(offset) >= len(names) {
-		return offset, nil
-	}
-	names = names[int(offset):]
-	for _, name := range names {
-		if err := dirCtx.DirEmit(name, attrs[name]); err != nil {
-			return offset, err
-		}
-		offset++
-	}
-	return offset, nil
-}
-
 // DirIterator is an open directory containing directory entries that can be read.
 type DirIterator interface {
 	// IterateDir emits directory entries by calling dirCtx.EmitDir, beginning
@@ -964,10 +866,6 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent,
 		return offset, nil
 	}
 
-	if d.frozen {
-		return d.readdirFrozen(root, offset, dirCtx)
-	}
-
 	// Collect attrs for "." and "..".
 	dot, dotdot := d.GetDotAttrs(root)
 
@@ -1068,11 +966,6 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err
 		return nil, syserror.EINVAL
 	}
 
-	// Are we frozen?
-	if d.parent.frozen && !d.parent.Inode.IsVirtual() {
-		return nil, syserror.ENOENT
-	}
-
 	// Dirent that'll replace d.
 	//
 	// Note that NewDirent returns with one reference taken; the reference
@@ -1101,11 +994,6 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
 		return syserror.ENOENT
 	}
 
-	// Are we frozen?
-	if d.parent.frozen && !d.parent.Inode.IsVirtual() {
-		return syserror.ENOENT
-	}
-
 	// Remount our former child in its place.
 	//
 	// As replacement used to be our child, it must already have the right
@@ -1135,11 +1023,6 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath
 	unlock := d.lockDirectory()
 	defer unlock()
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Try to walk to the node.
 	child, err := d.walk(ctx, root, name, false /* may unlock */)
 	if err != nil {
@@ -1201,11 +1084,6 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string)
 	unlock := d.lockDirectory()
 	defer unlock()
 
-	// Are we frozen?
-	if d.frozen && !d.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Check for dots.
 	if name == "." {
 		// Rejected as the last component by rmdir(2).
@@ -1438,8 +1316,8 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName
 	}, nil
 }
 
-func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
-	uattr, err := dir.Inode.UnstableAttr(ctx)
+func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error {
+	uattr, err := d.Inode.UnstableAttr(ctx)
 	if err != nil {
 		return syserror.EPERM
 	}
@@ -1465,30 +1343,33 @@ func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
 	return syserror.EPERM
 }
 
-// MayDelete determines whether `name`, a child of `dir`, can be deleted or
+// MayDelete determines whether `name`, a child of `d`, can be deleted or
 // renamed by `ctx`.
 //
 // Compare Linux kernel fs/namei.c:may_delete.
-func MayDelete(ctx context.Context, root, dir *Dirent, name string) error {
-	if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
+func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error {
+	if err := d.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
 		return err
 	}
 
-	victim, err := dir.Walk(ctx, root, name)
+	unlock := d.lockDirectory()
+	defer unlock()
+
+	victim, err := d.walk(ctx, root, name, true /* may unlock */)
 	if err != nil {
 		return err
 	}
 	defer victim.DecRef()
 
-	return mayDelete(ctx, dir, victim)
+	return d.mayDelete(ctx, victim)
 }
 
 // mayDelete determines whether `victim`, a child of `dir`, can be deleted or
 // renamed by `ctx`.
 //
 // Preconditions: `dir` is writable and executable by `ctx`.
-func mayDelete(ctx context.Context, dir, victim *Dirent) error {
-	if err := checkSticky(ctx, dir, victim); err != nil {
+func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error {
+	if err := d.checkSticky(ctx, victim); err != nil {
 		return err
 	}
 
@@ -1516,15 +1397,6 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 		return err
 	}
 
-	// Are we frozen?
-	// TODO(jamieliu): Is this the right errno?
-	if oldParent.frozen && !oldParent.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-	if newParent.frozen && !newParent.Inode.IsVirtual() {
-		return syscall.ENOENT
-	}
-
 	// Do we have general permission to remove from oldParent and
 	// create/replace in newParent?
 	if err := oldParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
@@ -1542,7 +1414,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 	defer renamed.DecRef()
 
 	// Check that the renamed dirent is deletable.
-	if err := mayDelete(ctx, oldParent, renamed); err != nil {
+	if err := oldParent.mayDelete(ctx, renamed); err != nil {
 		return err
 	}
 
@@ -1580,7 +1452,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string
 		// across the Rename, so must call DecRef manually (no defer).
 
 		// Check that we can delete replaced.
-		if err := mayDelete(ctx, newParent, replaced); err != nil {
+		if err := newParent.mayDelete(ctx, replaced); err != nil {
 			replaced.DecRef()
 			return err
 		}
diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go
index 25514ace4..33de32c69 100644
--- a/pkg/sentry/fs/dirent_cache.go
+++ b/pkg/sentry/fs/dirent_cache.go
@@ -101,8 +101,6 @@ func (c *DirentCache) remove(d *Dirent) {
 		panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d))
 	}
 	c.list.Remove(d)
-	d.SetPrev(nil)
-	d.SetNext(nil)
 	d.DecRef()
 	c.currentSize--
 	if c.limit != nil {
diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go
index 5aff0cc95..a0082ecca 100644
--- a/pkg/sentry/fs/fdpipe/pipe_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_test.go
@@ -119,7 +119,7 @@ func TestNewPipe(t *testing.T) {
 				continue
 			}
 			if flags := p.flags; test.flags != flags {
-				t.Errorf("%s: got file flags %s, want %s", test.desc, flags, test.flags)
+				t.Errorf("%s: got file flags %v, want %v", test.desc, flags, test.flags)
 				continue
 			}
 			if len(test.readAheadBuffer) != len(p.readAheadBuffer) {
@@ -136,7 +136,7 @@ func TestNewPipe(t *testing.T) {
 				continue
 			}
 			if !fdnotifier.HasFD(int32(f.FD())) {
-				t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD)
+				t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD())
 			}
 		}
 	}
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 78100e448..2a278fbe3 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -44,7 +44,7 @@ var (
 	RecordWaitTime = false
 
 	reads    = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.")
-	readWait = metric.MustCreateNewUint64Metric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.")
+	readWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.")
 )
 
 // IncrementWait increments the given wait time metric, if enabled.
@@ -310,7 +310,6 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error
 	if !f.mu.Lock(ctx) {
 		return 0, syserror.ErrInterrupted
 	}
-
 	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
 	// Handle append mode.
 	if f.Flags().Append {
@@ -355,7 +354,6 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
 	// offset."
 	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
 	defer unlockAppendMu()
-
 	if f.Flags().Append {
 		if err := f.offsetForAppend(ctx, &offset); err != nil {
 			return 0, err
@@ -374,9 +372,10 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
 	return f.FileOperations.Write(ctx, f, src, offset)
 }
 
-// offsetForAppend sets the given offset to the end of the file.
+// offsetForAppend atomically sets the given offset to the end of the file.
 //
-// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing.
+// Precondition: the file.Dirent.Inode.appendMu mutex should be held for
+// writing.
 func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
 	uattr, err := f.Dirent.Inode.UnstableAttr(ctx)
 	if err != nil {
@@ -386,7 +385,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
 	}
 
 	// Update the offset.
-	*offset = uattr.Size
+	atomic.StoreInt64(offset, uattr.Size)
 
 	return nil
 }
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index a76d87e3a..1971cc680 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -175,90 +175,6 @@ func TestReaddirRevalidation(t *testing.T) {
 	}
 }
 
-// TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with
-// a frozen dirent tree does not make Readdir calls to the underlying files.
-// This is a regression test for b/114808269.
-func TestReaddirOverlayFrozen(t *testing.T) {
-	ctx := contexttest.Context(t)
-
-	// Create an overlay with two directories, each with two files.
-	upper := newTestRamfsDir(ctx, []dirContent{{name: "upper-file1"}, {name: "upper-file2"}}, nil)
-	lower := newTestRamfsDir(ctx, []dirContent{{name: "lower-file1"}, {name: "lower-file2"}}, nil)
-	overlayInode := fs.NewTestOverlayDir(ctx, upper, lower, false)
-
-	// Set that overlay as the root.
-	root := fs.NewDirent(ctx, overlayInode, "root")
-	ctx = &rootContext{
-		Context: ctx,
-		root:    root,
-	}
-
-	// Check that calling Readdir on the root now returns all 4 files (2
-	// from each layer in the overlay).
-	rootFile, err := root.Inode.GetFile(ctx, root, fs.FileFlags{Read: true})
-	if err != nil {
-		t.Fatalf("root.Inode.GetFile failed: %v", err)
-	}
-	defer rootFile.DecRef()
-	ser := &fs.CollectEntriesSerializer{}
-	if err := rootFile.Readdir(ctx, ser); err != nil {
-		t.Fatalf("rootFile.Readdir failed: %v", err)
-	}
-	if got, want := ser.Order, []string{".", "..", "lower-file1", "lower-file2", "upper-file1", "upper-file2"}; !reflect.DeepEqual(got, want) {
-		t.Errorf("Readdir got names %v, want %v", got, want)
-	}
-
-	// Readdir should have been called on upper and lower.
-	upperDir := upper.InodeOperations.(*dir)
-	lowerDir := lower.InodeOperations.(*dir)
-	if !upperDir.ReaddirCalled {
-		t.Errorf("upperDir.ReaddirCalled got %v, want true", upperDir.ReaddirCalled)
-	}
-	if !lowerDir.ReaddirCalled {
-		t.Errorf("lowerDir.ReaddirCalled got %v, want true", lowerDir.ReaddirCalled)
-	}
-
-	// Reset.
-	upperDir.ReaddirCalled = false
-	lowerDir.ReaddirCalled = false
-
-	// Take references on "upper-file1" and "lower-file1", pinning them in
-	// the dirent tree.
-	for _, name := range []string{"upper-file1", "lower-file1"} {
-		if _, err := root.Walk(ctx, root, name); err != nil {
-			t.Fatalf("root.Walk(%q) failed: %v", name, err)
-		}
-		// Don't drop a reference on the returned dirent so that it
-		// will stay in the tree.
-	}
-
-	// Freeze the dirent tree.
-	root.Freeze()
-
-	// Seek back to the beginning of the file.
-	if _, err := rootFile.Seek(ctx, fs.SeekSet, 0); err != nil {
-		t.Fatalf("error seeking to beginning of directory: %v", err)
-	}
-
-	// Calling Readdir on the root now will return only the pinned
-	// children.
-	ser = &fs.CollectEntriesSerializer{}
-	if err := rootFile.Readdir(ctx, ser); err != nil {
-		t.Fatalf("rootFile.Readdir failed: %v", err)
-	}
-	if got, want := ser.Order, []string{".", "..", "lower-file1", "upper-file1"}; !reflect.DeepEqual(got, want) {
-		t.Errorf("Readdir got names %v, want %v", got, want)
-	}
-
-	// Readdir should NOT have been called on upper or lower.
-	if upperDir.ReaddirCalled {
-		t.Errorf("upperDir.ReaddirCalled got %v, want false", upperDir.ReaddirCalled)
-	}
-	if lowerDir.ReaddirCalled {
-		t.Errorf("lowerDir.ReaddirCalled got %v, want false", lowerDir.ReaddirCalled)
-	}
-}
-
 type rootContext struct {
 	context.Context
 	root *fs.Dirent
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index bdba6efe5..d2dbff268 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -42,9 +42,10 @@
 //     Dirent.dirMu
 //       Dirent.mu
 //         DirentCache.mu
-//         Locks in InodeOperations implementations or overlayEntry
 //         Inode.Watches.mu (see `Inotify` for other lock ordering)
 //         MountSource.mu
+//         Inode.appendMu
+//           Locks in InodeOperations implementations or overlayEntry
 //
 // If multiple Dirent or MountSource locks must be taken, locks in the parent must be
 // taken before locks in their children.
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index 6564fd0c6..dd6f5aba6 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -18,6 +18,7 @@ import (
 	"math"
 
 	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
 )
 
 // FrameRefSetFunctions implements segment.Functions for FrameRefSet.
@@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
 func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
 	return val, val
 }
+
+// IncRefAndAccount adds a reference on the range fr. All newly inserted segments
+// are accounted as host page cache memory mappings.
+func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
+	seg, gap := refs.Find(fr.Start)
+	for {
+		switch {
+		case seg.Ok() && seg.Start() < fr.End:
+			seg = refs.Isolate(seg, fr)
+			seg.SetValue(seg.Value() + 1)
+			seg, gap = seg.NextNonEmpty()
+		case gap.Ok() && gap.Start() < fr.End:
+			newRange := gap.Range().Intersect(fr)
+			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
+			seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
+		default:
+			refs.MergeAdjacent(fr)
+			return
+		}
+	}
+}
+
+// DecRefAndAccount removes a reference on the range fr and untracks segments
+// that are removed from memory accounting.
+func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) {
+	seg := refs.FindSegment(fr.Start)
+
+	for seg.Ok() && seg.Start() < fr.End {
+		seg = refs.Isolate(seg, fr)
+		if old := seg.Value(); old == 1 {
+			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
+			seg = refs.Remove(seg).NextSegment()
+		} else {
+			seg.SetValue(old - 1)
+			seg = seg.NextSegment()
+		}
+	}
+	refs.MergeAdjacent(fr)
+}
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index daecc4ffe..1922ff08c 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -259,8 +259,8 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, ui
 
 // RemoveXattr implements fs.InodeOperations.RemoveXattr.
 func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error {
-	i.mu.RLock()
-	defer i.mu.RUnlock()
+	i.mu.Lock()
+	defer i.mu.Unlock()
 	if _, ok := i.xattrs[name]; ok {
 		delete(i.xattrs, name)
 		return nil
diff --git a/pkg/sentry/fs/g3doc/.gitignore b/pkg/sentry/fs/g3doc/.gitignore
new file mode 100644
index 000000000..2d19fc766
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
new file mode 100644
index 000000000..2ca84dd74
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -0,0 +1,263 @@
+# Foreword
+
+This document describes an on-going project to support FUSE filesystems within
+the sentry. This is intended to become the final documentation for this
+subsystem, and is therefore written in the past tense. However FUSE support is
+currently incomplete and the document will be updated as things progress.
+
+# FUSE: Filesystem in Userspace
+
+The sentry supports dispatching filesystem operations to a FUSE server, allowing
+FUSE filesystem to be used with a sandbox.
+
+## Overview
+
+FUSE has two main components:
+
+1.  A client kernel driver (canonically `fuse.ko` in Linux), which forwards
+    filesystem operations (usually initiated by syscalls) to the server.
+
+2.  A server, which is a userspace daemon that implements the actual filesystem.
+
+The sentry implements the client component, which allows a server daemon running
+within the sandbox to implement a filesystem within the sandbox.
+
+A FUSE filesystem is initialized with `mount(2)`, typically with the help of a
+utility like `fusermount(1)`. Various mount options exist for establishing
+ownership and access permissions on the filesystem, but the most important mount
+option is a file descriptor used to establish communication between the client
+and server.
+
+The FUSE device FD is obtained by opening `/dev/fuse`. During regular operation,
+the client and server use the FUSE protocol described in `fuse(4)` to service
+filesystem operations. See the "Protocol" section below for more information
+about this protocol. The core of the sentry support for FUSE is the client-side
+implementation of this protocol.
+
+## FUSE in the Sentry
+
+The sentry's FUSE client targets VFS2 and has the following components:
+
+-   An implementation of `/dev/fuse`.
+
+-   A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting
+    VFS2, one point of contention may be the lack of inodes in VFS2. We can
+    tentatively implement a kernfs-based filesystem to bridge the gap in APIs.
+    The kernfs base functionality can serve the role of the Linux inode cache
+    and, the filesystem can map VFS2 syscalls to kernfs inode operations; see
+    the `kernfs.Inode` interface.
+
+The FUSE protocol lends itself well to marshaling with `go_marshal`. The various
+request and response packets can be defined in the ABI package and converted to
+and from the wire format using `go_marshal`.
+
+### Design Goals
+
+-   While filesystem performance is always important, the sentry's FUSE support
+    is primarily concerned with compatibility, with performance as a secondary
+    concern.
+
+-   Avoiding deadlocks from a hung server daemon.
+
+-   Consider the potential for denial of service from a malicious server daemon.
+    Protecting itself from userspace is already a design goal for the sentry,
+    but needs additional consideration for FUSE. Normally, an operating system
+    doesn't rely on userspace to make progress with filesystem operations. Since
+    this changes with FUSE, it opens up the possibility of creating a chain of
+    dependencies controlled by userspace, which could affect an entire sandbox.
+    For example: a FUSE op can block a syscall, which could be holding a
+    subsystem lock, which can then block another task goroutine.
+
+### Milestones
+
+Below are some broad goals to aim for while implementing FUSE in the sentry.
+Many FUSE ops can be grouped into broad categories of functionality, and most
+ops can be implemented in parallel.
+
+#### Minimal client that can mount a trivial FUSE filesystem.
+
+-   Implement `/dev/fuse` - a character device used to establish an FD for
+    communication between the sentry and the server daemon.
+
+-   Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+
+#### Read-only mount with basic file operations
+
+-   Implement the majority of file, directory and file descriptor FUSE ops. For
+    this milestone, we can skip uncommon or complex operations like mmap, mknod,
+    file locking, poll, and extended attributes. We can stub these out along
+    with any ops that modify the filesystem. The exact list of required ops are
+    to be determined, but the goal is to mount a real filesystem as read-only,
+    and be able to read contents from the filesystem in the sentry.
+
+#### Full read-write support
+
+-   Implement the remaining FUSE ops and decide if we can omit rarely used
+    operations like ioctl.
+
+# Appendix
+
+## FUSE Protocol
+
+The FUSE protocol is a request-response protocol. All requests are initiated by
+the client. The wire-format for the protocol is raw C structs serialized to
+memory.
+
+All FUSE requests begin with the following request header:
+
+```c
+struct fuse_in_header {
+  uint32_t len;       // Length of the request, including this header.
+  uint32_t opcode;    // Requested operation.
+  uint64_t unique;    // A unique identifier for this request.
+  uint64_t nodeid;    // ID of the filesystem object being operated on.
+  uint32_t uid;       // UID of the requesting process.
+  uint32_t gid;       // GID of the requesting process.
+  uint32_t pid;       // PID of the requesting process.
+  uint32_t padding;
+};
+```
+
+The request is then followed by a payload specific to the `opcode`.
+
+All responses begin with this response header:
+
+```c
+struct fuse_out_header {
+  uint32_t len;       // Length of the response, including this header.
+  int32_t  error;     // Status of the request, 0 if success.
+  uint64_t unique;    // The unique identifier from the corresponding request.
+};
+```
+
+The response payload also depends on the request `opcode`. If `error != 0`, the
+response payload must be empty.
+
+### Operations
+
+The following is a list of all FUSE operations used in `fuse_in_header.opcode`
+as of Linux v4.4, and a brief description of their purpose. These are defined in
+`uapi/linux/fuse.h`. Many of these have a corresponding request and response
+payload struct; `fuse(4)` has details for some of these. We also note how these
+operations map to the sentry virtual filesystem.
+
+#### FUSE meta-operations
+
+These operations are specific to FUSE and don't have a corresponding action in a
+generic filesystem.
+
+-   `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the
+    first message sent by the client after mount. This is used for version and
+    feature negotiation. This is related to `mount(2)`.
+-   `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`.
+-   `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the
+    `fuse_in_header.unique` value provided in the corresponding request header.
+    The client can send at most one of these per request, and will enter an
+    uninterruptible wait for a reply. The server is expected to reply promptly.
+-   `FUSE_FORGET`: A hint to the server that server should evict the indicate
+    node from any caches. This is wired up to `(struct
+    super_operations).evict_inode` in Linux, which is in turned hooked as the
+    inode cache shrinker which is typically triggered by system memory pressure.
+-   `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`.
+
+#### Filesystem Syscalls
+
+These FUSE ops map directly to an equivalent filesystem syscall, or family of
+syscalls. The relevant syscalls have a similar name to the operation, unless
+otherwise noted.
+
+Node creation:
+
+-   `FUSE_MKNOD`
+-   `FUSE_MKDIR`
+-   `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which
+    atomically creates and opens a node.
+
+Node attributes and extended attributes:
+
+-   `FUSE_GETATTR`
+-   `FUSE_SETATTR`
+-   `FUSE_SETXATTR`
+-   `FUSE_GETXATTR`
+-   `FUSE_LISTXATTR`
+-   `FUSE_REMOVEXATTR`
+
+Node link manipulation:
+
+-   `FUSE_READLINK`
+-   `FUSE_LINK`
+-   `FUSE_SYMLINK`
+-   `FUSE_UNLINK`
+
+Directory operations:
+
+-   `FUSE_RMDIR`
+-   `FUSE_RENAME`
+-   `FUSE_RENAME2`
+-   `FUSE_OPENDIR`: `open(2)` for directories.
+-   `FUSE_RELEASEDIR`: `close(2)` for directories.
+-   `FUSE_READDIR`
+-   `FUSE_READDIRPLUS`
+-   `FUSE_FSYNCDIR`: `fsync(2)` for directories.
+-   `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is
+    reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path
+    component to a node. However the returned identifier is opaque to the
+    client. The server must remember this mapping, as this is how the client
+    will reference the node in the future.
+
+File operations:
+
+-   `FUSE_OPEN`: `open(2)` for files.
+-   `FUSE_RELEASE`: `close(2)` for files.
+-   `FUSE_FSYNC`
+-   `FUSE_FALLOCATE`
+-   `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`.
+-   `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`.
+
+File locking:
+
+-   `FUSE_GETLK`
+-   `FUSE_SETLK`
+-   `FUSE_SETLKW`
+-   `FUSE_COPY_FILE_RANGE`
+
+File descriptor operations:
+
+-   `FUSE_IOCTL`
+-   `FUSE_POLL`
+-   `FUSE_LSEEK`
+
+Filesystem operations:
+
+-   `FUSE_STATFS`
+
+#### Permissions
+
+-   `FUSE_ACCESS` is used to check if a node is accessible, as part of many
+    syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the
+    sentry.
+
+#### I/O Operations
+
+These ops are used to read and write file pages. They're used to implement both
+I/O syscalls like `read(2)`, `write(2)` and `mmap(2)`.
+
+-   `FUSE_READ`
+-   `FUSE_WRITE`
+
+#### Miscellaneous
+
+-   `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is
+    closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)`
+    syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the
+    sentry.
+-   `FUSE_BMAP`: Old address space API for block defrag. Probably not needed.
+-   `FUSE_NOTIFY_REPLY`: [TODO: what does this do?]
+
+# References
+
+-   [fuse(4) Linux manual page](https://www.man7.org/linux/man-pages/man4/fuse.4.html)
+-   [Linux kernel FUSE documentation](https://www.kernel.org/doc/html/latest/filesystems/fuse.html)
+-   [The reference implementation of the Linux FUSE (Filesystem in Userspace)
+    interface](https://github.com/libfuse/libfuse)
+-   [The kernel interface of FUSE](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fuse.h)
diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go
index 6db4b762d..d481baf77 100644
--- a/pkg/sentry/fs/gofer/attr.go
+++ b/pkg/sentry/fs/gofer/attr.go
@@ -75,10 +75,18 @@ func owner(mounter fs.FileOwner, valid p9.AttrMask, pattr p9.Attr) fs.FileOwner
 	// task's EUID/EGID.
 	owner := mounter
 	if valid.UID {
-		owner.UID = auth.KUID(pattr.UID)
+		if pattr.UID.Ok() {
+			owner.UID = auth.KUID(pattr.UID)
+		} else {
+			owner.UID = auth.KUID(auth.OverflowUID)
+		}
 	}
 	if valid.GID {
-		owner.GID = auth.KGID(pattr.GID)
+		if pattr.GID.Ok() {
+			owner.GID = auth.KGID(pattr.GID)
+		} else {
+			owner.GID = auth.KGID(auth.OverflowGID)
+		}
 	}
 	return owner
 }
diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go
index 23296f246..b2fcab127 100644
--- a/pkg/sentry/fs/gofer/file.go
+++ b/pkg/sentry/fs/gofer/file.go
@@ -37,9 +37,9 @@ var (
 	opens9P      = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a 9P file was opened from a gofer.")
 	opensHost    = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a host file was opened from a gofer.")
 	reads9P      = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.")
-	readWait9P   = metric.MustCreateNewUint64Metric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.")
+	readWait9P   = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.")
 	readsHost    = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.")
-	readWaitHost = metric.MustCreateNewUint64Metric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.")
+	readWaitHost = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.")
 )
 
 // fileOperations implements fs.FileOperations for a remote file system.
diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go
index ff96b28ba..edd6576aa 100644
--- a/pkg/sentry/fs/gofer/file_state.go
+++ b/pkg/sentry/fs/gofer/file_state.go
@@ -34,7 +34,6 @@ func (f *fileOperations) afterLoad() {
 		flags := f.flags
 		flags.Truncate = false
 
-		// TODO(b/38173783): Context is not plumbed to save/restore.
 		f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), flags, f.inodeOperations.cachingInodeOps)
 		if err != nil {
 			return fmt.Errorf("failed to re-open handle: %v", err)
diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go
index 9d41fcbdb..8ae2d78d7 100644
--- a/pkg/sentry/fs/gofer/fs.go
+++ b/pkg/sentry/fs/gofer/fs.go
@@ -60,8 +60,7 @@ const (
 	limitHostFDTranslationKey = "limit_host_fd_translation"
 
 	// overlayfsStaleRead if present closes cached readonly file after the first
-	// write. This is done to workaround a limitation of overlayfs in kernels
-	// before 4.19 where open FDs are not updated after the file is copied up.
+	// write. This is done to workaround a limitation of Linux overlayfs.
 	overlayfsStaleRead = "overlayfs_stale_read"
 )
 
diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go
index 9f7c3e89f..fc14249be 100644
--- a/pkg/sentry/fs/gofer/handles.go
+++ b/pkg/sentry/fs/gofer/handles.go
@@ -57,7 +57,6 @@ func (h *handles) DecRef() {
 				}
 			}
 		}
-		// FIXME(b/38173783): Context is not plumbed here.
 		if err := h.File.close(context.Background()); err != nil {
 			log.Warningf("error closing p9 file: %v", err)
 		}
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 1c934981b..a016c896e 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -273,7 +273,7 @@ func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handle
 	// operations on the old will see the new data. Then, make the new handle take
 	// ownereship of the old FD and mark the old readHandle to not close the FD
 	// when done.
-	if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), 0); err != nil {
+	if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), syscall.O_CLOEXEC); err != nil {
 		return err
 	}
 
@@ -710,13 +710,10 @@ func init() {
 }
 
 // AddLink implements InodeOperations.AddLink, but is currently a noop.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (*inodeOperations) AddLink() {}
 
 // DropLink implements InodeOperations.DropLink, but is currently a noop.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (*inodeOperations) DropLink() {}
 
 // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go
index 238f7804c..a3402e343 100644
--- a/pkg/sentry/fs/gofer/inode_state.go
+++ b/pkg/sentry/fs/gofer/inode_state.go
@@ -123,7 +123,6 @@ func (i *inodeFileState) afterLoad() {
 			// beforeSave.
 			return fmt.Errorf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings))
 		}
-		// TODO(b/38173783): Context is not plumbed to save/restore.
 		ctx := &dummyClockContext{context.Background()}
 
 		_, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name))
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index a35c3a23d..cf9800100 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -16,7 +16,6 @@ package gofer
 
 import (
 	"fmt"
-	"syscall"
 
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
@@ -68,7 +67,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string
 	// Get a p9.File for name.
 	qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name})
 	if err != nil {
-		if err == syscall.ENOENT {
+		if err == syserror.ENOENT {
 			if cp.cacheNegativeDirents() {
 				// Return a negative Dirent. It will stay cached until something
 				// is created over it.
@@ -207,7 +206,7 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, t
 
 	targetOpts, ok := target.InodeOperations.(*inodeOperations)
 	if !ok {
-		return syscall.EXDEV
+		return syserror.EXDEV
 	}
 
 	if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil {
@@ -251,7 +250,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 	}
 
 	if i.session().overrides == nil {
-		return nil, syscall.EOPNOTSUPP
+		return nil, syserror.EOPNOTSUPP
 	}
 
 	// Stabilize the override map while creation is in progress.
@@ -280,7 +279,7 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st
 
 	// N.B. FIFOs use major/minor numbers 0.
 	if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil {
-		if i.session().overrides == nil || err != syscall.EPERM {
+		if i.session().overrides == nil || err != syserror.EPERM {
 			return err
 		}
 		// If gofer doesn't support mknod, check if we can create an internal fifo.
@@ -427,17 +426,16 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent
 		return syserror.ENAMETOOLONG
 	}
 
-	// Unwrap the new parent to a *inodeOperations.
-	newParentInodeOperations, ok := newParent.InodeOperations.(*inodeOperations)
-	if !ok {
-		return syscall.EXDEV
+	// Don't allow renames across different mounts.
+	if newParent.MountSource != oldParent.MountSource {
+		return syserror.EXDEV
 	}
 
+	// Unwrap the new parent to a *inodeOperations.
+	newParentInodeOperations := newParent.InodeOperations.(*inodeOperations)
+
 	// Unwrap the old parent to a *inodeOperations.
-	oldParentInodeOperations, ok := oldParent.InodeOperations.(*inodeOperations)
-	if !ok {
-		return syscall.EXDEV
-	}
+	oldParentInodeOperations := oldParent.InodeOperations.(*inodeOperations)
 
 	// Do the rename.
 	if err := i.fileState.file.rename(ctx, newParentInodeOperations.fileState.file, newName); err != nil {
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index f6b3ef178..b5efc86f2 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -190,9 +190,9 @@ type session struct {
 	// be socket/pipe files. This allows unix domain sockets and named pipes to
 	// be used with paths that belong to a gofer.
 	//
-	// TODO(gvisor.dev/issue/1200): there are few possible races with someone
-	// stat'ing the file and another deleting it concurrently, where the file
-	// will not be reported as socket file.
+	// There are a few possible races with someone stat'ing the file and another
+	// deleting it concurrently, where the file will not be reported as socket
+	// file.
 	overrides *overrideMaps `state:"wait"`
 }
 
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 111da59f9..2d398b753 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -104,7 +104,6 @@ func (s *session) afterLoad() {
 	// If private unix sockets are enabled, create and fill the session's endpoint
 	// maps.
 	if opts.privateunixsocket {
-		// TODO(b/38173783): Context is not plumbed to save/restore.
 		ctx := &dummyClockContext{context.Background()}
 
 		if err = s.restoreEndpointMaps(ctx); err != nil {
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index 10ba2f5f0..40f2c1cad 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -47,6 +47,8 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.
 	return &endpoint{inode, i.fileState.file.file, path}
 }
 
+// LINT.IfChange
+
 // endpoint is a Gofer-backed transport.BoundEndpoint.
 //
 // An endpoint's lifetime is the time between when InodeOperations.BoundEndpoint()
@@ -146,3 +148,5 @@ func (e *endpoint) Release() {
 func (e *endpoint) Passcred() bool {
 	return false
 }
+
+// LINT.ThenChange(../../fsimpl/gofer/socket.go)
diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go
index 2d8d3a2ea..47a6c69bf 100644
--- a/pkg/sentry/fs/gofer/util.go
+++ b/pkg/sentry/fs/gofer/util.go
@@ -20,17 +20,29 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 )
 
 func utimes(ctx context.Context, file contextFile, ts fs.TimeSpec) error {
 	if ts.ATimeOmit && ts.MTimeOmit {
 		return nil
 	}
+
+	// Replace requests to use the "system time" with the current time to
+	// ensure that timestamps remain consistent with the remote
+	// filesystem.
+	now := ktime.NowFromContext(ctx)
+	if ts.ATimeSetSystemTime {
+		ts.ATime = now
+	}
+	if ts.MTimeSetSystemTime {
+		ts.MTime = now
+	}
 	mask := p9.SetAttrMask{
 		ATime:              !ts.ATimeOmit,
-		ATimeNotSystemTime: !ts.ATimeSetSystemTime,
+		ATimeNotSystemTime: true,
 		MTime:              !ts.MTimeOmit,
-		MTimeNotSystemTime: !ts.MTimeSetSystemTime,
+		MTimeNotSystemTime: true,
 	}
 	as, ans := ts.ATime.Unix()
 	ms, mns := ts.MTime.Unix()
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 21003ea45..aabce6cc9 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -10,7 +10,7 @@ go_library(
         "descriptor_state.go",
         "device.go",
         "file.go",
-        "fs.go",
+        "host.go",
         "inode.go",
         "inode_state.go",
         "ioctl_unsafe.go",
@@ -62,18 +62,15 @@ go_test(
     size = "small",
     srcs = [
         "descriptor_test.go",
-        "fs_test.go",
         "inode_test.go",
         "socket_test.go",
         "wait_test.go",
     ],
     library = ":host",
     deps = [
-        "//pkg/context",
         "//pkg/fd",
         "//pkg/fdnotifier",
         "//pkg/sentry/contexttest",
-        "//pkg/sentry/fs",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
index 1658979fc..39299b7e4 100644
--- a/pkg/sentry/fs/host/control.go
+++ b/pkg/sentry/fs/host/control.go
@@ -23,6 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 )
 
+// LINT.IfChange
+
 type scmRights struct {
 	fds []int
 }
@@ -76,7 +78,7 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File {
 		}
 
 		// Create the file backed by hostFD.
-		file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx))
+		file, err := NewFile(ctx, fd)
 		if err != nil {
 			ctx.Warningf("Error creating file from host FD: %v", err)
 			break
@@ -91,3 +93,5 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File {
 	}
 	return files
 }
+
+// LINT.ThenChange(../../fsimpl/host/control.go)
diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go
index 2a4d1b291..cfdce6a74 100644
--- a/pkg/sentry/fs/host/descriptor.go
+++ b/pkg/sentry/fs/host/descriptor.go
@@ -16,7 +16,6 @@ package host
 
 import (
 	"fmt"
-	"path"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/fdnotifier"
@@ -28,12 +27,9 @@ import (
 //
 // +stateify savable
 type descriptor struct {
-	// donated is true if the host fd was donated by another process.
-	donated bool
-
 	// If origFD >= 0, it is the host fd that this file was originally created
 	// from, which must be available at time of restore. The FD can be closed
-	// after descriptor is created. Only set if donated is true.
+	// after descriptor is created.
 	origFD int
 
 	// wouldBlock is true if value (below) points to a file that can
@@ -41,15 +37,13 @@ type descriptor struct {
 	wouldBlock bool
 
 	// value is the wrapped host fd. It is never saved or restored
-	// directly. How it is restored depends on whether it was
-	// donated and the fs.MountSource it was originally
-	// opened/created from.
+	// directly.
 	value int `state:"nosave"`
 }
 
 // newDescriptor returns a wrapped host file descriptor. On success,
 // the descriptor is registered for event notifications with queue.
-func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) {
+func newDescriptor(fd int, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) {
 	ownedFD := fd
 	origFD := -1
 	if saveable {
@@ -69,7 +63,6 @@ func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *
 		}
 	}
 	return &descriptor{
-		donated:    donated,
 		origFD:     origFD,
 		wouldBlock: wouldBlock,
 		value:      ownedFD,
@@ -77,25 +70,11 @@ func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *
 }
 
 // initAfterLoad initializes the value of the descriptor after Load.
-func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error {
-	if d.donated {
-		var err error
-		d.value, err = syscall.Dup(d.origFD)
-		if err != nil {
-			return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err)
-		}
-	} else {
-		name, ok := mo.inodeMappings[id]
-		if !ok {
-			return fmt.Errorf("failed to find path for inode number %d", id)
-		}
-		fullpath := path.Join(mo.root, name)
-
-		var err error
-		d.value, err = open(nil, fullpath)
-		if err != nil {
-			return fmt.Errorf("failed to open %q: %v", fullpath, err)
-		}
+func (d *descriptor) initAfterLoad(id uint64, queue *waiter.Queue) error {
+	var err error
+	d.value, err = syscall.Dup(d.origFD)
+	if err != nil {
+		return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err)
 	}
 	if d.wouldBlock {
 		if err := syscall.SetNonblock(d.value, true); err != nil {
diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go
index 8167390a9..e880582ab 100644
--- a/pkg/sentry/fs/host/descriptor_state.go
+++ b/pkg/sentry/fs/host/descriptor_state.go
@@ -16,7 +16,7 @@ package host
 
 // beforeSave is invoked by stateify.
 func (d *descriptor) beforeSave() {
-	if d.donated && d.origFD < 0 {
+	if d.origFD < 0 {
 		panic("donated file descriptor cannot be saved")
 	}
 }
diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go
index 4205981f5..d8e4605b6 100644
--- a/pkg/sentry/fs/host/descriptor_test.go
+++ b/pkg/sentry/fs/host/descriptor_test.go
@@ -47,10 +47,10 @@ func TestDescriptorRelease(t *testing.T) {
 
 			// FD ownership is transferred to the descritor.
 			queue := &waiter.Queue{}
-			d, err := newDescriptor(fd, false /* donated*/, tc.saveable, tc.wouldBlock, queue)
+			d, err := newDescriptor(fd, tc.saveable, tc.wouldBlock, queue)
 			if err != nil {
 				syscall.Close(fd)
-				t.Fatalf("newDescriptor(%d, %t, false, %t, queue) failed, err: %v", fd, tc.saveable, tc.wouldBlock, err)
+				t.Fatalf("newDescriptor(%d, %t, %t, queue) failed, err: %v", fd, tc.saveable, tc.wouldBlock, err)
 			}
 			if tc.saveable {
 				if d.origFD < 0 {
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
index e08f56d04..3e48b8b2c 100644
--- a/pkg/sentry/fs/host/file.go
+++ b/pkg/sentry/fs/host/file.go
@@ -60,8 +60,8 @@ var _ fs.FileOperations = (*fileOperations)(nil)
 // The returned File cannot be saved, since there is no guarantee that the same
 // FD will exist or represent the same file at time of restore. If such a
 // guarantee does exist, use ImportFile instead.
-func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) {
-	return newFileFromDonatedFD(ctx, fd, mounter, false, false)
+func NewFile(ctx context.Context, fd int) (*fs.File, error) {
+	return newFileFromDonatedFD(ctx, fd, false, false)
 }
 
 // ImportFile creates a new File backed by the provided host file descriptor.
@@ -71,13 +71,13 @@ func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error
 // If the returned file is saved, it will be restored by re-importing the FD
 // originally passed to ImportFile. It is the restorer's responsibility to
 // ensure that the FD represents the same file.
-func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) {
-	return newFileFromDonatedFD(ctx, fd, mounter, true, isTTY)
+func ImportFile(ctx context.Context, fd int, isTTY bool) (*fs.File, error) {
+	return newFileFromDonatedFD(ctx, fd, true, isTTY)
 }
 
 // newFileFromDonatedFD returns an fs.File from a donated FD. If the FD is
 // saveable, then saveable is true.
-func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) {
+func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool) (*fs.File, error) {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(donated, &s); err != nil {
 		return nil, err
@@ -101,8 +101,8 @@ func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner
 		})
 		return s, nil
 	default:
-		msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */)
-		inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */)
+		msrc := fs.NewNonCachingMountSource(ctx, &filesystem{}, fs.MountSourceFlags{})
+		inode, err := newInode(ctx, msrc, donated, saveable)
 		if err != nil {
 			return nil, err
 		}
diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go
deleted file mode 100644
index d3e8e3a36..000000000
--- a/pkg/sentry/fs/host/fs.go
+++ /dev/null
@@ -1,339 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package host implements an fs.Filesystem for files backed by host
-// file descriptors.
-package host
-
-import (
-	"fmt"
-	"path"
-	"path/filepath"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-)
-
-// FilesystemName is the name under which Filesystem is registered.
-const FilesystemName = "whitelistfs"
-
-const (
-	// whitelistKey is the mount option containing a comma-separated list
-	// of host paths to whitelist.
-	whitelistKey = "whitelist"
-
-	// rootPathKey is the mount option containing the root path of the
-	// mount.
-	rootPathKey = "root"
-
-	// dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership.
-	dontTranslateOwnershipKey = "dont_translate_ownership"
-)
-
-// maxTraversals determines link traversals in building the whitelist.
-const maxTraversals = 10
-
-// Filesystem is a pseudo file system that is only available during the setup
-// to lock down the configurations. This filesystem should only be mounted at root.
-//
-// Think twice before exposing this to applications.
-//
-// +stateify savable
-type Filesystem struct {
-	// whitelist is a set of host paths to whitelist.
-	paths []string
-}
-
-var _ fs.Filesystem = (*Filesystem)(nil)
-
-// Name is the identifier of this file system.
-func (*Filesystem) Name() string {
-	return FilesystemName
-}
-
-// AllowUserMount prohibits users from using mount(2) with this file system.
-func (*Filesystem) AllowUserMount() bool {
-	return false
-}
-
-// AllowUserList allows this filesystem to be listed in /proc/filesystems.
-func (*Filesystem) AllowUserList() bool {
-	return true
-}
-
-// Flags returns that there is nothing special about this file system.
-func (*Filesystem) Flags() fs.FilesystemFlags {
-	return 0
-}
-
-// Mount returns an fs.Inode exposing the host file system.  It is intended to be locked
-// down in PreExec below.
-func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) {
-	// Parse generic comma-separated key=value options.
-	options := fs.GenericMountSourceOptions(data)
-
-	// Grab the whitelist if one was specified.
-	// TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow
-	// no whitelist.
-	if wl, ok := options[whitelistKey]; ok {
-		f.paths = strings.Split(wl, "|")
-		delete(options, whitelistKey)
-	}
-
-	// If the rootPath was set, use it. Othewise default to the root of the
-	// host fs.
-	rootPath := "/"
-	if rp, ok := options[rootPathKey]; ok {
-		rootPath = rp
-		delete(options, rootPathKey)
-
-		// We must relativize the whitelisted paths to the new root.
-		for i, p := range f.paths {
-			rel, err := filepath.Rel(rootPath, p)
-			if err != nil {
-				return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath)
-			}
-			f.paths[i] = path.Join("/", rel)
-		}
-	}
-	fd, err := open(nil, rootPath)
-	if err != nil {
-		return nil, fmt.Errorf("failed to find root: %v", err)
-	}
-
-	var dontTranslateOwnership bool
-	if v, ok := options[dontTranslateOwnershipKey]; ok {
-		b, err := strconv.ParseBool(v)
-		if err != nil {
-			return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err)
-		}
-		dontTranslateOwnership = b
-		delete(options, dontTranslateOwnershipKey)
-	}
-
-	// Fail if the caller passed us more options than we know about.
-	if len(options) > 0 {
-		return nil, fmt.Errorf("unsupported mount options: %v", options)
-	}
-
-	// The mounting EUID/EGID will be cached by this file system. This will
-	// be used to assign ownership to files that we own.
-	owner := fs.FileOwnerFromContext(ctx)
-
-	// Construct the host file system mount and inode.
-	msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership)
-	return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */)
-}
-
-// InstallWhitelist locks down the MountNamespace to only the currently installed
-// Dirents and the given paths.
-func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error {
-	return installWhitelist(ctx, m, f.paths)
-}
-
-func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error {
-	if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") {
-		// Warning will be logged during filter installation if the empty
-		// whitelist matters (allows for host file access).
-		return nil
-	}
-
-	// Done tracks entries already added.
-	done := make(map[string]bool)
-	root := m.Root()
-	defer root.DecRef()
-
-	for i := 0; i < len(paths); i++ {
-		// Make sure the path is absolute. This is a sanity check.
-		if !path.IsAbs(paths[i]) {
-			return fmt.Errorf("path %q is not absolute", paths[i])
-		}
-
-		// We need to add all the intermediate paths, in case one of
-		// them is a symlink that needs to be resolved.
-		for j := 1; j <= len(paths[i]); j++ {
-			if j < len(paths[i]) && paths[i][j] != '/' {
-				continue
-			}
-			current := paths[i][:j]
-
-			// Lookup the given component in the tree.
-			remainingTraversals := uint(maxTraversals)
-			d, err := m.FindLink(ctx, root, nil, current, &remainingTraversals)
-			if err != nil {
-				log.Warningf("populate failed for %q: %v", current, err)
-				continue
-			}
-
-			// It's critical that this DecRef happens after the
-			// freeze below. This ensures that the dentry is in
-			// place to be frozen. Otherwise, we freeze without
-			// these entries.
-			defer d.DecRef()
-
-			// Expand the last component if necessary.
-			if current == paths[i] {
-				// Is it a directory or symlink?
-				sattr := d.Inode.StableAttr
-				if fs.IsDir(sattr) {
-					for name := range childDentAttrs(ctx, d) {
-						paths = append(paths, path.Join(current, name))
-					}
-				}
-				if fs.IsSymlink(sattr) {
-					// Only expand symlinks once. The
-					// folder structure may contain
-					// recursive symlinks and we don't want
-					// to end up infinitely expanding this
-					// symlink. This is safe because this
-					// is the last component. If a later
-					// path wants to symlink something
-					// beneath this symlink that will still
-					// be handled by the FindLink above.
-					if done[current] {
-						continue
-					}
-
-					s, err := d.Inode.Readlink(ctx)
-					if err != nil {
-						log.Warningf("readlink failed for %q: %v", current, err)
-						continue
-					}
-					if path.IsAbs(s) {
-						paths = append(paths, s)
-					} else {
-						target := path.Join(path.Dir(current), s)
-						paths = append(paths, target)
-					}
-				}
-			}
-
-			// Only report this one once even though we may look
-			// it up more than once. If we whitelist /a/b,/a then
-			// /a will be "done" when it is looked up for /a/b,
-			// however we still need to expand all of its contents
-			// when whitelisting /a.
-			if !done[current] {
-				log.Debugf("whitelisted: %s", current)
-			}
-			done[current] = true
-		}
-	}
-
-	// Freeze the mount tree in place. This prevents any new paths from
-	// being opened and any old ones from being removed. If we do provide
-	// tmpfs mounts, we'll want to freeze/thaw those separately.
-	m.Freeze()
-	return nil
-}
-
-func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr {
-	dirname, _ := d.FullName(nil /* root */)
-	dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
-	if err != nil {
-		log.Warningf("failed to open directory %q: %v", dirname, err)
-		return nil
-	}
-	dir.DecRef()
-	var stubSerializer fs.CollectEntriesSerializer
-	if err := dir.Readdir(ctx, &stubSerializer); err != nil {
-		log.Warningf("failed to iterate on host directory %q: %v", dirname, err)
-		return nil
-	}
-	delete(stubSerializer.Entries, ".")
-	delete(stubSerializer.Entries, "..")
-	return stubSerializer.Entries
-}
-
-// newMountSource constructs a new host fs.MountSource
-// relative to a root path. The root should match the mount point.
-func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource {
-	return fs.NewMountSource(ctx, &superOperations{
-		root:                   root,
-		inodeMappings:          make(map[uint64]string),
-		mounter:                mounter,
-		dontTranslateOwnership: dontTranslateOwnership,
-	}, filesystem, flags)
-}
-
-// superOperations implements fs.MountSourceOperations.
-//
-// +stateify savable
-type superOperations struct {
-	fs.SimpleMountSourceOperations
-
-	// root is the path of the mount point. All inode mappings
-	// are relative to this root.
-	root string
-
-	// inodeMappings contains mappings of fs.Inodes associated
-	// with this MountSource to paths under root.
-	inodeMappings map[uint64]string
-
-	// mounter is the cached EUID/EGID that mounted this file system.
-	mounter fs.FileOwner
-
-	// dontTranslateOwnership indicates whether to not translate file
-	// ownership.
-	//
-	// By default, files/directories owned by the sandbox uses UID/GID
-	// of the mounter. For files/directories that are not owned by the
-	// sandbox, file UID/GID is translated to a UID/GID which cannot
-	// be mapped in the sandboxed application's user namespace. The
-	// UID/GID will look like the nobody UID/GID (65534) but is not
-	// strictly owned by the user "nobody".
-	//
-	// If whitelistfs is a lower filesystem in an overlay, set
-	// dont_translate_ownership=true in mount options.
-	dontTranslateOwnership bool
-}
-
-var _ fs.MountSourceOperations = (*superOperations)(nil)
-
-// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings.
-func (m *superOperations) ResetInodeMappings() {
-	m.inodeMappings = make(map[uint64]string)
-}
-
-// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping.
-func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) {
-	// This is very unintuitive. We *CANNOT* trust the inode's StableAttrs,
-	// because overlay copyUp may have changed them out from under us.
-	// So much for "immutable".
-	sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr
-	m.inodeMappings[sattr.InodeID] = path
-}
-
-// Keep implements fs.MountSourceOperations.Keep.
-//
-// TODO(b/72455313,b/77596690): It is possible to change the permissions on a
-// host file while it is in the dirent cache (say from RO to RW), but it is not
-// possible to re-open the file with more relaxed permissions, since the host
-// FD is already open and stored in the inode.
-//
-// Using the dirent LRU cache increases the odds that this bug is encountered.
-// Since host file access is relatively fast anyways, we disable the LRU cache
-// for host fs files.  Once we can properly deal with permissions changes and
-// re-opening host files, we should revisit whether or not to make use of the
-// LRU cache.
-func (*superOperations) Keep(*fs.Dirent) bool {
-	return false
-}
-
-func init() {
-	fs.RegisterFilesystem(&Filesystem{})
-}
diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go
deleted file mode 100644
index 3111d2df9..000000000
--- a/pkg/sentry/fs/host/fs_test.go
+++ /dev/null
@@ -1,380 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package host
-
-import (
-	"fmt"
-	"io/ioutil"
-	"os"
-	"path"
-	"reflect"
-	"sort"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-)
-
-// newTestMountNamespace creates a MountNamespace with a ramfs root.
-// It returns the host folder created, which should be removed when done.
-func newTestMountNamespace(t *testing.T) (*fs.MountNamespace, string, error) {
-	p, err := ioutil.TempDir("", "root")
-	if err != nil {
-		return nil, "", err
-	}
-
-	fd, err := open(nil, p)
-	if err != nil {
-		os.RemoveAll(p)
-		return nil, "", err
-	}
-	ctx := contexttest.Context(t)
-	root, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false)
-	if err != nil {
-		os.RemoveAll(p)
-		return nil, "", err
-	}
-	mm, err := fs.NewMountNamespace(ctx, root)
-	if err != nil {
-		os.RemoveAll(p)
-		return nil, "", err
-	}
-	return mm, p, nil
-}
-
-// createTestDirs populates the root with some test files and directories.
-// /a/a1.txt
-// /a/a2.txt
-// /b/b1.txt
-// /b/c/c1.txt
-// /symlinks/normal.txt
-// /symlinks/to_normal.txt -> /symlinks/normal.txt
-// /symlinks/recursive -> /symlinks
-func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) error {
-	r := m.Root()
-	defer r.DecRef()
-
-	if err := r.CreateDirectory(ctx, r, "a", fs.FilePermsFromMode(0777)); err != nil {
-		return err
-	}
-
-	a, err := r.Walk(ctx, r, "a")
-	if err != nil {
-		return err
-	}
-	defer a.DecRef()
-
-	a1, err := a.Create(ctx, r, "a1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	a1.DecRef()
-
-	a2, err := a.Create(ctx, r, "a2.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	a2.DecRef()
-
-	if err := r.CreateDirectory(ctx, r, "b", fs.FilePermsFromMode(0777)); err != nil {
-		return err
-	}
-
-	b, err := r.Walk(ctx, r, "b")
-	if err != nil {
-		return err
-	}
-	defer b.DecRef()
-
-	b1, err := b.Create(ctx, r, "b1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	b1.DecRef()
-
-	if err := b.CreateDirectory(ctx, r, "c", fs.FilePermsFromMode(0777)); err != nil {
-		return err
-	}
-
-	c, err := b.Walk(ctx, r, "c")
-	if err != nil {
-		return err
-	}
-	defer c.DecRef()
-
-	c1, err := c.Create(ctx, r, "c1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	c1.DecRef()
-
-	if err := r.CreateDirectory(ctx, r, "symlinks", fs.FilePermsFromMode(0777)); err != nil {
-		return err
-	}
-
-	symlinks, err := r.Walk(ctx, r, "symlinks")
-	if err != nil {
-		return err
-	}
-	defer symlinks.DecRef()
-
-	normal, err := symlinks.Create(ctx, r, "normal.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
-	if err != nil {
-		return err
-	}
-	normal.DecRef()
-
-	if err := symlinks.CreateLink(ctx, r, "/symlinks/normal.txt", "to_normal.txt"); err != nil {
-		return err
-	}
-
-	return symlinks.CreateLink(ctx, r, "/symlinks", "recursive")
-}
-
-// allPaths returns a slice of all paths of entries visible in the rootfs.
-func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base string) ([]string, error) {
-	var paths []string
-	root := m.Root()
-	defer root.DecRef()
-
-	maxTraversals := uint(1)
-	d, err := m.FindLink(ctx, root, nil, base, &maxTraversals)
-	if err != nil {
-		t.Logf("FindLink failed for %q", base)
-		return paths, err
-	}
-	defer d.DecRef()
-
-	if fs.IsDir(d.Inode.StableAttr) {
-		dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
-		if err != nil {
-			return nil, fmt.Errorf("failed to open directory %q: %v", base, err)
-		}
-		iter, ok := dir.FileOperations.(fs.DirIterator)
-		if !ok {
-			return nil, fmt.Errorf("cannot directly iterate on host directory %q", base)
-		}
-		dirCtx := &fs.DirCtx{
-			Serializer: noopDentrySerializer{},
-		}
-		if _, err := fs.DirentReaddir(ctx, d, iter, root, dirCtx, 0); err != nil {
-			return nil, err
-		}
-		for name := range dirCtx.DentAttrs() {
-			if name == "." || name == ".." {
-				continue
-			}
-
-			fullName := path.Join(base, name)
-			paths = append(paths, fullName)
-
-			// Recurse.
-			subpaths, err := allPaths(ctx, t, m, fullName)
-			if err != nil {
-				return paths, err
-			}
-			paths = append(paths, subpaths...)
-		}
-	}
-
-	return paths, nil
-}
-
-type noopDentrySerializer struct{}
-
-func (noopDentrySerializer) CopyOut(string, fs.DentAttr) error {
-	return nil
-}
-func (noopDentrySerializer) Written() int {
-	return 4096
-}
-
-// pathsEqual returns true if the two string slices contain the same entries.
-func pathsEqual(got, want []string) bool {
-	sort.Strings(got)
-	sort.Strings(want)
-
-	if len(got) != len(want) {
-		return false
-	}
-
-	for i := range got {
-		if got[i] != want[i] {
-			return false
-		}
-	}
-
-	return true
-}
-
-func TestWhitelist(t *testing.T) {
-	for _, test := range []struct {
-		// description of the test.
-		desc string
-		// paths are the paths to whitelist
-		paths []string
-		// want are all of the directory entries that should be
-		// visible (nothing beyond this set should be visible).
-		want []string
-	}{
-		{
-			desc:  "root",
-			paths: []string{"/"},
-			want:  []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt", "/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt", "/symlinks/recursive"},
-		},
-		{
-			desc:  "top-level directories",
-			paths: []string{"/a", "/b"},
-			want:  []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "nested directories (1/2)",
-			paths: []string{"/b", "/b/c"},
-			want:  []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "nested directories (2/2)",
-			paths: []string{"/b/c", "/b"},
-			want:  []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "single file",
-			paths: []string{"/b/c/c1.txt"},
-			want:  []string{"/b", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "single file and directory",
-			paths: []string{"/a/a1.txt", "/b/c"},
-			want:  []string{"/a", "/a/a1.txt", "/b", "/b/c", "/b/c/c1.txt"},
-		},
-		{
-			desc:  "symlink",
-			paths: []string{"/symlinks/to_normal.txt"},
-			want:  []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt"},
-		},
-		{
-			desc:  "recursive symlink",
-			paths: []string{"/symlinks/recursive/normal.txt"},
-			want:  []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/recursive"},
-		},
-	} {
-		t.Run(test.desc, func(t *testing.T) {
-			m, p, err := newTestMountNamespace(t)
-			if err != nil {
-				t.Errorf("Failed to create MountNamespace: %v", err)
-			}
-			defer os.RemoveAll(p)
-
-			ctx := withRoot(contexttest.RootContext(t), m.Root())
-			if err := createTestDirs(ctx, t, m); err != nil {
-				t.Errorf("Failed to create test dirs: %v", err)
-			}
-
-			if err := installWhitelist(ctx, m, test.paths); err != nil {
-				t.Errorf("installWhitelist(%v) err got %v want nil", test.paths, err)
-			}
-
-			got, err := allPaths(ctx, t, m, "/")
-			if err != nil {
-				t.Fatalf("Failed to lookup paths (whitelisted: %v): %v", test.paths, err)
-			}
-
-			if !pathsEqual(got, test.want) {
-				t.Errorf("For paths %v got %v want %v", test.paths, got, test.want)
-			}
-		})
-	}
-}
-
-func TestRootPath(t *testing.T) {
-	// Create a temp dir, which will be the root of our mounted fs.
-	rootPath, err := ioutil.TempDir(os.TempDir(), "root")
-	if err != nil {
-		t.Fatalf("TempDir failed: %v", err)
-	}
-	defer os.RemoveAll(rootPath)
-
-	// Create two files inside the new root, one which will be whitelisted
-	// and one not.
-	whitelisted, err := ioutil.TempFile(rootPath, "white")
-	if err != nil {
-		t.Fatalf("TempFile failed: %v", err)
-	}
-	if _, err := ioutil.TempFile(rootPath, "black"); err != nil {
-		t.Fatalf("TempFile failed: %v", err)
-	}
-
-	// Create a mount with a root path and single whitelisted file.
-	hostFS := &Filesystem{}
-	ctx := contexttest.Context(t)
-	data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name())
-	inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data, nil)
-	if err != nil {
-		t.Fatalf("Mount failed: %v", err)
-	}
-	mm, err := fs.NewMountNamespace(ctx, inode)
-	if err != nil {
-		t.Fatalf("NewMountNamespace failed: %v", err)
-	}
-	if err := hostFS.InstallWhitelist(ctx, mm); err != nil {
-		t.Fatalf("InstallWhitelist failed: %v", err)
-	}
-
-	// Get the contents of the root directory.
-	rootDir := mm.Root()
-	rctx := withRoot(ctx, rootDir)
-	f, err := rootDir.Inode.GetFile(rctx, rootDir, fs.FileFlags{})
-	if err != nil {
-		t.Fatalf("GetFile failed: %v", err)
-	}
-	c := &fs.CollectEntriesSerializer{}
-	if err := f.Readdir(rctx, c); err != nil {
-		t.Fatalf("Readdir failed: %v", err)
-	}
-
-	// We should have only our whitelisted file, plus the dots.
-	want := []string{path.Base(whitelisted.Name()), ".", ".."}
-	got := c.Order
-	sort.Strings(want)
-	sort.Strings(got)
-	if !reflect.DeepEqual(got, want) {
-		t.Errorf("Readdir got %v, wanted %v", got, want)
-	}
-}
-
-type rootContext struct {
-	context.Context
-	root *fs.Dirent
-}
-
-// withRoot returns a copy of ctx with the given root.
-func withRoot(ctx context.Context, root *fs.Dirent) context.Context {
-	return &rootContext{
-		Context: ctx,
-		root:    root,
-	}
-}
-
-// Value implements Context.Value.
-func (rc rootContext) Value(key interface{}) interface{} {
-	switch key {
-	case fs.CtxRoot:
-		rc.root.IncRef()
-		return rc.root
-	default:
-		return rc.Context.Value(key)
-	}
-}
diff --git a/pkg/sentry/fs/host/host.go b/pkg/sentry/fs/host/host.go
new file mode 100644
index 000000000..081ba1dd8
--- /dev/null
+++ b/pkg/sentry/fs/host/host.go
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host supports file descriptors imported directly.
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// filesystem is a host filesystem.
+//
+// +stateify savable
+type filesystem struct{}
+
+func init() {
+	fs.RegisterFilesystem(&filesystem{})
+}
+
+// FilesystemName is the name under which the filesystem is registered.
+const FilesystemName = "host"
+
+// Name is the name of the filesystem.
+func (*filesystem) Name() string {
+	return FilesystemName
+}
+
+// Mount returns an error. Mounting hostfs is not allowed.
+func (*filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, dataObj interface{}) (*fs.Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// AllowUserMount prohibits users from using mount(2) with this file system.
+func (*filesystem) AllowUserMount() bool {
+	return false
+}
+
+// AllowUserList prohibits this filesystem to be listed in /proc/filesystems.
+func (*filesystem) AllowUserList() bool {
+	return false
+}
+
+// Flags returns that there is nothing special about this file system.
+func (*filesystem) Flags() fs.FilesystemFlags {
+	return 0
+}
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 6fa39caab..62f1246aa 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -17,12 +17,10 @@ package host
 import (
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/secio"
-	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -69,9 +67,6 @@ type inodeOperations struct {
 //
 // +stateify savable
 type inodeFileState struct {
-	// Common file system state.
-	mops *superOperations `state:"wait"`
-
 	// descriptor is the backing host FD.
 	descriptor *descriptor `state:"wait"`
 
@@ -160,7 +155,7 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err
 	if err := syscall.Fstat(i.FD(), &s); err != nil {
 		return fs.UnstableAttr{}, err
 	}
-	return unstableAttr(i.mops, &s), nil
+	return unstableAttr(&s), nil
 }
 
 // Allocate implements fsutil.CachedFileObject.Allocate.
@@ -172,7 +167,7 @@ func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error
 var _ fs.InodeOperations = (*inodeOperations)(nil)
 
 // newInode returns a new fs.Inode backed by the host FD.
-func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) {
+func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool) (*fs.Inode, error) {
 	// Retrieve metadata.
 	var s syscall.Stat_t
 	err := syscall.Fstat(fd, &s)
@@ -181,24 +176,17 @@ func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool,
 	}
 
 	fileState := &inodeFileState{
-		mops:  msrc.MountSourceOperations.(*superOperations),
 		sattr: stableAttr(&s),
 	}
 
 	// Initialize the wrapped host file descriptor.
-	fileState.descriptor, err = newDescriptor(
-		fd,
-		donated,
-		saveable,
-		wouldBlock(&s),
-		&fileState.queue,
-	)
+	fileState.descriptor, err = newDescriptor(fd, saveable, wouldBlock(&s), &fileState.queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// Build the fs.InodeOperations.
-	uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s)
+	uattr := unstableAttr(&s)
 	iops := &inodeOperations{
 		fileState: fileState,
 		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{
@@ -232,54 +220,23 @@ func (i *inodeOperations) Release(context.Context) {
 
 // Lookup implements fs.InodeOperations.Lookup.
 func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
-	// Get a new FD relative to i at name.
-	fd, err := open(i, name)
-	if err != nil {
-		if err == syserror.ENOENT {
-			return nil, syserror.ENOENT
-		}
-		return nil, err
-	}
-
-	inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
-	if err != nil {
-		return nil, err
-	}
-
-	// Return the fs.Dirent.
-	return fs.NewDirent(ctx, inode, name), nil
+	return nil, syserror.ENOENT
 }
 
 // Create implements fs.InodeOperations.Create.
 func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) {
-	// Create a file relative to i at name.
-	//
-	// N.B. We always open this file O_RDWR regardless of flags because a
-	// future GetFile might want more access. Open allows this regardless
-	// of perm.
-	fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode())
-	if err != nil {
-		return nil, err
-	}
-
-	inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
-	if err != nil {
-		return nil, err
-	}
+	return nil, syserror.EPERM
 
-	d := fs.NewDirent(ctx, inode, name)
-	defer d.DecRef()
-	return inode.GetFile(ctx, d, flags)
 }
 
 // CreateDirectory implements fs.InodeOperations.CreateDirectory.
 func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
-	return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode()))
+	return syserror.EPERM
 }
 
 // CreateLink implements fs.InodeOperations.CreateLink.
 func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error {
-	return createLink(i.fileState.FD(), oldname, newname)
+	return syserror.EPERM
 }
 
 // CreateHardLink implements fs.InodeOperations.CreateHardLink.
@@ -294,25 +251,17 @@ func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePe
 
 // Remove implements fs.InodeOperations.Remove.
 func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error {
-	return unlinkAt(i.fileState.FD(), name, false /* dir */)
+	return syserror.EPERM
 }
 
 // RemoveDirectory implements fs.InodeOperations.RemoveDirectory.
 func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error {
-	return unlinkAt(i.fileState.FD(), name, true /* dir */)
+	return syserror.EPERM
 }
 
 // Rename implements fs.InodeOperations.Rename.
 func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
-	op, ok := oldParent.InodeOperations.(*inodeOperations)
-	if !ok {
-		return syscall.EXDEV
-	}
-	np, ok := newParent.InodeOperations.(*inodeOperations)
-	if !ok {
-		return syscall.EXDEV
-	}
-	return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName)
+	return syserror.EPERM
 }
 
 // Bind implements fs.InodeOperations.Bind.
@@ -448,82 +397,17 @@ func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) {
 }
 
 // AddLink implements fs.InodeOperations.AddLink.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (i *inodeOperations) AddLink() {}
 
 // DropLink implements fs.InodeOperations.DropLink.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (i *inodeOperations) DropLink() {}
 
 // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
-// FIXME(b/63117438): Remove this from InodeOperations altogether.
 func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
 
 // readdirAll returns all of the directory entries in i.
 func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) {
-	i.readdirMu.Lock()
-	defer i.readdirMu.Unlock()
-
-	fd := i.fileState.FD()
-
-	// syscall.ReadDirent will use getdents, which will seek the file past
-	// the last directory entry. To read the directory entries a second
-	// time, we need to seek back to the beginning.
-	if _, err := syscall.Seek(fd, 0, 0); err != nil {
-		if err == syscall.ESPIPE {
-			// All directories should be seekable. If this file
-			// isn't seekable, it is not a directory and we should
-			// return that more sane error.
-			err = syscall.ENOTDIR
-		}
-		return nil, err
-	}
-
-	names := make([]string, 0, 100)
-	for {
-		// Refill the buffer if necessary
-		if d.bufp >= d.nbuf {
-			d.bufp = 0
-			// ReadDirent will just do a sys_getdents64 to the kernel.
-			n, err := syscall.ReadDirent(fd, d.buf)
-			if err != nil {
-				return nil, err
-			}
-			if n == 0 {
-				break // EOF
-			}
-			d.nbuf = n
-		}
-
-		var nb int
-		// Parse the dirent buffer we just get and return the directory names along
-		// with the number of bytes consumed in the buffer.
-		nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names)
-		d.bufp += nb
-	}
-
-	entries := make(map[string]fs.DentAttr)
-	for _, filename := range names {
-		// Lookup the type and host device and inode.
-		stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW)
-		if lerr == syscall.ENOENT {
-			// File disappeared between readdir and lstat.
-			// Just treat it as if it didn't exist.
-			continue
-		}
-
-		// There was a serious problem, we should probably report it.
-		if lerr != nil {
-			return nil, lerr
-		}
-
-		entries[filename] = fs.DentAttr{
-			Type: nodeType(&stat),
-			InodeID: hostFileDevice.Map(device.MultiDeviceKey{
-				Device: stat.Dev,
-				Inode:  stat.Ino,
-			}),
-		}
-	}
-	return entries, nil
+	// We only support non-directory file descriptors that have been
+	// imported, so just claim that this isn't a directory, even if it is.
+	return nil, syscall.ENOTDIR
 }
diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go
index 299e0e0b0..1adbd4562 100644
--- a/pkg/sentry/fs/host/inode_state.go
+++ b/pkg/sentry/fs/host/inode_state.go
@@ -18,29 +18,14 @@ import (
 	"fmt"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
-// beforeSave is invoked by stateify.
-func (i *inodeFileState) beforeSave() {
-	if !i.queue.IsEmpty() {
-		panic("event queue must be empty")
-	}
-	if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
-		uattr, err := i.unstableAttr(context.Background())
-		if err != nil {
-			panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)})
-		}
-		i.savedUAttr = &uattr
-	}
-}
-
 // afterLoad is invoked by stateify.
 func (i *inodeFileState) afterLoad() {
 	// Initialize the descriptor value.
-	if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil {
+	if err := i.descriptor.initAfterLoad(i.sattr.InodeID, &i.queue); err != nil {
 		panic(fmt.Sprintf("failed to load value of descriptor: %v", err))
 	}
 
@@ -61,19 +46,4 @@ func (i *inodeFileState) afterLoad() {
 		// change across save and restore, error out.
 		panic(fs.ErrCorruption{fmt.Errorf("host %s conflict in host device mappings: %s", key, hostFileDevice)})
 	}
-
-	if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
-		env, ok := fs.CurrentRestoreEnvironment()
-		if !ok {
-			panic("missing restore environment")
-		}
-		uattr := unstableAttr(i.mops, &s)
-		if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size {
-			panic(fs.ErrCorruption{fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)})
-		}
-		if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime {
-			panic(fs.ErrCorruption{fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)})
-		}
-		i.savedUAttr = nil
-	}
 }
diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go
index 7221bc825..c507f57eb 100644
--- a/pkg/sentry/fs/host/inode_test.go
+++ b/pkg/sentry/fs/host/inode_test.go
@@ -15,79 +15,12 @@
 package host
 
 import (
-	"io/ioutil"
-	"os"
-	"path"
 	"syscall"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
-// TestMultipleReaddir verifies that multiple Readdir calls return the same
-// thing if they use different dir contexts.
-func TestMultipleReaddir(t *testing.T) {
-	p, err := ioutil.TempDir("", "readdir")
-	if err != nil {
-		t.Fatalf("Failed to create test dir: %v", err)
-	}
-	defer os.RemoveAll(p)
-
-	f, err := os.Create(path.Join(p, "a.txt"))
-	if err != nil {
-		t.Fatalf("Failed to create a.txt: %v", err)
-	}
-	f.Close()
-
-	f, err = os.Create(path.Join(p, "b.txt"))
-	if err != nil {
-		t.Fatalf("Failed to create b.txt: %v", err)
-	}
-	f.Close()
-
-	fd, err := open(nil, p)
-	if err != nil {
-		t.Fatalf("Failed to open %q: %v", p, err)
-	}
-	ctx := contexttest.Context(t)
-	n, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false)
-	if err != nil {
-		t.Fatalf("Failed to create inode: %v", err)
-	}
-
-	dirent := fs.NewDirent(ctx, n, "readdir")
-	openFile, err := n.GetFile(ctx, dirent, fs.FileFlags{Read: true})
-	if err != nil {
-		t.Fatalf("Failed to get file: %v", err)
-	}
-	defer openFile.DecRef()
-
-	c1 := &fs.DirCtx{DirCursor: new(string)}
-	if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, dirent, c1, 0); err != nil {
-		t.Fatalf("First Readdir failed: %v", err)
-	}
-
-	c2 := &fs.DirCtx{DirCursor: new(string)}
-	if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, dirent, c2, 0); err != nil {
-		t.Errorf("Second Readdir failed: %v", err)
-	}
-
-	if _, ok := c1.DentAttrs()["a.txt"]; !ok {
-		t.Errorf("want a.txt in first Readdir, got %v", c1.DentAttrs())
-	}
-	if _, ok := c1.DentAttrs()["b.txt"]; !ok {
-		t.Errorf("want b.txt in first Readdir, got %v", c1.DentAttrs())
-	}
-
-	if _, ok := c2.DentAttrs()["a.txt"]; !ok {
-		t.Errorf("want a.txt in second Readdir, got %v", c2.DentAttrs())
-	}
-	if _, ok := c2.DentAttrs()["b.txt"]; !ok {
-		t.Errorf("want b.txt in second Readdir, got %v", c2.DentAttrs())
-	}
-}
-
 // TestCloseFD verifies fds will be closed.
 func TestCloseFD(t *testing.T) {
 	var p [2]int
@@ -99,7 +32,7 @@ func TestCloseFD(t *testing.T) {
 
 	// Use the write-end because we will detect if it's closed on the read end.
 	ctx := contexttest.Context(t)
-	file, err := NewFile(ctx, p[1], fs.RootOwner)
+	file, err := NewFile(ctx, p[1])
 	if err != nil {
 		t.Fatalf("Failed to create File: %v", err)
 	}
diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go
index 271582e54..150ac8e19 100644
--- a/pkg/sentry/fs/host/ioctl_unsafe.go
+++ b/pkg/sentry/fs/host/ioctl_unsafe.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 )
 
+// LINT.IfChange
+
 func ioctlGetTermios(fd int) (*linux.Termios, error) {
 	var t linux.Termios
 	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
@@ -54,3 +56,5 @@ func ioctlSetWinsize(fd int, w *linux.Winsize) error {
 	}
 	return nil
 }
+
+// LINT.ThenChange(../../fsimpl/host/ioctl_unsafe.go)
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 06fc2d80a..b6e94583e 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -37,6 +37,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
 //
 // N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
@@ -388,3 +390,5 @@ func (c *ConnectedEndpoint) Release() {
 
 // CloseUnread implements transport.ConnectedEndpoint.CloseUnread.
 func (c *ConnectedEndpoint) CloseUnread() {}
+
+// LINT.ThenChange(../../fsimpl/host/socket.go)
diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go
index af6955675..5c18dbd5e 100644
--- a/pkg/sentry/fs/host/socket_iovec.go
+++ b/pkg/sentry/fs/host/socket_iovec.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // maxIovs is the maximum number of iovecs to pass to the host.
 var maxIovs = linux.UIO_MAXIOV
 
@@ -111,3 +113,5 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec
 
 	return total, iovecs, nil, err
 }
+
+// LINT.ThenChange(../../fsimpl/host/socket_iovec.go)
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
index eb4afe520..affdbcacb 100644
--- a/pkg/sentry/fs/host/socket_test.go
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -199,14 +199,14 @@ func TestListen(t *testing.T) {
 }
 
 func TestPasscred(t *testing.T) {
-	e := ConnectedEndpoint{}
+	e := &ConnectedEndpoint{}
 	if got, want := e.Passcred(), false; got != want {
 		t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want)
 	}
 }
 
 func TestGetLocalAddress(t *testing.T) {
-	e := ConnectedEndpoint{path: "foo"}
+	e := &ConnectedEndpoint{path: "foo"}
 	want := tcpip.FullAddress{Addr: tcpip.Address("foo")}
 	if got, err := e.GetLocalAddress(); err != nil || got != want {
 		t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil)
@@ -214,7 +214,7 @@ func TestGetLocalAddress(t *testing.T) {
 }
 
 func TestQueuedSize(t *testing.T) {
-	e := ConnectedEndpoint{}
+	e := &ConnectedEndpoint{}
 	tests := []struct {
 		name string
 		f    func() int64
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
index f3bbed7ea..5d4f312cf 100644
--- a/pkg/sentry/fs/host/socket_unsafe.go
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -19,6 +19,8 @@ import (
 	"unsafe"
 )
 
+// LINT.IfChange
+
 // fdReadVec receives from fd to bufs.
 //
 // If the total length of bufs is > maxlen, fdReadVec will do a partial read
@@ -99,3 +101,5 @@ func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int6
 
 	return int64(n), length, err
 }
+
+// LINT.ThenChange(../../fsimpl/host/socket_unsafe.go)
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 3f218b4a7..cb91355ab 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -26,6 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // TTYFileOperations implements fs.FileOperations for a host file descriptor
 // that wraps a TTY FD.
 //
@@ -43,6 +45,7 @@ type TTYFileOperations struct {
 	// connected to this TTY.
 	fgProcessGroup *kernel.ProcessGroup
 
+	// termios contains the terminal attributes for this TTY.
 	termios linux.KernelTermios
 }
 
@@ -357,3 +360,5 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e
 	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
 	return kernel.ERESTARTSYS
 }
+
+// LINT.ThenChange(../../fsimpl/host/tty.go)
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
index e37e687c6..1b0356930 100644
--- a/pkg/sentry/fs/host/util.go
+++ b/pkg/sentry/fs/host/util.go
@@ -16,7 +16,6 @@ package host
 
 import (
 	"os"
-	"path"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -28,45 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-func open(parent *inodeOperations, name string) (int, error) {
-	if parent == nil && !path.IsAbs(name) {
-		return -1, syserror.EINVAL
-	}
-	name = path.Clean(name)
-
-	// Don't follow through symlinks.
-	flags := syscall.O_NOFOLLOW
-
-	if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil {
-		return fd, nil
-	}
-	// Retry as read-only.
-	if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil {
-		return fd, nil
-	}
-
-	// Retry as write-only.
-	if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil {
-		return fd, nil
-	}
-
-	// Retry as a symlink, by including O_PATH as an option.
-	fd, err := openAt(parent, name, linux.O_PATH|flags, 0)
-	if err == nil {
-		return fd, nil
-	}
-
-	// Everything failed.
-	return -1, err
-}
-
-func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) {
-	if parent == nil {
-		return syscall.Open(name, flags, uint32(perm))
-	}
-	return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm))
-}
-
 func nodeType(s *syscall.Stat_t) fs.InodeType {
 	switch x := (s.Mode & syscall.S_IFMT); x {
 	case syscall.S_IFLNK:
@@ -107,51 +67,19 @@ func stableAttr(s *syscall.Stat_t) fs.StableAttr {
 	}
 }
 
-func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner {
-	// User requested no translation, just return actual owner.
-	if mo.dontTranslateOwnership {
-		return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)}
+func owner(s *syscall.Stat_t) fs.FileOwner {
+	return fs.FileOwner{
+		UID: auth.KUID(s.Uid),
+		GID: auth.KGID(s.Gid),
 	}
-
-	// Show only IDs relevant to the sandboxed task. I.e. if we not own the
-	// file, no sandboxed task can own the file. In that case, we
-	// use OverflowID for UID, implying that the IDs are not mapped in the
-	// "root" user namespace.
-	//
-	// E.g.
-	// sandbox's host EUID/EGID is 1/1.
-	// some_dir's host UID/GID is 2/1.
-	// Task that mounted this fs has virtualized EUID/EGID 5/5.
-	//
-	// If you executed `ls -n` in the sandboxed task, it would show:
-	// drwxwrxwrx [...] 65534 5 [...] some_dir
-
-	// Files are owned by OverflowID by default.
-	owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)}
-
-	// If we own file on host, let mounting task's initial EUID own
-	// the file.
-	if s.Uid == hostUID {
-		owner.UID = mo.mounter.UID
-	}
-
-	// If our group matches file's group, make file's group match
-	// the mounting task's initial EGID.
-	for _, gid := range hostGIDs {
-		if s.Gid == gid {
-			owner.GID = mo.mounter.GID
-			break
-		}
-	}
-	return owner
 }
 
-func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
+func unstableAttr(s *syscall.Stat_t) fs.UnstableAttr {
 	return fs.UnstableAttr{
 		Size:             s.Size,
 		Usage:            s.Blocks * 512,
 		Perms:            fs.FilePermsFromMode(linux.FileMode(s.Mode)),
-		Owner:            owner(mo, s),
+		Owner:            owner(s),
 		AccessTime:       ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
 		ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
 		StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
@@ -165,6 +93,8 @@ type dirInfo struct {
 	bufp int    // location of next record in buf.
 }
 
+// LINT.IfChange
+
 // isBlockError unwraps os errors and checks if they are caused by EAGAIN or
 // EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock.
 func isBlockError(err error) bool {
@@ -177,6 +107,8 @@ func isBlockError(err error) bool {
 	return false
 }
 
+// LINT.ThenChange(../../fsimpl/host/util.go)
+
 func hostEffectiveKIDs() (uint32, []uint32, error) {
 	gids, err := os.Getgroups()
 	if err != nil {
diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go
index 3ab36b088..23bd35d64 100644
--- a/pkg/sentry/fs/host/util_unsafe.go
+++ b/pkg/sentry/fs/host/util_unsafe.go
@@ -26,26 +26,6 @@ import (
 // NulByte is a single NUL byte. It is passed to readlinkat as an empty string.
 var NulByte byte = '\x00'
 
-func createLink(fd int, name string, linkName string) error {
-	namePtr, err := syscall.BytePtrFromString(name)
-	if err != nil {
-		return err
-	}
-	linkNamePtr, err := syscall.BytePtrFromString(linkName)
-	if err != nil {
-		return err
-	}
-	_, _, errno := syscall.Syscall(
-		syscall.SYS_SYMLINKAT,
-		uintptr(unsafe.Pointer(namePtr)),
-		uintptr(fd),
-		uintptr(unsafe.Pointer(linkNamePtr)))
-	if errno != 0 {
-		return errno
-	}
-	return nil
-}
-
 func readLink(fd int) (string, error) {
 	// Buffer sizing copied from os.Readlink.
 	for l := 128; ; l *= 2 {
@@ -66,27 +46,6 @@ func readLink(fd int) (string, error) {
 	}
 }
 
-func unlinkAt(fd int, name string, dir bool) error {
-	namePtr, err := syscall.BytePtrFromString(name)
-	if err != nil {
-		return err
-	}
-	var flags uintptr
-	if dir {
-		flags = linux.AT_REMOVEDIR
-	}
-	_, _, errno := syscall.Syscall(
-		syscall.SYS_UNLINKAT,
-		uintptr(fd),
-		uintptr(unsafe.Pointer(namePtr)),
-		flags,
-	)
-	if errno != 0 {
-		return errno
-	}
-	return nil
-}
-
 func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec {
 	if omit {
 		return syscall.Timespec{0, linux.UTIME_OMIT}
diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go
index d49c3a635..ce397a5e3 100644
--- a/pkg/sentry/fs/host/wait_test.go
+++ b/pkg/sentry/fs/host/wait_test.go
@@ -20,7 +20,6 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -34,7 +33,7 @@ func TestWait(t *testing.T) {
 	defer syscall.Close(fds[1])
 
 	ctx := contexttest.Context(t)
-	file, err := NewFile(ctx, fds[0], fs.RootOwner)
+	file, err := NewFile(ctx, fds[0])
 	if err != nil {
 		syscall.Close(fds[0])
 		t.Fatalf("NewFile failed: %v", err)
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 55fb71c16..a34fbc946 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -102,7 +102,6 @@ func (i *Inode) DecRef() {
 
 // destroy releases the Inode and releases the msrc reference taken.
 func (i *Inode) destroy() {
-	// FIXME(b/38173783): Context is not plumbed here.
 	ctx := context.Background()
 	if err := i.WriteOut(ctx); err != nil {
 		// FIXME(b/65209558): Mark as warning again once noatime is
@@ -397,8 +396,6 @@ func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) {
 // AddLink calls i.InodeOperations.AddLink.
 func (i *Inode) AddLink() {
 	if i.overlay != nil {
-		// FIXME(b/63117438): Remove this from InodeOperations altogether.
-		//
 		// This interface is only used by ramfs to update metadata of
 		// children. These filesystems should _never_ have overlay
 		// Inodes cached as children. So explicitly disallow this
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 5ada33a32..537c8d257 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -231,7 +231,8 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st
 	upperFile.Dirent.Inode.IncRef()
 	entry, err := newOverlayEntry(ctx, upperFile.Dirent.Inode, nil, false)
 	if err != nil {
-		cleanupUpper(ctx, o.upper, name)
+		werr := fmt.Errorf("newOverlayEntry failed: %v", err)
+		cleanupUpper(ctx, o.upper, name, werr)
 		return nil, err
 	}
 
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index 928c90aa0..e3a715c1f 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -143,7 +143,10 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i
 	}
 
 	var writeLen int64
-	for event := i.events.Front(); event != nil; event = event.Next() {
+	for it := i.events.Front(); it != nil; {
+		event := it
+		it = it.Next()
+
 		// Does the buffer have enough remaining space to hold the event we're
 		// about to write out?
 		if dst.NumBytes() < int64(event.sizeOf()) {
diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go
index e672a438c..a3d10770b 100644
--- a/pkg/sentry/fs/mount_test.go
+++ b/pkg/sentry/fs/mount_test.go
@@ -36,11 +36,12 @@ func mountPathsAre(root *Dirent, got []*Mount, want ...string) error {
 	gotPaths := make(map[string]struct{}, len(got))
 	gotStr := make([]string, len(got))
 	for i, g := range got {
-		groot := g.Root()
-		name, _ := groot.FullName(root)
-		groot.DecRef()
-		gotStr[i] = name
-		gotPaths[name] = struct{}{}
+		if groot := g.Root(); groot != nil {
+			name, _ := groot.FullName(root)
+			groot.DecRef()
+			gotStr[i] = name
+			gotPaths[name] = struct{}{}
+		}
 	}
 	if len(got) != len(want) {
 		return fmt.Errorf("mount paths are different, got: %q, want: %q", gotStr, want)
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 574a2cc91..3f2bd0e87 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -17,13 +17,9 @@ package fs
 import (
 	"fmt"
 	"math"
-	"path"
-	"strings"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -100,10 +96,14 @@ func newUndoMount(d *Dirent) *Mount {
 	}
 }
 
-// Root returns the root dirent of this mount. Callers must call DecRef on the
-// returned dirent.
+// Root returns the root dirent of this mount.
+//
+// This may return nil if the mount has already been free. Callers must handle this
+// case appropriately. If non-nil, callers must call DecRef on the returned *Dirent.
 func (m *Mount) Root() *Dirent {
-	m.root.IncRef()
+	if !m.root.TryIncRef() {
+		return nil
+	}
 	return m.root
 }
 
@@ -269,19 +269,6 @@ func (mns *MountNamespace) DecRef() {
 	mns.DecRefWithDestructor(mns.destroy)
 }
 
-// Freeze freezes the entire mount tree.
-func (mns *MountNamespace) Freeze() {
-	mns.mu.Lock()
-	defer mns.mu.Unlock()
-
-	// We only want to freeze Dirents with active references, not Dirents referenced
-	// by a mount's MountSource.
-	mns.flushMountSourceRefsLocked()
-
-	// Freeze the entire shebang.
-	mns.root.Freeze()
-}
-
 // withMountLocked prevents further walks to `node`, because `node` is about to
 // be a mount point.
 func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error {
@@ -634,71 +621,3 @@ func (mns *MountNamespace) SyncAll(ctx context.Context) {
 	defer mns.mu.Unlock()
 	mns.root.SyncAll(ctx)
 }
-
-// ResolveExecutablePath resolves the given executable name given a set of
-// paths that might contain it.
-func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name string, paths []string) (string, error) {
-	// Absolute paths can be used directly.
-	if path.IsAbs(name) {
-		return name, nil
-	}
-
-	// Paths with '/' in them should be joined to the working directory, or
-	// to the root if working directory is not set.
-	if strings.IndexByte(name, '/') > 0 {
-		if wd == "" {
-			wd = "/"
-		}
-		if !path.IsAbs(wd) {
-			return "", fmt.Errorf("working directory %q must be absolute", wd)
-		}
-		return path.Join(wd, name), nil
-	}
-
-	// Otherwise, We must lookup the name in the paths, starting from the
-	// calling context's root directory.
-	root := RootFromContext(ctx)
-	if root == nil {
-		// Caller has no root. Don't bother traversing anything.
-		return "", syserror.ENOENT
-	}
-	defer root.DecRef()
-	for _, p := range paths {
-		binPath := path.Join(p, name)
-		traversals := uint(linux.MaxSymlinkTraversals)
-		d, err := mns.FindInode(ctx, root, nil, binPath, &traversals)
-		if err == syserror.ENOENT || err == syserror.EACCES {
-			// Didn't find it here.
-			continue
-		}
-		if err != nil {
-			return "", err
-		}
-		defer d.DecRef()
-
-		// Check that it is a regular file.
-		if !IsRegular(d.Inode.StableAttr) {
-			continue
-		}
-
-		// Check whether we can read and execute the found file.
-		if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil {
-			log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
-			continue
-		}
-		return path.Join("/", p, name), nil
-	}
-	return "", syserror.ENOENT
-}
-
-// GetPath returns the PATH as a slice of strings given the environment
-// variables.
-func GetPath(env []string) []string {
-	const prefix = "PATH="
-	for _, e := range env {
-		if strings.HasPrefix(e, prefix) {
-			return strings.Split(strings.TrimPrefix(e, prefix), ":")
-		}
-	}
-	return nil
-}
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 280093c5e..77c2c5c0e 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -36,6 +36,7 @@ go_library(
         "//pkg/sentry/fs/proc/device",
         "//pkg/sentry/fs/proc/seqfile",
         "//pkg/sentry/fs/ramfs",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go
index 465b47da9..91617267d 100644
--- a/pkg/sentry/fs/proc/meminfo.go
+++ b/pkg/sentry/fs/proc/meminfo.go
@@ -58,12 +58,16 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle)
 
 	var buf bytes.Buffer
 	fmt.Fprintf(&buf, "MemTotal:       %8d kB\n", totalSize/1024)
-	memFree := (totalSize - totalUsage) / 1024
+	memFree := totalSize - totalUsage
+	if memFree > totalSize {
+		// Underflow.
+		memFree = 0
+	}
 	// We use MemFree as MemAvailable because we don't swap.
 	// TODO(rahat): When reclaim is implemented the value of MemAvailable
 	// should change.
-	fmt.Fprintf(&buf, "MemFree:        %8d kB\n", memFree)
-	fmt.Fprintf(&buf, "MemAvailable:   %8d kB\n", memFree)
+	fmt.Fprintf(&buf, "MemFree:        %8d kB\n", memFree/1024)
+	fmt.Fprintf(&buf, "MemAvailable:   %8d kB\n", memFree/1024)
 	fmt.Fprintf(&buf, "Buffers:               0 kB\n") // memory usage by block devices
 	fmt.Fprintf(&buf, "Cached:         %8d kB\n", (file+snapshot.Tmpfs)/1024)
 	// Emulate a system with no swap, which disables inactivation of anon pages.
diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index c10888100..1fc9c703c 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -60,13 +60,15 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
 	})
 	for _, m := range ms {
 		mroot := m.Root()
+		if mroot == nil {
+			continue // No longer valid.
+		}
 		mountPath, desc := mroot.FullName(rootDir)
 		mroot.DecRef()
 		if !desc {
 			// MountSources that are not descendants of the chroot jail are ignored.
 			continue
 		}
-
 		fn(mountPath, m)
 	}
 }
@@ -91,6 +93,12 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 
 	var buf bytes.Buffer
 	forEachMount(mif.t, func(mountPath string, m *fs.Mount) {
+		mroot := m.Root()
+		if mroot == nil {
+			return // No longer valid.
+		}
+		defer mroot.DecRef()
+
 		// Format:
 		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
 		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
@@ -107,9 +115,6 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se
 
 		// (3) Major:Minor device ID. We don't have a superblock, so we
 		// just use the root inode device number.
-		mroot := m.Root()
-		defer mroot.DecRef()
-
 		sa := mroot.Inode.StableAttr
 		fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
 
@@ -165,7 +170,8 @@ func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
 	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
 	// the cgroup name in the options. For now we just read that from the
 	// path.
-	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	//
+	// TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
 	// should get this value from the cgroup itself, and not rely on the
 	// path.
 	if msrc.FilesystemType == "cgroup" {
@@ -207,6 +213,9 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan
 		//
 		// The "needs dump"and fsck flags are always 0, which is allowed.
 		root := m.Root()
+		if root == nil {
+			return // No longer valid.
+		}
 		defer root.DecRef()
 
 		flags := root.Inode.MountSource.Flags
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 6f2775344..bd18177d4 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -40,44 +40,48 @@ import (
 
 // LINT.IfChange
 
-// newNet creates a new proc net entry.
-func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode {
+// newNetDir creates a new proc net entry.
+func newNetDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	k := t.Kernel()
+
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	if s := t.NetworkNamespace().Stack(); s != nil {
+		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
+		// network namespace.
 		contents = map[string]*fs.Inode{
-			"dev":  seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
-			"snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc),
+			"dev":  seqfile.NewSeqFileInode(t, &netDev{s: s}, msrc),
+			"snmp": seqfile.NewSeqFileInode(t, &netSnmp{s: s}, msrc),
 
 			// The following files are simple stubs until they are
 			// implemented in netstack, if the file contains a
 			// header the stub is just the header otherwise it is
 			// an empty file.
-			"arp": newStaticProcInode(ctx, msrc, []byte("IP address       HW type     Flags       HW address            Mask     Device\n")),
+			"arp": newStaticProcInode(t, msrc, []byte("IP address       HW type     Flags       HW address            Mask     Device\n")),
 
-			"netlink":   newStaticProcInode(ctx, msrc, []byte("sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n")),
-			"netstat":   newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")),
-			"packet":    newStaticProcInode(ctx, msrc, []byte("sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n")),
-			"protocols": newStaticProcInode(ctx, msrc, []byte("protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")),
+			"netlink":   newStaticProcInode(t, msrc, []byte("sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n")),
+			"netstat":   newStaticProcInode(t, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")),
+			"packet":    newStaticProcInode(t, msrc, []byte("sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n")),
+			"protocols": newStaticProcInode(t, msrc, []byte("protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")),
 			// Linux sets psched values to: nsec per usec, psched
 			// tick in ns, 1000000, high res timer ticks per sec
 			// (ClockGetres returns 1ns resolution).
-			"psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
-			"ptype":  newStaticProcInode(ctx, msrc, []byte("Type Device      Function\n")),
-			"route":  seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc),
-			"tcp":    seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc),
-			"udp":    seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc),
-			"unix":   seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc),
+			"psched": newStaticProcInode(t, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
+			"ptype":  newStaticProcInode(t, msrc, []byte("Type Device      Function\n")),
+			"route":  seqfile.NewSeqFileInode(t, &netRoute{s: s}, msrc),
+			"tcp":    seqfile.NewSeqFileInode(t, &netTCP{k: k}, msrc),
+			"udp":    seqfile.NewSeqFileInode(t, &netUDP{k: k}, msrc),
+			"unix":   seqfile.NewSeqFileInode(t, &netUnix{k: k}, msrc),
 		}
 
 		if s.SupportsIPv6() {
-			contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)
-			contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte(""))
-			contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc)
-			contents["udp6"] = newStaticProcInode(ctx, msrc, []byte("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"))
+			contents["if_inet6"] = seqfile.NewSeqFileInode(t, &ifinet6{s: s}, msrc)
+			contents["ipv6_route"] = newStaticProcInode(t, msrc, []byte(""))
+			contents["tcp6"] = seqfile.NewSeqFileInode(t, &netTCP6{k: k}, msrc)
+			contents["udp6"] = newStaticProcInode(t, msrc, []byte("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"))
 		}
 	}
-	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
-	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
+	d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
+	return newProcInode(t, d, msrc, fs.SpecialDirectory, t)
 }
 
 // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6.
@@ -834,4 +838,4 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 	return data, 0
 }
 
-// LINT.ThenChange(../../fsimpl/proc/tasks_net.go)
+// LINT.ThenChange(../../fsimpl/proc/task_net.go)
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go
index c8abb5052..c659224a7 100644
--- a/pkg/sentry/fs/proc/proc.go
+++ b/pkg/sentry/fs/proc/proc.go
@@ -70,6 +70,7 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string
 		"loadavg":     seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc),
 		"meminfo":     seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc),
 		"mounts":      newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/mounts"), msrc, fs.Symlink, nil),
+		"net":         newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/net"), msrc, fs.Symlink, nil),
 		"self":        newSelf(ctx, pidns, msrc),
 		"stat":        seqfile.NewSeqFileInode(ctx, &statData{k}, msrc),
 		"thread-self": newThreadSelf(ctx, pidns, msrc),
@@ -86,7 +87,6 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string
 	}
 
 	// Add more contents that need proc to be initialized.
-	p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc))
 	p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc))
 
 	return newProcInode(ctx, p, msrc, fs.SpecialDirectory, nil), nil
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 0772d4ae4..702fdd392 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -80,7 +80,7 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
+func (*tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
@@ -196,7 +196,7 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f
 }
 
 // Truncate implements fs.InodeOperations.Truncate.
-func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
+func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
 	return nil
 }
 
@@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine
 
 func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"ipv4": p.newSysNetIPv4Dir(ctx, msrc, s),
 			"core": p.newSysNetCore(ctx, msrc, s),
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index ca020e11e..4bbe90198 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -56,14 +57,23 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) {
 	return m, nil
 }
 
+func checkTaskState(t *kernel.Task) error {
+	switch t.ExitState() {
+	case kernel.TaskExitZombie:
+		return syserror.EACCES
+	case kernel.TaskExitDead:
+		return syserror.ESRCH
+	}
+	return nil
+}
+
 // taskDir represents a task-level directory.
 //
 // +stateify savable
 type taskDir struct {
 	ramfs.Dir
 
-	t     *kernel.Task
-	pidns *kernel.PIDNamespace
+	t *kernel.Task
 }
 
 var _ fs.InodeOperations = (*taskDir)(nil)
@@ -71,24 +81,27 @@ var _ fs.InodeOperations = (*taskDir)(nil)
 // newTaskDir creates a new proc task entry.
 func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
 	contents := map[string]*fs.Inode{
-		"auxv":      newAuxvec(t, msrc),
-		"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
-		"comm":      newComm(t, msrc),
-		"environ":   newExecArgInode(t, msrc, environExecArg),
-		"exe":       newExe(t, msrc),
-		"fd":        newFdDir(t, msrc),
-		"fdinfo":    newFdInfoDir(t, msrc),
-		"gid_map":   newGIDMap(t, msrc),
-		"io":        newIO(t, msrc, isThreadGroup),
-		"maps":      newMaps(t, msrc),
-		"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
-		"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
-		"ns":        newNamespaceDir(t, msrc),
-		"smaps":     newSmaps(t, msrc),
-		"stat":      newTaskStat(t, msrc, isThreadGroup, p.pidns),
-		"statm":     newStatm(t, msrc),
-		"status":    newStatus(t, msrc, p.pidns),
-		"uid_map":   newUIDMap(t, msrc),
+		"auxv":          newAuxvec(t, msrc),
+		"cmdline":       newExecArgInode(t, msrc, cmdlineExecArg),
+		"comm":          newComm(t, msrc),
+		"environ":       newExecArgInode(t, msrc, environExecArg),
+		"exe":           newExe(t, msrc),
+		"fd":            newFdDir(t, msrc),
+		"fdinfo":        newFdInfoDir(t, msrc),
+		"gid_map":       newGIDMap(t, msrc),
+		"io":            newIO(t, msrc, isThreadGroup),
+		"maps":          newMaps(t, msrc),
+		"mountinfo":     seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
+		"mounts":        seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"net":           newNetDir(t, msrc),
+		"ns":            newNamespaceDir(t, msrc),
+		"oom_score":     newOOMScore(t, msrc),
+		"oom_score_adj": newOOMScoreAdj(t, msrc),
+		"smaps":         newSmaps(t, msrc),
+		"stat":          newTaskStat(t, msrc, isThreadGroup, p.pidns),
+		"statm":         newStatm(t, msrc),
+		"status":        newStatus(t, msrc, p.pidns),
+		"uid_map":       newUIDMap(t, msrc),
 	}
 	if isThreadGroup {
 		contents["task"] = p.newSubtasks(t, msrc)
@@ -249,12 +262,13 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
 	return newProcInode(t, exeSymlink, msrc, fs.Symlink, t)
 }
 
-func (e *exe) executable() (d *fs.Dirent, err error) {
+func (e *exe) executable() (file fsbridge.File, err error) {
+	if err := checkTaskState(e.t); err != nil {
+		return nil, err
+	}
 	e.t.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
-			// TODO(b/34851096): Check shouldn't allow Readlink once the
-			// Task is zombied.
 			err = syserror.EACCES
 			return
 		}
@@ -262,9 +276,9 @@ func (e *exe) executable() (d *fs.Dirent, err error) {
 		// The MemoryManager may be destroyed, in which case
 		// MemoryManager.destroy will simply set the executable to nil
 		// (with locks held).
-		d = mm.Executable()
-		if d == nil {
-			err = syserror.ENOENT
+		file = mm.Executable()
+		if file == nil {
+			err = syserror.ESRCH
 		}
 	})
 	return
@@ -283,15 +297,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
 	}
 	defer exec.DecRef()
 
-	root := fs.RootFromContext(ctx)
-	if root == nil {
-		// This doesn't correspond to anything in Linux because the vfs is
-		// global there.
-		return "", syserror.EINVAL
-	}
-	defer root.DecRef()
-	n, _ := exec.FullName(root)
-	return n, nil
+	return exec.PathnameWithDeleted(ctx), nil
 }
 
 // namespaceSymlink represents a symlink in the namespacefs, such as the files
@@ -317,11 +323,22 @@ func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.
 	return newProcInode(t, n, msrc, fs.Symlink, t)
 }
 
+// Readlink reads the symlink value.
+func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+	if err := checkTaskState(n.t); err != nil {
+		return "", err
+	}
+	return n.Symlink.Readlink(ctx, inode)
+}
+
 // Getlink implements fs.InodeOperations.Getlink.
 func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) {
 	if !kernel.ContextCanTrace(ctx, n.t, false) {
 		return nil, syserror.EACCES
 	}
+	if err := checkTaskState(n.t); err != nil {
+		return nil, err
+	}
 
 	// Create a new regular file to fake the namespace file.
 	iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC)
@@ -803,4 +820,95 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc
 	return int64(n), err
 }
 
+// newOOMScore returns a oom_score file. It is a stub that always returns 0.
+// TODO(gvisor.dev/issue/1967)
+func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	return newStaticProcInode(t, msrc, []byte("0\n"))
+}
+
+// oomScoreAdj is a file containing the oom_score adjustment for a task.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	fsutil.SimpleFileInode
+
+	t *kernel.Task
+}
+
+// +stateify savable
+type oomScoreAdjFile struct {
+	fsutil.FileGenericSeek          `state:"nosave"`
+	fsutil.FileNoIoctl              `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNoopRelease          `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+	waiter.AlwaysReady              `state:"nosave"`
+
+	t *kernel.Task
+}
+
+// newOOMScoreAdj returns a oom_score_adj file.
+func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	i := &oomScoreAdj{
+		SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
+		t:               t,
+	}
+	return newProcInode(t, i, msrc, fs.SpecialFile, t)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	if f.t.ExitState() == kernel.TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	var buf bytes.Buffer
+	fmt.Fprintf(&buf, "%d\n", f.t.OOMScoreAdj())
+	if offset >= int64(buf.Len()) {
+		return 0, io.EOF
+	}
+	n, err := dst.CopyOut(ctx, buf.Bytes()[offset:])
+	return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if f.t.ExitState() == kernel.TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	if err := f.t.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
+
 // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index d5be56c3f..bc117ca6a 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -44,9 +44,6 @@ const (
 	// lookup.
 	cacheRevalidate = "revalidate"
 
-	// TODO(edahlgren/mpratt): support a tmpfs size limit.
-	// size = "size"
-
 	// Permissions that exceed modeMask will be rejected.
 	modeMask = 01777
 
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index 25abbc151..1dc75291d 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -39,7 +39,7 @@ var (
 	opensRO  = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.")
 	opensW   = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.")
 	reads    = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.")
-	readWait = metric.MustCreateNewUint64Metric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.")
+	readWait = metric.MustCreateNewUint64NanosecondsMetric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.")
 )
 
 // fileInodeOperations implements fs.InodeOperations for a regular tmpfs file.
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index 3c2b583ae..b095312fe 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -39,14 +39,13 @@ var fsInfo = fs.Info{
 
 // rename implements fs.InodeOperations.Rename for tmpfs nodes.
 func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
-	op, ok := oldParent.InodeOperations.(*Dir)
-	if !ok {
-		return syserror.EXDEV
-	}
-	np, ok := newParent.InodeOperations.(*Dir)
-	if !ok {
+	// Don't allow renames across different mounts.
+	if newParent.MountSource != oldParent.MountSource {
 		return syserror.EXDEV
 	}
+
+	op := oldParent.InodeOperations.(*Dir)
+	np := newParent.InodeOperations.(*Dir)
 	return ramfs.Rename(ctx, op.ramfsDir, oldName, np.ramfsDir, newName, replacement)
 }
 
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 12b1c6097..2e9dd2d55 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -27,6 +27,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// canonMaxBytes is the number of bytes that fit into a single line of
 	// terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE
@@ -443,3 +445,5 @@ func (l *lineDiscipline) peek(b []byte) int {
 	}
 	return size
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/line_discipline.go)
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index f62da49bd..fe07fa929 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -26,6 +26,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // masterInodeOperations are the fs.InodeOperations for the master end of the
 // Terminal (ptmx file).
 //
@@ -232,3 +234,5 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		unimpl.EmitUnimplementedEvent(ctx)
 	}
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/master.go)
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index 1ca79c0b2..ceabb9b1e 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // waitBufMaxBytes is the maximum size of a wait buffer. It is based on
 // TTYB_DEFAULT_MEM_LIMIT.
 const waitBufMaxBytes = 131072
@@ -234,3 +236,5 @@ func (q *queue) waitBufAppend(b []byte) {
 	q.waitBuf = append(q.waitBuf, b)
 	q.waitBufLen += uint64(len(b))
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/queue.go)
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 6a2dbc576..9871f6fc6 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // slaveInodeOperations are the fs.InodeOperations for the slave end of the
 // Terminal (pts file).
 //
@@ -172,3 +174,5 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem
 		return 0, syserror.ENOTTY
 	}
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/slave.go)
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index 5883f26db..ddcccf4da 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -23,6 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // Terminal is a pseudoterminal.
 //
 // +stateify savable
@@ -126,3 +128,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
 	}
 	return tm.slaveKTTY
 }
+
+// LINT.ThenChange(../../fsimpl/devpts/terminal.go)
diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD
new file mode 100644
index 000000000..bd5dac373
--- /dev/null
+++ b/pkg/sentry/fs/user/BUILD
@@ -0,0 +1,39 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "user",
+    srcs = [
+        "path.go",
+        "user.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
+
+go_test(
+    name = "user_test",
+    size = "small",
+    srcs = ["user_test.go"],
+    library = ":user",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/contexttest",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go
new file mode 100644
index 000000000..fbd4547a7
--- /dev/null
+++ b/pkg/sentry/fs/user/path.go
@@ -0,0 +1,169 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package user
+
+import (
+	"fmt"
+	"path"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ResolveExecutablePath resolves the given executable name given the working
+// dir and environment.
+func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs.MountNamespace, envv []string, wd, name string) (string, error) {
+	// Absolute paths can be used directly.
+	if path.IsAbs(name) {
+		return name, nil
+	}
+
+	// Paths with '/' in them should be joined to the working directory, or
+	// to the root if working directory is not set.
+	if strings.IndexByte(name, '/') > 0 {
+		if wd == "" {
+			wd = "/"
+		}
+		if !path.IsAbs(wd) {
+			return "", fmt.Errorf("working directory %q must be absolute", wd)
+		}
+		return path.Join(wd, name), nil
+	}
+
+	// Otherwise, We must lookup the name in the paths, starting from the
+	// calling context's root directory.
+	paths := getPath(envv)
+
+	root := fs.RootFromContext(ctx)
+	if root == nil {
+		// Caller has no root. Don't bother traversing anything.
+		return "", syserror.ENOENT
+	}
+	defer root.DecRef()
+	for _, p := range paths {
+		if !path.IsAbs(p) {
+			// Relative paths aren't safe, no one should be using them.
+			log.Warningf("Skipping relative path %q in $PATH", p)
+			continue
+		}
+
+		binPath := path.Join(p, name)
+		traversals := uint(linux.MaxSymlinkTraversals)
+		d, err := mns.FindInode(ctx, root, nil, binPath, &traversals)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Didn't find it here.
+			continue
+		}
+		if err != nil {
+			return "", err
+		}
+		defer d.DecRef()
+
+		// Check that it is a regular file.
+		if !fs.IsRegular(d.Inode.StableAttr) {
+			continue
+		}
+
+		// Check whether we can read and execute the found file.
+		if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil {
+			log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
+			continue
+		}
+		return path.Join("/", p, name), nil
+	}
+
+	// Couldn't find it.
+	return "", syserror.ENOENT
+}
+
+// ResolveExecutablePathVFS2 resolves the given executable name given the
+// working dir and environment.
+func ResolveExecutablePathVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, envv []string, wd, name string) (string, error) {
+	// Absolute paths can be used directly.
+	if path.IsAbs(name) {
+		return name, nil
+	}
+
+	// Paths with '/' in them should be joined to the working directory, or
+	// to the root if working directory is not set.
+	if strings.IndexByte(name, '/') > 0 {
+		if wd == "" {
+			wd = "/"
+		}
+		if !path.IsAbs(wd) {
+			return "", fmt.Errorf("working directory %q must be absolute", wd)
+		}
+		return path.Join(wd, name), nil
+	}
+
+	// Otherwise, We must lookup the name in the paths, starting from the
+	// calling context's root directory.
+	paths := getPath(envv)
+
+	root := mns.Root()
+	defer root.DecRef()
+	for _, p := range paths {
+		if !path.IsAbs(p) {
+			// Relative paths aren't safe, no one should be using them.
+			log.Warningf("Skipping relative path %q in $PATH", p)
+			continue
+		}
+
+		binPath := path.Join(p, name)
+		pop := &vfs.PathOperation{
+			Root:               root,
+			Start:              root,
+			Path:               fspath.Parse(binPath),
+			FollowFinalSymlink: true,
+		}
+		opts := &vfs.OpenOptions{
+			FileExec: true,
+			Flags:    linux.O_RDONLY,
+		}
+		dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Didn't find it here.
+			continue
+		}
+		if err != nil {
+			return "", err
+		}
+		dentry.DecRef()
+
+		return binPath, nil
+	}
+
+	// Couldn't find it.
+	return "", syserror.ENOENT
+}
+
+// getPath returns the PATH as a slice of strings given the environment
+// variables.
+func getPath(env []string) []string {
+	const prefix = "PATH="
+	for _, e := range env {
+		if strings.HasPrefix(e, prefix) {
+			return strings.Split(strings.TrimPrefix(e, prefix), ":")
+		}
+	}
+	return nil
+}
diff --git a/runsc/boot/user.go b/pkg/sentry/fs/user/user.go
index f0aa52135..f4d525523 100644
--- a/runsc/boot/user.go
+++ b/pkg/sentry/fs/user/user.go
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package boot
+// Package user contains methods for resolving filesystem paths based on the
+// user and their environment.
+package user
 
 import (
 	"bufio"
@@ -23,8 +25,10 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -84,6 +88,48 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
 		File: f,
 	}
 
+	return findHomeInPasswd(uint32(uid), r, defaultHome)
+}
+
+type fileReaderVFS2 struct {
+	ctx context.Context
+	fd  *vfs.FileDescription
+}
+
+func (r *fileReaderVFS2) Read(buf []byte) (int, error) {
+	n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	return int(n), err
+}
+
+func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) {
+	const defaultHome = "/"
+
+	root := mns.Root()
+	defer root.DecRef()
+
+	creds := auth.CredentialsFromContext(ctx)
+
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse("/etc/passwd"),
+	}
+
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}
+
+	fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts)
+	if err != nil {
+		return defaultHome, nil
+	}
+	defer fd.DecRef()
+
+	r := &fileReaderVFS2{
+		ctx: ctx,
+		fd:  fd,
+	}
+
 	homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
 	if err != nil {
 		return "", err
@@ -92,10 +138,10 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
 	return homeDir, nil
 }
 
-// maybeAddExecUserHome returns a new slice with the HOME enviroment variable
+// MaybeAddExecUserHome returns a new slice with the HOME enviroment variable
 // set if the slice does not already contain it, otherwise it returns the
 // original slice unmodified.
-func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+func MaybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
 	// Check if the envv already contains HOME.
 	for _, env := range envv {
 		if strings.HasPrefix(env, "HOME=") {
@@ -111,6 +157,29 @@ func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.
 	if err != nil {
 		return nil, fmt.Errorf("error reading exec user: %v", err)
 	}
+
+	return append(envv, "HOME="+homeDir), nil
+}
+
+// MaybeAddExecUserHomeVFS2 returns a new slice with the HOME enviroment
+// variable set if the slice does not already contain it, otherwise it returns
+// the original slice unmodified.
+func MaybeAddExecUserHomeVFS2(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+	// Check if the envv already contains HOME.
+	for _, env := range envv {
+		if strings.HasPrefix(env, "HOME=") {
+			// We have it. Return the original slice unmodified.
+			return envv, nil
+		}
+	}
+
+	// Read /etc/passwd for the user's HOME directory and set the HOME
+	// environment variable as required by POSIX if it is not overridden by
+	// the user.
+	homeDir, err := getExecUserHomeVFS2(ctx, vmns, uid)
+	if err != nil {
+		return nil, fmt.Errorf("error reading exec user: %v", err)
+	}
 	return append(envv, "HOME="+homeDir), nil
 }
 
diff --git a/runsc/boot/user_test.go b/pkg/sentry/fs/user/user_test.go
index fb4e13dfb..7d8e9ac7c 100644
--- a/runsc/boot/user_test.go
+++ b/pkg/sentry/fs/user/user_test.go
@@ -12,167 +12,111 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package boot
+package user
 
 import (
-	"io/ioutil"
-	"os"
-	"path/filepath"
+	"fmt"
 	"strings"
-	"syscall"
 	"testing"
 
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func setupTempDir() (string, error) {
-	tmpDir, err := ioutil.TempDir(os.TempDir(), "exec-user-test")
+// createEtcPasswd creates /etc/passwd with the given contents and mode. If
+// mode is empty, then no file will be created. If mode is not a regular file
+// mode, then contents is ignored.
+func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode linux.FileMode) error {
+	if err := root.CreateDirectory(ctx, root, "etc", fs.FilePermsFromMode(0755)); err != nil {
+		return err
+	}
+	etc, err := root.Walk(ctx, root, "etc")
 	if err != nil {
-		return "", err
+		return err
 	}
-	return tmpDir, nil
-}
-
-func setupPasswd(contents string, perms os.FileMode) func() (string, error) {
-	return func() (string, error) {
-		tmpDir, err := setupTempDir()
-		if err != nil {
-			return "", err
-		}
-
-		if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-			return "", err
-		}
-
-		f, err := os.Create(filepath.Join(tmpDir, "etc", "passwd"))
-		if err != nil {
-			return "", err
-		}
-		defer f.Close()
-
-		_, err = f.WriteString(contents)
+	defer etc.DecRef()
+	switch mode.FileType() {
+	case 0:
+		// Don't create anything.
+		return nil
+	case linux.S_IFREG:
+		passwd, err := etc.Create(ctx, root, "passwd", fs.FileFlags{Write: true}, fs.FilePermsFromMode(mode))
 		if err != nil {
-			return "", err
+			return err
 		}
-
-		err = f.Chmod(perms)
-		if err != nil {
-			return "", err
+		defer passwd.DecRef()
+		if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil {
+			return err
 		}
-		return tmpDir, nil
+		return nil
+	case linux.S_IFDIR:
+		return etc.CreateDirectory(ctx, root, "passwd", fs.FilePermsFromMode(mode))
+	case linux.S_IFIFO:
+		return etc.CreateFifo(ctx, root, "passwd", fs.FilePermsFromMode(mode))
+	default:
+		return fmt.Errorf("unknown file type %x", mode.FileType())
 	}
 }
 
 // TestGetExecUserHome tests the getExecUserHome function.
 func TestGetExecUserHome(t *testing.T) {
 	tests := map[string]struct {
-		uid        auth.KUID
-		createRoot func() (string, error)
-		expected   string
+		uid            auth.KUID
+		passwdContents string
+		passwdMode     linux.FileMode
+		expected       string
 	}{
 		"success": {
-			uid:        1000,
-			createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0666),
-			expected:   "/home/adin",
+			uid:            1000,
+			passwdContents: "adin::1000:1111::/home/adin:/bin/sh",
+			passwdMode:     linux.S_IFREG | 0666,
+			expected:       "/home/adin",
+		},
+		"no_perms": {
+			uid:            1000,
+			passwdContents: "adin::1000:1111::/home/adin:/bin/sh",
+			passwdMode:     linux.S_IFREG,
+			expected:       "/",
 		},
 		"no_passwd": {
-			uid:        1000,
-			createRoot: setupTempDir,
-			expected:   "/",
+			uid:      1000,
+			expected: "/",
 		},
-		"no_perms": {
+		"directory": {
 			uid:        1000,
-			createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0000),
+			passwdMode: linux.S_IFDIR | 0666,
 			expected:   "/",
 		},
-		"directory": {
-			uid: 1000,
-			createRoot: func() (string, error) {
-				tmpDir, err := setupTempDir()
-				if err != nil {
-					return "", err
-				}
-
-				if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-					return "", err
-				}
-
-				if err := syscall.Mkdir(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil {
-					return "", err
-				}
-
-				return tmpDir, nil
-			},
-			expected: "/",
-		},
 		// Currently we don't allow named pipes.
 		"named_pipe": {
-			uid: 1000,
-			createRoot: func() (string, error) {
-				tmpDir, err := setupTempDir()
-				if err != nil {
-					return "", err
-				}
-
-				if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-					return "", err
-				}
-
-				if err := syscall.Mkfifo(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil {
-					return "", err
-				}
-
-				return tmpDir, nil
-			},
-			expected: "/",
+			uid:        1000,
+			passwdMode: linux.S_IFIFO | 0666,
+			expected:   "/",
 		},
 	}
 
 	for name, tc := range tests {
 		t.Run(name, func(t *testing.T) {
-			tmpDir, err := tc.createRoot()
-			if err != nil {
-				t.Fatalf("failed to create root dir: %v", err)
-			}
-
-			sandEnd, cleanup, err := startGofer(tmpDir)
-			if err != nil {
-				t.Fatalf("failed to create gofer: %v", err)
-			}
-			defer cleanup()
-
 			ctx := contexttest.Context(t)
-			conf := &Config{
-				RootDir:        "unused_root_dir",
-				Network:        NetworkNone,
-				DisableSeccomp: true,
-			}
+			msrc := fs.NewPseudoMountSource(ctx)
+			rootInode := tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc)
 
-			spec := &specs.Spec{
-				Root: &specs.Root{
-					Path:     tmpDir,
-					Readonly: true,
-				},
-				// Add /proc mount as tmpfs to avoid needing a kernel.
-				Mounts: []specs.Mount{
-					{
-						Destination: "/proc",
-						Type:        "tmpfs",
-					},
-				},
-			}
-
-			mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{})
-			mns, err := mntr.createMountNamespace(ctx, conf)
+			mns, err := fs.NewMountNamespace(ctx, rootInode)
 			if err != nil {
-				t.Fatalf("failed to create mount namespace: %v", err)
+				t.Fatalf("NewMountNamespace failed: %v", err)
 			}
-			ctx = fs.WithRoot(ctx, mns.Root())
-			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
-				t.Fatalf("failed to create mount namespace: %v", err)
+			defer mns.DecRef()
+			root := mns.Root()
+			defer root.DecRef()
+			ctx = fs.WithRoot(ctx, root)
+
+			if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil {
+				t.Fatalf("createEtcPasswd failed: %v", err)
 			}
 
 			got, err := getExecUserHome(ctx, mns, tc.uid)
diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD
new file mode 100644
index 000000000..6c798f0bd
--- /dev/null
+++ b/pkg/sentry/fsbridge/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "fsbridge",
+    srcs = [
+        "bridge.go",
+        "fs.go",
+        "vfs.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go
new file mode 100644
index 000000000..8e7590721
--- /dev/null
+++ b/pkg/sentry/fsbridge/bridge.go
@@ -0,0 +1,54 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsbridge provides common interfaces to bridge between VFS1 and VFS2
+// files.
+package fsbridge
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// File provides a common interface to bridge between VFS1 and VFS2 files.
+type File interface {
+	// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+	// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+	// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+	PathnameWithDeleted(ctx context.Context) string
+
+	// ReadFull read all contents from the file.
+	ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
+
+	// ConfigureMMap mutates opts to implement mmap(2) for the file.
+	ConfigureMMap(context.Context, *memmap.MMapOpts) error
+
+	// Type returns the file type, e.g. linux.S_IFREG.
+	Type(context.Context) (linux.FileMode, error)
+
+	// IncRef increments reference.
+	IncRef()
+
+	// DecRef decrements reference.
+	DecRef()
+}
+
+// Lookup provides a common interface to open files.
+type Lookup interface {
+	// OpenPath opens a file.
+	OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error)
+}
diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go
new file mode 100644
index 000000000..093ce1fb3
--- /dev/null
+++ b/pkg/sentry/fsbridge/fs.go
@@ -0,0 +1,181 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsbridge
+
+import (
+	"io"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fsFile implements File interface over fs.File.
+//
+// +stateify savable
+type fsFile struct {
+	file *fs.File
+}
+
+var _ File = (*fsFile)(nil)
+
+// NewFSFile creates a new File over fs.File.
+func NewFSFile(file *fs.File) File {
+	return &fsFile{file: file}
+}
+
+// PathnameWithDeleted implements File.
+func (f *fsFile) PathnameWithDeleted(ctx context.Context) string {
+	root := fs.RootFromContext(ctx)
+	if root == nil {
+		// This doesn't correspond to anything in Linux because the vfs is
+		// global there.
+		return ""
+	}
+	defer root.DecRef()
+
+	name, _ := f.file.Dirent.FullName(root)
+	return name
+}
+
+// ReadFull implements File.
+func (f *fsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+	var total int64
+	for dst.NumBytes() > 0 {
+		n, err := f.file.Preadv(ctx, dst, offset+total)
+		total += n
+		if err == io.EOF && total != 0 {
+			return total, io.ErrUnexpectedEOF
+		} else if err != nil {
+			return total, err
+		}
+		dst = dst.DropFirst64(n)
+	}
+	return total, nil
+}
+
+// ConfigureMMap implements File.
+func (f *fsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return f.file.ConfigureMMap(ctx, opts)
+}
+
+// Type implements File.
+func (f *fsFile) Type(context.Context) (linux.FileMode, error) {
+	return linux.FileMode(f.file.Dirent.Inode.StableAttr.Type.LinuxType()), nil
+}
+
+// IncRef implements File.
+func (f *fsFile) IncRef() {
+	f.file.IncRef()
+}
+
+// DecRef implements File.
+func (f *fsFile) DecRef() {
+	f.file.DecRef()
+}
+
+// fsLookup implements Lookup interface using fs.File.
+//
+// +stateify savable
+type fsLookup struct {
+	mntns *fs.MountNamespace
+
+	root       *fs.Dirent
+	workingDir *fs.Dirent
+}
+
+var _ Lookup = (*fsLookup)(nil)
+
+// NewFSLookup creates a new Lookup using VFS1.
+func NewFSLookup(mntns *fs.MountNamespace, root, workingDir *fs.Dirent) Lookup {
+	return &fsLookup{
+		mntns:      mntns,
+		root:       root,
+		workingDir: workingDir,
+	}
+}
+
+// OpenPath implements Lookup.
+func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) {
+	var d *fs.Dirent
+	var err error
+	if resolveFinal {
+		d, err = l.mntns.FindInode(ctx, l.root, l.workingDir, path, remainingTraversals)
+	} else {
+		d, err = l.mntns.FindLink(ctx, l.root, l.workingDir, path, remainingTraversals)
+	}
+	if err != nil {
+		return nil, err
+	}
+	defer d.DecRef()
+
+	if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
+		return nil, syserror.ELOOP
+	}
+
+	fsPerm := openOptionsToPermMask(&opts)
+	if err := d.Inode.CheckPermission(ctx, fsPerm); err != nil {
+		return nil, err
+	}
+
+	// If they claim it's a directory, then make sure.
+	if strings.HasSuffix(path, "/") {
+		if d.Inode.StableAttr.Type != fs.Directory {
+			return nil, syserror.ENOTDIR
+		}
+	}
+
+	if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile {
+		ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type)
+		return nil, syserror.EACCES
+	}
+
+	f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags))
+	if err != nil {
+		return nil, err
+	}
+
+	return &fsFile{file: f}, nil
+}
+
+func openOptionsToPermMask(opts *vfs.OpenOptions) fs.PermMask {
+	mode := opts.Flags & linux.O_ACCMODE
+	return fs.PermMask{
+		Read:    mode == linux.O_RDONLY || mode == linux.O_RDWR,
+		Write:   mode == linux.O_WRONLY || mode == linux.O_RDWR,
+		Execute: opts.FileExec,
+	}
+}
+
+func flagsToFileFlags(flags uint32) fs.FileFlags {
+	return fs.FileFlags{
+		Direct:      flags&linux.O_DIRECT != 0,
+		DSync:       flags&(linux.O_DSYNC|linux.O_SYNC) != 0,
+		Sync:        flags&linux.O_SYNC != 0,
+		NonBlocking: flags&linux.O_NONBLOCK != 0,
+		Read:        (flags & linux.O_ACCMODE) != linux.O_WRONLY,
+		Write:       (flags & linux.O_ACCMODE) != linux.O_RDONLY,
+		Append:      flags&linux.O_APPEND != 0,
+		Directory:   flags&linux.O_DIRECTORY != 0,
+		Async:       flags&linux.O_ASYNC != 0,
+		LargeFile:   flags&linux.O_LARGEFILE != 0,
+		Truncate:    flags&linux.O_TRUNC != 0,
+	}
+}
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
new file mode 100644
index 000000000..89168220a
--- /dev/null
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -0,0 +1,142 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsbridge
+
+import (
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// VFSFile implements File interface over vfs.FileDescription.
+//
+// +stateify savable
+type VFSFile struct {
+	file *vfs.FileDescription
+}
+
+var _ File = (*VFSFile)(nil)
+
+// NewVFSFile creates a new File over fs.File.
+func NewVFSFile(file *vfs.FileDescription) File {
+	return &VFSFile{file: file}
+}
+
+// PathnameWithDeleted implements File.
+func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string {
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef()
+
+	vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry())
+	return name
+}
+
+// ReadFull implements File.
+func (f *VFSFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+	var total int64
+	for dst.NumBytes() > 0 {
+		n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{})
+		total += n
+		if err == io.EOF && total != 0 {
+			return total, io.ErrUnexpectedEOF
+		} else if err != nil {
+			return total, err
+		}
+		dst = dst.DropFirst64(n)
+	}
+	return total, nil
+}
+
+// ConfigureMMap implements File.
+func (f *VFSFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return f.file.ConfigureMMap(ctx, opts)
+}
+
+// Type implements File.
+func (f *VFSFile) Type(ctx context.Context) (linux.FileMode, error) {
+	stat, err := f.file.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		return 0, err
+	}
+	return linux.FileMode(stat.Mode).FileType(), nil
+}
+
+// IncRef implements File.
+func (f *VFSFile) IncRef() {
+	f.file.IncRef()
+}
+
+// DecRef implements File.
+func (f *VFSFile) DecRef() {
+	f.file.DecRef()
+}
+
+// FileDescription returns the FileDescription represented by f. It does not
+// take an additional reference on the returned FileDescription.
+func (f *VFSFile) FileDescription() *vfs.FileDescription {
+	return f.file
+}
+
+// fsLookup implements Lookup interface using fs.File.
+//
+// +stateify savable
+type vfsLookup struct {
+	mntns *vfs.MountNamespace
+
+	root       vfs.VirtualDentry
+	workingDir vfs.VirtualDentry
+}
+
+var _ Lookup = (*vfsLookup)(nil)
+
+// NewVFSLookup creates a new Lookup using VFS2.
+func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) Lookup {
+	return &vfsLookup{
+		mntns:      mntns,
+		root:       root,
+		workingDir: workingDir,
+	}
+}
+
+// OpenPath implements Lookup.
+//
+// remainingTraversals is not configurable in VFS2, all callers are using the
+// default anyways.
+func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
+	vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
+	creds := auth.CredentialsFromContext(ctx)
+	path := fspath.Parse(pathname)
+	pop := &vfs.PathOperation{
+		Root:               l.root,
+		Start:              l.workingDir,
+		Path:               path,
+		FollowFinalSymlink: resolveFinal,
+	}
+	if path.Absolute {
+		pop.Start = l.root
+	}
+	fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return &VFSFile{file: fd}, nil
+}
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
new file mode 100644
index 000000000..585764223
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -0,0 +1,43 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "devpts",
+    srcs = [
+        "devpts.go",
+        "line_discipline.go",
+        "master.go",
+        "queue.go",
+        "slave.go",
+        "terminal.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/safemem",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/unimpl",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "devpts_test",
+    size = "small",
+    srcs = ["devpts_test.go"],
+    library = ":devpts",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/contexttest",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
new file mode 100644
index 000000000..c03c65445
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -0,0 +1,231 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package devpts provides a filesystem implementation that behaves like
+// devpts.
+package devpts
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"strconv"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the filesystem name.
+const Name = "devpts"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	// No data allowed.
+	if opts.Data != "" {
+		return nil, nil, syserror.EINVAL
+	}
+
+	fs, root, err := fstype.newFilesystem(vfsObj, creds)
+	if err != nil {
+		return nil, nil, err
+	}
+	return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// newFilesystem creates a new devpts filesystem with root directory and ptmx
+// master inode. It returns the filesystem and root Dentry.
+func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.Filesystem.VFSFilesystem().Init(vfsObj, fstype, fs)
+
+	// Construct the root directory. This is always inode id 1.
+	root := &rootInode{
+		slaves: make(map[uint32]*slaveInode),
+	}
+	root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
+	root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	root.dentry.Init(root)
+
+	// Construct the pts master inode and dentry. Linux always uses inode
+	// id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx.
+	master := &masterInode{
+		root: root,
+	}
+	master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
+	master.dentry.Init(master)
+
+	// Add the master as a child of the root.
+	links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{
+		"ptmx": &master.dentry,
+	})
+	root.IncLinks(links)
+
+	return fs, &root.dentry, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release()
+}
+
+// rootInode is the root directory inode for the devpts mounts.
+type rootInode struct {
+	kernfs.AlwaysValid
+	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.OrderedChildren
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// master is the master pty inode. Immutable.
+	master *masterInode
+
+	// root is the root directory inode for this filesystem. Immutable.
+	root *rootInode
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// slaves maps pty ids to slave inodes.
+	slaves map[uint32]*slaveInode
+
+	// nextIdx is the next pty index to use. Must be accessed atomically.
+	//
+	// TODO(b/29356795): reuse indices when ptys are closed.
+	nextIdx uint32
+}
+
+var _ kernfs.Inode = (*rootInode)(nil)
+
+// allocateTerminal creates a new Terminal and installs a pts node for it.
+func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	if i.nextIdx == math.MaxUint32 {
+		return nil, syserror.ENOMEM
+	}
+	idx := i.nextIdx
+	i.nextIdx++
+
+	// Sanity check that slave with idx does not exist.
+	if _, ok := i.slaves[idx]; ok {
+		panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
+	}
+
+	// Create the new terminal and slave.
+	t := newTerminal(idx)
+	slave := &slaveInode{
+		root: i,
+		t:    t,
+	}
+	// Linux always uses pty index + 3 as the inode id. See
+	// fs/devpts/inode.c:devpts_pty_new().
+	slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
+	slave.dentry.Init(slave)
+	i.slaves[idx] = slave
+
+	return t, nil
+}
+
+// masterClose is called when the master end of t is closed.
+func (i *rootInode) masterClose(t *Terminal) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	// Sanity check that slave with idx exists.
+	if _, ok := i.slaves[t.n]; !ok {
+		panic(fmt.Sprintf("pty with index %d does not exist", t.n))
+	}
+	delete(i.slaves, t.n)
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	idx, err := strconv.ParseUint(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	if si, ok := i.slaves[uint32(idx)]; ok {
+		si.dentry.IncRef()
+		return si.dentry.VFSDentry(), nil
+
+	}
+	return nil, syserror.ENOENT
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
+func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	ids := make([]int, 0, len(i.slaves))
+	for id := range i.slaves {
+		ids = append(ids, int(id))
+	}
+	sort.Ints(ids)
+	for _, id := range ids[relOffset:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(id), 10),
+			Type:    linux.DT_CHR,
+			Ino:     i.slaves[uint32(id)].InodeAttrs.Ino(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
new file mode 100644
index 000000000..b7c149047
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func TestSimpleMasterToSlave(t *testing.T) {
+	ld := newLineDiscipline(linux.DefaultSlaveTermios)
+	ctx := contexttest.Context(t)
+	inBytes := []byte("hello, tty\n")
+	src := usermem.BytesIOSequence(inBytes)
+	outBytes := make([]byte, 32)
+	dst := usermem.BytesIOSequence(outBytes)
+
+	// Write to the input queue.
+	nw, err := ld.inputQueueWrite(ctx, src)
+	if err != nil {
+		t.Fatalf("error writing to input queue: %v", err)
+	}
+	if nw != int64(len(inBytes)) {
+		t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes))
+	}
+
+	// Read from the input queue.
+	nr, err := ld.inputQueueRead(ctx, dst)
+	if err != nil {
+		t.Fatalf("error reading from input queue: %v", err)
+	}
+	if nr != int64(len(inBytes)) {
+		t.Fatalf("read wrong length: got %d, want %d", nr, len(inBytes))
+	}
+
+	outStr := string(outBytes[:nr])
+	inStr := string(inBytes)
+	if outStr != inStr {
+		t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr)
+	}
+}
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
new file mode 100644
index 000000000..f7bc325d1
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -0,0 +1,445 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"bytes"
+	"unicode/utf8"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// canonMaxBytes is the number of bytes that fit into a single line of
+	// terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE
+	// in include/linux/tty.h.
+	canonMaxBytes = 4096
+
+	// nonCanonMaxBytes is the maximum number of bytes that can be read at
+	// a time in noncanonical mode.
+	nonCanonMaxBytes = canonMaxBytes - 1
+
+	spacesPerTab = 8
+)
+
+// lineDiscipline dictates how input and output are handled between the
+// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
+// pages are good resources for how to affect the line discipline:
+//
+//   * termios(3)
+//   * tty_ioctl(4)
+//
+// This file corresponds most closely to drivers/tty/n_tty.c.
+//
+// lineDiscipline has a simple structure but supports a multitude of options
+// (see the above man pages). It consists of two queues of bytes: one from the
+// terminal master to slave (the input queue) and one from slave to master (the
+// output queue). When bytes are written to one end of the pty, the line
+// discipline reads the bytes, modifies them or takes special action if
+// required, and enqueues them to be read by the other end of the pty:
+//
+//       input from terminal    +-------------+   input to process (e.g. bash)
+//    +------------------------>| input queue |---------------------------+
+//    |   (inputQueueWrite)     +-------------+     (inputQueueRead)      |
+//    |                                                                   |
+//    |                                                                   v
+// masterFD                                                            slaveFD
+//    ^                                                                   |
+//    |                                                                   |
+//    |   output to terminal   +--------------+    output from process    |
+//    +------------------------| output queue |<--------------------------+
+//        (outputQueueRead)    +--------------+    (outputQueueWrite)
+//
+// Lock order:
+//  termiosMu
+//    inQueue.mu
+//      outQueue.mu
+//
+// +stateify savable
+type lineDiscipline struct {
+	// sizeMu protects size.
+	sizeMu sync.Mutex `state:"nosave"`
+
+	// size is the terminal size (width and height).
+	size linux.WindowSize
+
+	// inQueue is the input queue of the terminal.
+	inQueue queue
+
+	// outQueue is the output queue of the terminal.
+	outQueue queue
+
+	// termiosMu protects termios.
+	termiosMu sync.RWMutex `state:"nosave"`
+
+	// termios is the terminal configuration used by the lineDiscipline.
+	termios linux.KernelTermios
+
+	// column is the location in a row of the cursor. This is important for
+	// handling certain special characters like backspace.
+	column int
+
+	// masterWaiter is used to wait on the master end of the TTY.
+	masterWaiter waiter.Queue `state:"zerovalue"`
+
+	// slaveWaiter is used to wait on the slave end of the TTY.
+	slaveWaiter waiter.Queue `state:"zerovalue"`
+}
+
+func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
+	ld := lineDiscipline{termios: termios}
+	ld.inQueue.transformer = &inputQueueTransformer{}
+	ld.outQueue.transformer = &outputQueueTransformer{}
+	return &ld
+}
+
+// getTermios gets the linux.Termios for the tty.
+func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	// We must copy a Termios struct, not KernelTermios.
+	t := l.termios.ToTermios()
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return 0, err
+}
+
+// setTermios sets a linux.Termios for the tty.
+func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	l.termiosMu.Lock()
+	defer l.termiosMu.Unlock()
+	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
+	// We must copy a Termios struct, not KernelTermios.
+	var t linux.Termios
+	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	l.termios.FromTermios(t)
+
+	// If canonical mode is turned off, move bytes from inQueue's wait
+	// buffer to its read buffer. Anything already in the read buffer is
+	// now readable.
+	if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) {
+		l.inQueue.mu.Lock()
+		l.inQueue.pushWaitBufLocked(l)
+		l.inQueue.readable = true
+		l.inQueue.mu.Unlock()
+		l.slaveWaiter.Notify(waiter.EventIn)
+	}
+
+	return 0, err
+}
+
+func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	l.sizeMu.Lock()
+	defer l.sizeMu.Unlock()
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+}
+
+func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	l.sizeMu.Lock()
+	defer l.sizeMu.Unlock()
+	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+}
+
+func (l *lineDiscipline) masterReadiness() waiter.EventMask {
+	// We don't have to lock a termios because the default master termios
+	// is immutable.
+	return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
+}
+
+func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
+}
+
+func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	return l.inQueue.readableSize(ctx, io, args)
+}
+
+func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, pushed, err := l.inQueue.read(ctx, dst, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.masterWaiter.Notify(waiter.EventOut)
+		if pushed {
+			l.slaveWaiter.Notify(waiter.EventIn)
+		}
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, err := l.inQueue.write(ctx, src, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.slaveWaiter.Notify(waiter.EventIn)
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	return l.outQueue.readableSize(ctx, io, args)
+}
+
+func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, pushed, err := l.outQueue.read(ctx, dst, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.slaveWaiter.Notify(waiter.EventOut)
+		if pushed {
+			l.masterWaiter.Notify(waiter.EventIn)
+		}
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, err := l.outQueue.write(ctx, src, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.masterWaiter.Notify(waiter.EventIn)
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+// transformer is a helper interface to make it easier to stateify queue.
+type transformer interface {
+	// transform functions require queue's mutex to be held.
+	transform(*lineDiscipline, *queue, []byte) int
+}
+
+// outputQueueTransformer implements transformer. It performs line discipline
+// transformations on the output queue.
+//
+// +stateify savable
+type outputQueueTransformer struct{}
+
+// transform does output processing for one end of the pty. See
+// drivers/tty/n_tty.c:do_output_char for an analogous kernel function.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int {
+	// transformOutput is effectively always in noncanonical mode, as the
+	// master termios never has ICANON set.
+
+	if !l.termios.OEnabled(linux.OPOST) {
+		q.readBuf = append(q.readBuf, buf...)
+		if len(q.readBuf) > 0 {
+			q.readable = true
+		}
+		return len(buf)
+	}
+
+	var ret int
+	for len(buf) > 0 {
+		size := l.peek(buf)
+		cBytes := append([]byte{}, buf[:size]...)
+		ret += size
+		buf = buf[size:]
+		// We're guaranteed that cBytes has at least one element.
+		switch cBytes[0] {
+		case '\n':
+			if l.termios.OEnabled(linux.ONLRET) {
+				l.column = 0
+			}
+			if l.termios.OEnabled(linux.ONLCR) {
+				q.readBuf = append(q.readBuf, '\r', '\n')
+				continue
+			}
+		case '\r':
+			if l.termios.OEnabled(linux.ONOCR) && l.column == 0 {
+				continue
+			}
+			if l.termios.OEnabled(linux.OCRNL) {
+				cBytes[0] = '\n'
+				if l.termios.OEnabled(linux.ONLRET) {
+					l.column = 0
+				}
+				break
+			}
+			l.column = 0
+		case '\t':
+			spaces := spacesPerTab - l.column%spacesPerTab
+			if l.termios.OutputFlags&linux.TABDLY == linux.XTABS {
+				l.column += spaces
+				q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...)
+				continue
+			}
+			l.column += spaces
+		case '\b':
+			if l.column > 0 {
+				l.column--
+			}
+		default:
+			l.column++
+		}
+		q.readBuf = append(q.readBuf, cBytes...)
+	}
+	if len(q.readBuf) > 0 {
+		q.readable = true
+	}
+	return ret
+}
+
+// inputQueueTransformer implements transformer. It performs line discipline
+// transformations on the input queue.
+//
+// +stateify savable
+type inputQueueTransformer struct{}
+
+// transform does input processing for one end of the pty. Characters read are
+// transformed according to flags set in the termios struct. See
+// drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel
+// function.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int {
+	// If there's a line waiting to be read in canonical mode, don't write
+	// anything else to the read buffer.
+	if l.termios.LEnabled(linux.ICANON) && q.readable {
+		return 0
+	}
+
+	maxBytes := nonCanonMaxBytes
+	if l.termios.LEnabled(linux.ICANON) {
+		maxBytes = canonMaxBytes
+	}
+
+	var ret int
+	for len(buf) > 0 && len(q.readBuf) < canonMaxBytes {
+		size := l.peek(buf)
+		cBytes := append([]byte{}, buf[:size]...)
+		// We're guaranteed that cBytes has at least one element.
+		switch cBytes[0] {
+		case '\r':
+			if l.termios.IEnabled(linux.IGNCR) {
+				buf = buf[size:]
+				ret += size
+				continue
+			}
+			if l.termios.IEnabled(linux.ICRNL) {
+				cBytes[0] = '\n'
+			}
+		case '\n':
+			if l.termios.IEnabled(linux.INLCR) {
+				cBytes[0] = '\r'
+			}
+		}
+
+		// In canonical mode, we discard non-terminating characters
+		// after the first 4095.
+		if l.shouldDiscard(q, cBytes) {
+			buf = buf[size:]
+			ret += size
+			continue
+		}
+
+		// Stop if the buffer would be overfilled.
+		if len(q.readBuf)+size > maxBytes {
+			break
+		}
+		buf = buf[size:]
+		ret += size
+
+		// If we get EOF, make the buffer available for reading.
+		if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) {
+			q.readable = true
+			break
+		}
+
+		q.readBuf = append(q.readBuf, cBytes...)
+
+		// Anything written to the readBuf will have to be echoed.
+		if l.termios.LEnabled(linux.ECHO) {
+			l.outQueue.writeBytes(cBytes, l)
+			l.masterWaiter.Notify(waiter.EventIn)
+		}
+
+		// If we finish a line, make it available for reading.
+		if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) {
+			q.readable = true
+			break
+		}
+	}
+
+	// In noncanonical mode, everything is readable.
+	if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 {
+		q.readable = true
+	}
+
+	return ret
+}
+
+// shouldDiscard returns whether c should be discarded. In canonical mode, if
+// too many bytes are enqueued, we keep reading input and discarding it until
+// we find a terminating character. Signal/echo processing still occurs.
+//
+// Precondition:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool {
+	return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes)
+}
+
+// peek returns the size in bytes of the next character to process. As long as
+// b isn't empty, peek returns a value of at least 1.
+func (l *lineDiscipline) peek(b []byte) int {
+	size := 1
+	// If UTF-8 support is enabled, runes might be multiple bytes.
+	if l.termios.IEnabled(linux.IUTF8) {
+		_, size = utf8.DecodeRune(b)
+	}
+	return size
+}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
new file mode 100644
index 000000000..7a7ce5d81
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -0,0 +1,222 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/unimpl"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// masterInode is the inode for the master end of the Terminal.
+type masterInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// root is the devpts root inode.
+	root *rootInode
+}
+
+var _ kernfs.Inode = (*masterInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	t, err := mi.root.allocateTerminal(rp.Credentials())
+	if err != nil {
+		return nil, err
+	}
+
+	mi.IncRef()
+	fd := &masterFileDescription{
+		inode: mi,
+		t:     t,
+	}
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		mi.DecRef()
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := mi.InodeAttrs.Stat(vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.TTYAUX_MAJOR
+	statx.RdevMinor = linux.PTMX_MINOR
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+type masterFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	inode *masterInode
+	t     *Terminal
+}
+
+var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (mfd *masterFileDescription) Release() {
+	mfd.inode.root.masterClose(mfd.t)
+	mfd.inode.DecRef()
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (mfd *masterFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	mfd.t.ld.masterWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (mfd *masterFileDescription) EventUnregister(e *waiter.Entry) {
+	mfd.t.ld.masterWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (mfd *masterFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return mfd.t.ld.masterReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (mfd *masterFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return mfd.t.ld.outputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return mfd.t.ld.inputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the output queue read buffer.
+		return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+	case linux.TCGETS:
+		// N.B. TCGETS on the master actually returns the configuration
+		// of the slave end.
+		return mfd.t.ld.getTermios(ctx, io, args)
+	case linux.TCSETS:
+		// N.B. TCSETS on the master actually affects the configuration
+		// of the slave end.
+		return mfd.t.ld.setTermios(ctx, io, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return mfd.t.ld.setTermios(ctx, io, args)
+	case linux.TIOCGPTN:
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+	case linux.TIOCSPTLCK:
+		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
+		return 0, nil
+	case linux.TIOCGWINSZ:
+		return 0, mfd.t.ld.windowSize(ctx, io, args)
+	case linux.TIOCSWINSZ:
+		return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return mfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return mfd.inode.Stat(fs, opts)
+}
+
+// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid.
+func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
+	switch cmd {
+	case linux.TCGETS,
+		linux.TCSETS,
+		linux.TCSETSW,
+		linux.TCSETSF,
+		linux.TIOCGWINSZ,
+		linux.TIOCSWINSZ,
+		linux.TIOCSETD,
+		linux.TIOCSBRK,
+		linux.TIOCCBRK,
+		linux.TCSBRK,
+		linux.TCSBRKP,
+		linux.TIOCSTI,
+		linux.TIOCCONS,
+		linux.FIONBIO,
+		linux.TIOCEXCL,
+		linux.TIOCNXCL,
+		linux.TIOCGEXCL,
+		linux.TIOCGSID,
+		linux.TIOCGETD,
+		linux.TIOCVHANGUP,
+		linux.TIOCGDEV,
+		linux.TIOCMGET,
+		linux.TIOCMSET,
+		linux.TIOCMBIC,
+		linux.TIOCMBIS,
+		linux.TIOCGICOUNT,
+		linux.TCFLSH,
+		linux.TIOCSSERIAL,
+		linux.TIOCGPTPEER:
+
+		unimpl.EmitUnimplementedEvent(ctx)
+	}
+}
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
new file mode 100644
index 000000000..dffb4232c
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -0,0 +1,236 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// waitBufMaxBytes is the maximum size of a wait buffer. It is based on
+// TTYB_DEFAULT_MEM_LIMIT.
+const waitBufMaxBytes = 131072
+
+// queue represents one of the input or output queues between a pty master and
+// slave. Bytes written to a queue are added to the read buffer until it is
+// full, at which point they are written to the wait buffer. Bytes are
+// processed (i.e. undergo termios transformations) as they are added to the
+// read buffer. The read buffer is readable when its length is nonzero and
+// readable is true.
+//
+// +stateify savable
+type queue struct {
+	// mu protects everything in queue.
+	mu sync.Mutex `state:"nosave"`
+
+	// readBuf is buffer of data ready to be read when readable is true.
+	// This data has been processed.
+	readBuf []byte
+
+	// waitBuf contains data that can't fit into readBuf. It is put here
+	// until it can be loaded into the read buffer. waitBuf contains data
+	// that hasn't been processed.
+	waitBuf    [][]byte
+	waitBufLen uint64
+
+	// readable indicates whether the read buffer can be read from.  In
+	// canonical mode, there can be an unterminated line in the read buffer,
+	// so readable must be checked.
+	readable bool
+
+	// transform is the the queue's function for transforming bytes
+	// entering the queue. For example, transform might convert all '\r's
+	// entering the queue to '\n's.
+	transformer
+}
+
+// readReadiness returns whether q is ready to be read from.
+func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if len(q.readBuf) > 0 && q.readable {
+		return waiter.EventIn
+	}
+	return waiter.EventMask(0)
+}
+
+// writeReadiness returns whether q is ready to be written to.
+func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if q.waitBufLen < waitBufMaxBytes {
+		return waiter.EventOut
+	}
+	return waiter.EventMask(0)
+}
+
+// readableSize writes the number of readable bytes to userspace.
+func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	var size int32
+	if q.readable {
+		size = int32(len(q.readBuf))
+	}
+
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+
+}
+
+// read reads from q to userspace. It returns the number of bytes read as well
+// as whether the read caused more readable data to become available (whether
+// data was pushed from the wait buffer to the read buffer).
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	if !q.readable {
+		return 0, false, syserror.ErrWouldBlock
+	}
+
+	if dst.NumBytes() > canonMaxBytes {
+		dst = dst.TakeFirst(canonMaxBytes)
+	}
+
+	n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) {
+		src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf))
+		n, err := safemem.CopySeq(dst, src)
+		if err != nil {
+			return 0, err
+		}
+		q.readBuf = q.readBuf[n:]
+
+		// If we read everything, this queue is no longer readable.
+		if len(q.readBuf) == 0 {
+			q.readable = false
+		}
+
+		return n, nil
+	}))
+	if err != nil {
+		return 0, false, err
+	}
+
+	// Move data from the queue's wait buffer to its read buffer.
+	nPushed := q.pushWaitBufLocked(l)
+
+	return int64(n), nPushed > 0, nil
+}
+
+// write writes to q from userspace.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	// Copy data into the wait buffer.
+	n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) {
+		copyLen := src.NumBytes()
+		room := waitBufMaxBytes - q.waitBufLen
+		// If out of room, return EAGAIN.
+		if room == 0 && copyLen > 0 {
+			return 0, syserror.ErrWouldBlock
+		}
+		// Cap the size of the wait buffer.
+		if copyLen > room {
+			copyLen = room
+			src = src.TakeFirst64(room)
+		}
+		buf := make([]byte, copyLen)
+
+		// Copy the data into the wait buffer.
+		dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))
+		n, err := safemem.CopySeq(dst, src)
+		if err != nil {
+			return 0, err
+		}
+		q.waitBufAppend(buf)
+
+		return n, nil
+	}))
+	if err != nil {
+		return 0, err
+	}
+
+	// Push data from the wait to the read buffer.
+	q.pushWaitBufLocked(l)
+
+	return n, nil
+}
+
+// writeBytes writes to q from b.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	// Write to the wait buffer.
+	q.waitBufAppend(b)
+	q.pushWaitBufLocked(l)
+}
+
+// pushWaitBufLocked fills the queue's read buffer with data from the wait
+// buffer.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be locked.
+func (q *queue) pushWaitBufLocked(l *lineDiscipline) int {
+	if q.waitBufLen == 0 {
+		return 0
+	}
+
+	// Move data from the wait to the read buffer.
+	var total int
+	var i int
+	for i = 0; i < len(q.waitBuf); i++ {
+		n := q.transform(l, q, q.waitBuf[i])
+		total += n
+		if n != len(q.waitBuf[i]) {
+			// The read buffer filled up without consuming the
+			// entire buffer.
+			q.waitBuf[i] = q.waitBuf[i][n:]
+			break
+		}
+	}
+
+	// Update wait buffer based on consumed data.
+	q.waitBuf = q.waitBuf[i:]
+	q.waitBufLen -= uint64(total)
+
+	return total
+}
+
+// Precondition: q.mu must be locked.
+func (q *queue) waitBufAppend(b []byte) {
+	q.waitBuf = append(q.waitBuf, b)
+	q.waitBufLen += uint64(len(b))
+}
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
new file mode 100644
index 000000000..526cd406c
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -0,0 +1,182 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// slaveInode is the inode for the slave end of the Terminal.
+type slaveInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// root is the devpts root inode.
+	root *rootInode
+
+	// t is the connected Terminal.
+	t *Terminal
+}
+
+var _ kernfs.Inode = (*slaveInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	si.IncRef()
+	fd := &slaveFileDescription{
+		inode: si,
+	}
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		si.DecRef()
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (si *slaveInode) Valid(context.Context) bool {
+	// Return valid if the slave still exists.
+	si.root.mu.Lock()
+	defer si.root.mu.Unlock()
+	_, ok := si.root.slaves[si.t.n]
+	return ok
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := si.InodeAttrs.Stat(vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
+	statx.RdevMinor = si.t.n
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+type slaveFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	inode *slaveInode
+}
+
+var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (sfd *slaveFileDescription) Release() {
+	sfd.inode.DecRef()
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
+	sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return sfd.inode.t.ld.slaveReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return sfd.inode.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescripionImpl.Ioctl.
+func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the input queue read buffer.
+		return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
+	case linux.TCGETS:
+		return sfd.inode.t.ld.getTermios(ctx, io, args)
+	case linux.TCSETS:
+		return sfd.inode.t.ld.setTermios(ctx, io, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return sfd.inode.t.ld.setTermios(ctx, io, args)
+	case linux.TIOCGPTN:
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+	case linux.TIOCGWINSZ:
+		return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
+	case linux.TIOCSWINSZ:
+		return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return sfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return sfd.inode.Stat(fs, opts)
+}
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
new file mode 100644
index 000000000..7d2781c54
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -0,0 +1,120 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Terminal is a pseudoterminal.
+//
+// +stateify savable
+type Terminal struct {
+	// n is the terminal index. It is immutable.
+	n uint32
+
+	// ld is the line discipline of the terminal. It is immutable.
+	ld *lineDiscipline
+
+	// masterKTTY contains the controlling process of the master end of
+	// this terminal. This field is immutable.
+	masterKTTY *kernel.TTY
+
+	// slaveKTTY contains the controlling process of the slave end of this
+	// terminal. This field is immutable.
+	slaveKTTY *kernel.TTY
+}
+
+func newTerminal(n uint32) *Terminal {
+	termios := linux.DefaultSlaveTermios
+	t := Terminal{
+		n:          n,
+		ld:         newLineDiscipline(termios),
+		masterKTTY: &kernel.TTY{Index: n},
+		slaveKTTY:  &kernel.TTY{Index: n},
+	}
+	return &t
+}
+
+// setControllingTTY makes tm the controlling terminal of the calling thread
+// group.
+func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setControllingTTY must be called from a task context")
+	}
+
+	return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int())
+}
+
+// releaseControllingTTY removes tm as the controlling terminal of the calling
+// thread group.
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("releaseControllingTTY must be called from a task context")
+	}
+
+	return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster))
+}
+
+// foregroundProcessGroup gets the process group ID of tm's foreground process.
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("foregroundProcessGroup must be called from a task context")
+	}
+
+	ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster))
+	if err != nil {
+		return 0, err
+	}
+
+	// Write it out to *arg.
+	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return 0, err
+}
+
+// foregroundProcessGroup sets tm's foreground process.
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setForegroundProcessGroup must be called from a task context")
+	}
+
+	// Read in the process group ID.
+	var pgid int32
+	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
+		AddressSpaceActive: true,
+	}); err != nil {
+		return 0, err
+	}
+
+	ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid))
+	return uintptr(ret), err
+}
+
+func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
+	if isMaster {
+		return tm.masterKTTY
+	}
+	return tm.slaveKTTY
+}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index e03a0c665..142ee53b0 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -28,6 +28,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
+// Name is the default filesystem name.
+const Name = "devtmpfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct {
 	initOnce sync.Once
@@ -39,6 +42,11 @@ type FilesystemType struct {
 	root *vfs.Dentry
 }
 
+// Name implements vfs.FilesystemType.Name.
+func (*FilesystemType) Name() string {
+	return Name
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	fst.initOnce.Do(func() {
@@ -107,6 +115,7 @@ func (a *Accessor) wrapContext(ctx context.Context) *accessorContext {
 func (ac *accessorContext) Value(key interface{}) interface{} {
 	switch key {
 	case vfs.CtxMountNamespace:
+		ac.a.mntns.IncRef()
 		return ac.a.mntns
 	case vfs.CtxRoot:
 		ac.a.root.IncRef()
@@ -154,16 +163,25 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
 func (a *Accessor) UserspaceInit(ctx context.Context) error {
 	actx := a.wrapContext(ctx)
 
-	// systemd: src/shared/dev-setup.c:dev_setup()
+	// Initialize symlinks.
 	for _, symlink := range []struct {
 		source string
 		target string
 	}{
-		// /proc/kcore is not implemented.
+		// systemd: src/shared/dev-setup.c:dev_setup()
 		{source: "fd", target: "/proc/self/fd"},
 		{source: "stdin", target: "/proc/self/fd/0"},
 		{source: "stdout", target: "/proc/self/fd/1"},
 		{source: "stderr", target: "/proc/self/fd/2"},
+		// /proc/kcore is not implemented.
+
+		// Linux implements /dev/ptmx as a device node, but advises
+		// container implementations to create /dev/ptmx as a symlink
+		// to pts/ptmx (Documentation/filesystems/devpts.txt). Systemd
+		// follows this advice (src/nspawn/nspawn.c:setup_pts()), while
+		// LXC tries to create a bind mount and falls back to a symlink
+		// (src/lxc/conf.c:lxc_setup_devpts()).
+		{source: "ptmx", target: "pts/ptmx"},
 	} {
 		if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil {
 			return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err)
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 73308a2b5..b6d52c015 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -29,7 +29,10 @@ func TestDevtmpfs(t *testing.T) {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
 
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	// Register tmpfs just so that we can have a root filesystem that isn't
 	// devtmpfs.
 	vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD
new file mode 100644
index 000000000..ea167d38c
--- /dev/null
+++ b/pkg/sentry/fsimpl/eventfd/BUILD
@@ -0,0 +1,33 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "eventfd",
+    srcs = ["eventfd.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fdnotifier",
+        "//pkg/log",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "eventfd_test",
+    size = "small",
+    srcs = ["eventfd_test.go"],
+    library = ":eventfd",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/vfs",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
new file mode 100644
index 000000000..c573d7935
--- /dev/null
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -0,0 +1,284 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package eventfd implements event fds.
+package eventfd
+
+import (
+	"math"
+	"sync"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// EventFileDescription implements FileDescriptionImpl for file-based event
+// notification (eventfd). Eventfds are usually internal to the Sentry but in
+// certain situations they may be converted into a host-backed eventfd.
+type EventFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	// queue is used to notify interested parties when the event object
+	// becomes readable or writable.
+	queue waiter.Queue `state:"zerovalue"`
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// val is the current value of the event counter.
+	val uint64
+
+	// semMode specifies whether the event is in "semaphore" mode.
+	semMode bool
+
+	// hostfd indicates whether this eventfd is passed through to the host.
+	hostfd int
+}
+
+var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil)
+
+// New creates a new event fd.
+func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) {
+	vd := vfsObj.NewAnonVirtualDentry("[eventfd]")
+	defer vd.DecRef()
+	efd := &EventFileDescription{
+		val:     initVal,
+		semMode: semMode,
+		hostfd:  -1,
+	}
+	if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &efd.vfsfd, nil
+}
+
+// HostFD returns the host eventfd associated with this event.
+func (efd *EventFileDescription) HostFD() (int, error) {
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+	if efd.hostfd >= 0 {
+		return efd.hostfd, nil
+	}
+
+	flags := linux.EFD_NONBLOCK
+	if efd.semMode {
+		flags |= linux.EFD_SEMAPHORE
+	}
+
+	fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0)
+	if errno != 0 {
+		return -1, errno
+	}
+
+	if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil {
+		if closeErr := syscall.Close(int(fd)); closeErr != nil {
+			log.Warningf("close(%d) eventfd failed: %v", fd, closeErr)
+		}
+		return -1, err
+	}
+
+	efd.hostfd = int(fd)
+	return efd.hostfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (efd *EventFileDescription) Release() {
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+	if efd.hostfd >= 0 {
+		fdnotifier.RemoveFD(int32(efd.hostfd))
+		if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil {
+			log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr)
+		}
+		efd.hostfd = -1
+	}
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	if dst.NumBytes() < 8 {
+		return 0, syscall.EINVAL
+	}
+	if err := efd.read(ctx, dst); err != nil {
+		return 0, err
+	}
+	return 8, nil
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	if src.NumBytes() < 8 {
+		return 0, syscall.EINVAL
+	}
+	if err := efd.write(ctx, src); err != nil {
+		return 0, err
+	}
+	return 8, nil
+}
+
+// Preconditions: Must be called with efd.mu locked.
+func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error {
+	var buf [8]byte
+	if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil {
+		if err == syscall.EWOULDBLOCK {
+			return syserror.ErrWouldBlock
+		}
+		return err
+	}
+	_, err := dst.CopyOut(ctx, buf[:])
+	return err
+}
+
+func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error {
+	efd.mu.Lock()
+	if efd.hostfd >= 0 {
+		defer efd.mu.Unlock()
+		return efd.hostReadLocked(ctx, dst)
+	}
+
+	// We can't complete the read if the value is currently zero.
+	if efd.val == 0 {
+		efd.mu.Unlock()
+		return syserror.ErrWouldBlock
+	}
+
+	// Update the value based on the mode the event is operating in.
+	var val uint64
+	if efd.semMode {
+		val = 1
+		// Consistent with Linux, this is done even if writing to memory fails.
+		efd.val--
+	} else {
+		val = efd.val
+		efd.val = 0
+	}
+
+	efd.mu.Unlock()
+
+	// Notify writers. We do this even if we were already writable because
+	// it is possible that a writer is waiting to write the maximum value
+	// to the event.
+	efd.queue.Notify(waiter.EventOut)
+
+	var buf [8]byte
+	usermem.ByteOrder.PutUint64(buf[:], val)
+	_, err := dst.CopyOut(ctx, buf[:])
+	return err
+}
+
+// Preconditions: Must be called with efd.mu locked.
+func (efd *EventFileDescription) hostWriteLocked(val uint64) error {
+	var buf [8]byte
+	usermem.ByteOrder.PutUint64(buf[:], val)
+	_, err := syscall.Write(efd.hostfd, buf[:])
+	if err == syscall.EWOULDBLOCK {
+		return syserror.ErrWouldBlock
+	}
+	return err
+}
+
+func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error {
+	var buf [8]byte
+	if _, err := src.CopyIn(ctx, buf[:]); err != nil {
+		return err
+	}
+	val := usermem.ByteOrder.Uint64(buf[:])
+
+	return efd.Signal(val)
+}
+
+// Signal is an internal function to signal the event fd.
+func (efd *EventFileDescription) Signal(val uint64) error {
+	if val == math.MaxUint64 {
+		return syscall.EINVAL
+	}
+
+	efd.mu.Lock()
+
+	if efd.hostfd >= 0 {
+		defer efd.mu.Unlock()
+		return efd.hostWriteLocked(val)
+	}
+
+	// We only allow writes that won't cause the value to go over the max
+	// uint64 minus 1.
+	if val > math.MaxUint64-1-efd.val {
+		efd.mu.Unlock()
+		return syserror.ErrWouldBlock
+	}
+
+	efd.val += val
+	efd.mu.Unlock()
+
+	// Always trigger a notification.
+	efd.queue.Notify(waiter.EventIn)
+
+	return nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+
+	if efd.hostfd >= 0 {
+		return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask)
+	}
+
+	ready := waiter.EventMask(0)
+	if efd.val > 0 {
+		ready |= waiter.EventIn
+	}
+
+	if efd.val < math.MaxUint64-1 {
+		ready |= waiter.EventOut
+	}
+
+	return mask & ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) {
+	efd.queue.EventRegister(entry, mask)
+
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+	if efd.hostfd >= 0 {
+		fdnotifier.UpdateFD(int32(efd.hostfd))
+	}
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) {
+	efd.queue.EventUnregister(entry)
+
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+	if efd.hostfd >= 0 {
+		fdnotifier.UpdateFD(int32(efd.hostfd))
+	}
+}
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go
new file mode 100644
index 000000000..20e3adffc
--- /dev/null
+++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go
@@ -0,0 +1,97 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package eventfd
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func TestEventFD(t *testing.T) {
+	initVals := []uint64{
+		0,
+		// Using a non-zero initial value verifies that writing to an
+		// eventfd signals when the eventfd's counter was already
+		// non-zero.
+		343,
+	}
+
+	for _, initVal := range initVals {
+		ctx := contexttest.Context(t)
+		vfsObj := &vfs.VirtualFilesystem{}
+		if err := vfsObj.Init(); err != nil {
+			t.Fatalf("VFS init: %v", err)
+		}
+
+		// Make a new eventfd that is writable.
+		eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR)
+		if err != nil {
+			t.Fatalf("New() failed: %v", err)
+		}
+		defer eventfd.DecRef()
+
+		// Register a callback for a write event.
+		w, ch := waiter.NewChannelEntry(nil)
+		eventfd.EventRegister(&w, waiter.EventIn)
+		defer eventfd.EventUnregister(&w)
+
+		data := []byte("00000124")
+		// Create and submit a write request.
+		n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+		if err != nil {
+			t.Fatal(err)
+		}
+		if n != 8 {
+			t.Errorf("eventfd.write wrote %d bytes, not full int64", n)
+		}
+
+		// Check if the callback fired due to the write event.
+		select {
+		case <-ch:
+		default:
+			t.Errorf("Didn't get notified of EventIn after write")
+		}
+	}
+}
+
+func TestEventFDStat(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+
+	// Make a new eventfd that is writable.
+	eventfd, err := New(vfsObj, 0, false, linux.O_RDWR)
+	if err != nil {
+		t.Fatalf("New() failed: %v", err)
+	}
+	defer eventfd.DecRef()
+
+	statx, err := eventfd.Stat(ctx, vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	})
+	if err != nil {
+		t.Fatalf("eventfd.Stat failed: %v", err)
+	}
+	if statx.Size != 0 {
+		t.Errorf("eventfd size should be 0")
+	}
+}
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 6f78f478f..ff861d0fe 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "ext",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
 go_library(
     name = "ext",
     srcs = [
@@ -26,6 +37,7 @@ go_library(
         "extent_file.go",
         "file_description.go",
         "filesystem.go",
+        "fstree.go",
         "inode.go",
         "regular_file.go",
         "symlink.go",
@@ -45,6 +57,7 @@ go_library(
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/vfs",
         "//pkg/sync",
@@ -80,8 +93,8 @@ go_test(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/test/testutil",
         "//pkg/usermem",
-        "//runsc/testutil",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
     ],
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 2015a8871..89caee3df 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -52,7 +52,10 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		return nil, nil, nil, nil, err
+	}
 	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index a080cb189..6bd1a9fc6 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -22,6 +22,10 @@ import (
 type dentry struct {
 	vfsd vfs.Dentry
 
+	// Protected by filesystem.mu.
+	parent *dentry
+	name   string
+
 	// inode is the inode represented by this dentry. Multiple Dentries may
 	// share a single non-directory Inode (with hard links). inode is
 	// immutable.
@@ -56,3 +60,15 @@ func (d *dentry) DecRef() {
 	// inode.decRef().
 	d.inode.decRef()
 }
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index ebb72b75e..12b875c8f 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -21,7 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
-	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -31,6 +30,10 @@ import (
 type directory struct {
 	inode inode
 
+	// childCache maps filenames to dentries for children for which dentries
+	// have been instantiated. childCache is protected by filesystem.mu.
+	childCache map[string]*dentry
+
 	// mu serializes the changes to childList.
 	// Lock Order (outermost locks must be taken first):
 	//   directory.mu
@@ -50,9 +53,13 @@ type directory struct {
 	childMap map[string]*dirent
 }
 
-// newDirectroy is the directory constructor.
-func newDirectroy(inode inode, newDirent bool) (*directory, error) {
-	file := &directory{inode: inode, childMap: make(map[string]*dirent)}
+// newDirectory is the directory constructor.
+func newDirectory(inode inode, newDirent bool) (*directory, error) {
+	file := &directory{
+		inode:      inode,
+		childCache: make(map[string]*dentry),
+		childMap:   make(map[string]*dirent),
+	}
 	file.inode.impl = file
 
 	// Initialize childList by reading dirents from the underlying file.
@@ -188,14 +195,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 				childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
 			}
 
-			if !cb.Handle(vfs.Dirent{
+			if err := cb.Handle(vfs.Dirent{
 				Name:    child.diskDirent.FileName(),
 				Type:    fs.ToDirentType(childType),
 				Ino:     uint64(child.diskDirent.Inode()),
 				NextOff: fd.off + 1,
-			}) {
+			}); err != nil {
 				dir.childList.InsertBefore(child, fd.iter)
-				return nil
+				return err
 			}
 			fd.off++
 		}
@@ -299,9 +306,3 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 	fd.off = offset
 	return offset, nil
 }
-
-// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
-func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
-	// mmap(2) specifies that EACCESS should be returned for non-regular file fds.
-	return syserror.EACCES
-}
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index 373d23b74..dac6effbf 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -30,6 +30,9 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// Name is the name of this filesystem.
+const Name = "ext"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
@@ -91,42 +94,61 @@ func isCompatible(sb disklayout.SuperBlock) bool {
 	return true
 }
 
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
 	// EACCESS should be returned according to mount(2). Filesystem independent
 	// flags (like readonly) are currently not available in pkg/sentry/vfs.
 
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+
 	dev, err := getDeviceFd(source, opts)
 	if err != nil {
 		return nil, nil, err
 	}
 
-	fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
-	fs.vfsfs.Init(vfsObj, &fs)
+	fs := filesystem{
+		dev:        dev,
+		inodeCache: make(map[uint32]*inode),
+		devMinor:   devMinor,
+	}
+	fs.vfsfs.Init(vfsObj, &fsType, &fs)
 	fs.sb, err = readSuperBlock(dev)
 	if err != nil {
+		fs.vfsfs.DecRef()
 		return nil, nil, err
 	}
 
 	if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
 		// mount(2) specifies that EINVAL should be returned if the superblock is
 		// invalid.
+		fs.vfsfs.DecRef()
 		return nil, nil, syserror.EINVAL
 	}
 
 	// Refuse to mount if the filesystem is incompatible.
 	if !isCompatible(fs.sb) {
+		fs.vfsfs.DecRef()
 		return nil, nil, syserror.EINVAL
 	}
 
 	fs.bgs, err = readBlockGroups(dev, fs.sb)
 	if err != nil {
+		fs.vfsfs.DecRef()
 		return nil, nil, err
 	}
 
 	rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
 	if err != nil {
+		fs.vfsfs.DecRef()
 		return nil, nil, err
 	}
 	rootInode.incRef()
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 05f992826..64e9a579f 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -32,9 +32,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/pkg/usermem"
-
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 const (
@@ -65,7 +64,10 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
@@ -496,9 +498,9 @@ func newIterDirentCb() *iterDirentsCb {
 }
 
 // Handle implements vfs.IterDirentsCallback.Handle.
-func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error {
 	cb.dirents = append(cb.dirents, dirent)
-	return true
+	return nil
 }
 
 // TestIterDirents tests the FileDescriptionImpl.IterDirents functionality.
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index e05429d41..557963e03 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -62,6 +64,10 @@ type filesystem struct {
 	// bgs represents all the block group descriptors for the filesystem.
 	// Immutable after initialization.
 	bgs []disklayout.BlockGroup
+
+	// devMinor is this filesystem's device minor number. Immutable after
+	// initialization.
+	devMinor uint32
 }
 
 // Compiles only if filesystem implements vfs.FilesystemImpl.
@@ -87,14 +93,33 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
 	}
 
 	for {
-		nextVFSD, err := rp.ResolveComponent(vfsd)
-		if err != nil {
-			return nil, nil, err
+		name := rp.Component()
+		if name == "." {
+			rp.Advance()
+			return vfsd, inode, nil
 		}
-		if nextVFSD == nil {
-			// Since the Dentry tree is not the sole source of truth for extfs, if it's
-			// not in the Dentry tree, it might need to be pulled from disk.
-			childDirent, ok := inode.impl.(*directory).childMap[rp.Component()]
+		d := vfsd.Impl().(*dentry)
+		if name == ".." {
+			isRoot, err := rp.CheckRoot(vfsd)
+			if err != nil {
+				return nil, nil, err
+			}
+			if isRoot || d.parent == nil {
+				rp.Advance()
+				return vfsd, inode, nil
+			}
+			if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+				return nil, nil, err
+			}
+			rp.Advance()
+			return &d.parent.vfsd, d.parent.inode, nil
+		}
+
+		dir := inode.impl.(*directory)
+		child, ok := dir.childCache[name]
+		if !ok {
+			// We may need to instantiate a new dentry for this child.
+			childDirent, ok := dir.childMap[name]
 			if !ok {
 				// The underlying inode does not exist on disk.
 				return nil, nil, syserror.ENOENT
@@ -113,21 +138,22 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
 			}
 			// incRef because this is being added to the dentry tree.
 			childInode.incRef()
-			child := newDentry(childInode)
-			vfsd.InsertChild(&child.vfsd, rp.Component())
-
-			// Continue as usual now that nextVFSD is not nil.
-			nextVFSD = &child.vfsd
+			child = newDentry(childInode)
+			child.parent = d
+			child.name = name
+			dir.childCache[name] = child
 		}
-		nextInode := nextVFSD.Impl().(*dentry).inode
-		if nextInode.isSymlink() && rp.ShouldFollowSymlink() {
-			if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil {
+		if err := rp.CheckMount(&child.vfsd); err != nil {
+			return nil, nil, err
+		}
+		if child.inode.isSymlink() && rp.ShouldFollowSymlink() {
+			if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil {
 				return nil, nil, err
 			}
 			continue
 		}
 		rp.Advance()
-		return nextVFSD, nextInode, nil
+		return &child.vfsd, child.inode, nil
 	}
 }
 
@@ -255,6 +281,15 @@ func (fs *filesystem) statTo(stat *linux.Statfs) {
 	// TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+	return inode.checkPermissions(rp.Credentials(), ats)
+}
+
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	vfsd, inode, err := fs.walk(rp, false)
@@ -335,7 +370,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 }
 
 // Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {}
+func (fs *filesystem) Release() {
+	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+}
 
 // Sync implements vfs.FilesystemImpl.Sync.
 func (fs *filesystem) Sync(ctx context.Context) error {
@@ -453,8 +490,22 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+	if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+
+	// TODO(b/134676337): Support sockets.
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	_, _, err := fs.walk(rp, false)
 	if err != nil {
 		return nil, err
@@ -463,7 +514,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	_, _, err := fs.walk(rp, false)
 	if err != nil {
 		return "", err
@@ -493,5 +544,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 6962083f5..485f86f4b 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -136,7 +136,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 		}
 		return &f.inode, nil
 	case linux.ModeDirectory:
-		f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+		f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType)
 		if err != nil {
 			return nil, err
 		}
@@ -186,7 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 }
 
 func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
-	return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+	return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID())
 }
 
 // statTo writes the statx fields to the output parameter.
@@ -204,6 +204,8 @@ func (in *inode) statTo(stat *linux.Statx) {
 	stat.Atime = in.diskInode.AccessTime().StatxTimestamp()
 	stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp()
 	stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp()
+	stat.DevMajor = linux.UNNAMED_MAJOR
+	stat.DevMinor = in.fs.devMinor
 	// TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks
 	// (including metadata blocks) required to represent this file.
 }
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 4ba76a1e8..f5f35a3bc 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
@@ -15,18 +15,30 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "gofer",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
 go_library(
     name = "gofer",
     srcs = [
         "dentry_list.go",
         "directory.go",
         "filesystem.go",
+        "fstree.go",
         "gofer.go",
         "handle.go",
-        "handle_unsafe.go",
+        "host_named_pipe.go",
         "p9file.go",
-        "pagemath.go",
         "regular_file.go",
+        "socket.go",
         "special_file.go",
         "symlink.go",
         "time.go",
@@ -36,20 +48,41 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fd",
+        "//pkg/fdnotifier",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
         "//pkg/safemem",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/hostfd",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
+        "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/unet",
         "//pkg/usermem",
+        "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "gofer_test",
+    srcs = ["gofer_test.go"],
+    library = ":gofer",
+    deps = [
+        "//pkg/p9",
+        "//pkg/sentry/contexttest",
     ],
 )
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 6d4ebc2bf..b98218753 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -15,27 +15,100 @@
 package gofer
 
 import (
+	"fmt"
 	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func (d *dentry) isDir() bool {
 	return d.fileType() == linux.S_IFDIR
 }
 
-// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop !=
-// InteropModeShared.
-func (d *dentry) cacheNegativeChildLocked(name string) {
-	if d.negativeChildren == nil {
-		d.negativeChildren = make(map[string]struct{})
+// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
+// d.isDir(). child must be a newly-created dentry that has never had a parent.
+func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
+	d.IncRef() // reference held by child on its parent
+	child.parent = d
+	child.name = name
+	if d.children == nil {
+		d.children = make(map[string]*dentry)
 	}
-	d.negativeChildren[name] = struct{}{}
+	d.children[name] = child
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) cacheNegativeLookupLocked(name string) {
+	// Don't cache negative lookups if InteropModeShared is in effect (since
+	// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
+	// case the only files in the directory are those for which a dentry exists
+	// in d.children). Instead, just delete any previously-cached dentry.
+	if d.fs.opts.interop == InteropModeShared || d.isSynthetic() {
+		delete(d.children, name)
+		return
+	}
+	if d.children == nil {
+		d.children = make(map[string]*dentry)
+	}
+	d.children[name] = nil
+}
+
+type createSyntheticOpts struct {
+	name string
+	mode linux.FileMode
+	kuid auth.KUID
+	kgid auth.KGID
+
+	// The endpoint for a synthetic socket. endpoint should be nil if the file
+	// being created is not a socket.
+	endpoint transport.BoundEndpoint
+
+	// pipe should be nil if the file being created is not a pipe.
+	pipe *pipe.VFSPipe
+}
+
+// createSyntheticChildLocked creates a synthetic file with the given name
+// in d.
+//
+// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
+// a child with the given name.
+func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
+	d2 := &dentry{
+		refs:      1, // held by d
+		fs:        d.fs,
+		mode:      uint32(opts.mode),
+		uid:       uint32(opts.kuid),
+		gid:       uint32(opts.kgid),
+		blockSize: usermem.PageSize, // arbitrary
+		handle: handle{
+			fd: -1,
+		},
+		nlink: uint32(2),
+	}
+	switch opts.mode.FileType() {
+	case linux.S_IFDIR:
+		// Nothing else needs to be done.
+	case linux.S_IFSOCK:
+		d2.endpoint = opts.endpoint
+	case linux.S_IFIFO:
+		d2.pipe = opts.pipe
+	default:
+		panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
+	}
+	d2.pf.dentry = d2
+	d2.vfsd.Init(d2)
+
+	d.cacheNewChildLocked(d2, opts.name)
+	d.syntheticChildren++
 }
 
 type directoryFD struct {
@@ -56,17 +129,22 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
 
+	d := fd.dentry()
 	if fd.dirents == nil {
-		ds, err := fd.dentry().getDirents(ctx)
+		ds, err := d.getDirents(ctx)
 		if err != nil {
 			return err
 		}
 		fd.dirents = ds
 	}
 
+	if d.cachedMetadataAuthoritative() {
+		d.touchAtime(fd.vfsfd.Mount())
+	}
+
 	for fd.off < int64(len(fd.dirents)) {
-		if !cb.Handle(fd.dirents[fd.off]) {
-			return nil
+		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -75,23 +153,21 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 
 // Preconditions: d.isDir(). There exists at least one directoryFD representing d.
 func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
-	// 9P2000.L's readdir does not specify behavior in the presence of
-	// concurrent mutation of an iterated directory, so implementations may
-	// duplicate or omit entries in this case, which violates POSIX semantics.
-	// Thus we read all directory entries while holding d.dirMu to exclude
-	// directory mutations. (Note that it is impossible for the client to
-	// exclude concurrent mutation from other remote filesystem users. Since
-	// there is no way to detect if the server has incorrectly omitted
-	// directory entries, we simply assume that the server is well-behaved
-	// under InteropModeShared.) This is inconsistent with Linux (which appears
-	// to assume that directory fids have the correct semantics, and translates
-	// struct file_operations::readdir calls directly to readdir RPCs), but is
-	// consistent with VFS1.
-	//
-	// NOTE(b/135560623): In particular, some gofer implementations may not
-	// retain state between calls to Readdir, so may not provide a coherent
-	// directory stream across in the presence of mutation.
+	// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
+	// presence of concurrent mutation of an iterated directory, so
+	// implementations may duplicate or omit entries in this case, which
+	// violates POSIX semantics. Thus we read all directory entries while
+	// holding d.dirMu to exclude directory mutations. (Note that it is
+	// impossible for the client to exclude concurrent mutation from other
+	// remote filesystem users. Since there is no way to detect if the server
+	// has incorrectly omitted directory entries, we simply assume that the
+	// server is well-behaved under InteropModeShared.) This is inconsistent
+	// with Linux (which appears to assume that directory fids have the correct
+	// semantics, and translates struct file_operations::readdir calls directly
+	// to readdir RPCs), but is consistent with VFS1.
 
+	// filesystem.renameMu is needed for d.parent, and must be locked before
+	// dentry.dirMu.
 	d.fs.renameMu.RLock()
 	defer d.fs.renameMu.RUnlock()
 	d.dirMu.Lock()
@@ -102,7 +178,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 
 	// It's not clear if 9P2000.L's readdir is expected to return "." and "..",
 	// so we generate them here.
-	parent := d.vfsd.ParentOrSelf().Impl().(*dentry)
+	parent := genericParentOrSelf(d)
 	dirents := []vfs.Dirent{
 		{
 			Name:    ".",
@@ -117,50 +193,81 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 			NextOff: 2,
 		},
 	}
-	off := uint64(0)
-	const count = 64 * 1024 // for consistency with the vfs1 client
-	d.handleMu.RLock()
-	defer d.handleMu.RUnlock()
-	if !d.handleReadable {
-		// This should not be possible because a readable handle should have
-		// been opened when the calling directoryFD was opened.
-		panic("gofer.dentry.getDirents called without a readable handle")
-	}
-	for {
-		p9ds, err := d.handle.file.readdir(ctx, off, count)
-		if err != nil {
-			return nil, err
+	var realChildren map[string]struct{}
+	if !d.isSynthetic() {
+		if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared {
+			// Record the set of children d actually has so that we don't emit
+			// duplicate entries for synthetic children.
+			realChildren = make(map[string]struct{})
 		}
-		if len(p9ds) == 0 {
-			// Cache dirents for future directoryFDs if permitted.
-			if d.fs.opts.interop != InteropModeShared {
-				d.dirents = dirents
+		off := uint64(0)
+		const count = 64 * 1024 // for consistency with the vfs1 client
+		d.handleMu.RLock()
+		if !d.handleReadable {
+			// This should not be possible because a readable handle should
+			// have been opened when the calling directoryFD was opened.
+			d.handleMu.RUnlock()
+			panic("gofer.dentry.getDirents called without a readable handle")
+		}
+		for {
+			p9ds, err := d.handle.file.readdir(ctx, off, count)
+			if err != nil {
+				d.handleMu.RUnlock()
+				return nil, err
+			}
+			if len(p9ds) == 0 {
+				d.handleMu.RUnlock()
+				break
 			}
-			return dirents, nil
+			for _, p9d := range p9ds {
+				if p9d.Name == "." || p9d.Name == ".." {
+					continue
+				}
+				dirent := vfs.Dirent{
+					Name:    p9d.Name,
+					Ino:     p9d.QID.Path,
+					NextOff: int64(len(dirents) + 1),
+				}
+				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+				// DMSOCKET.
+				switch p9d.Type {
+				case p9.TypeSymlink:
+					dirent.Type = linux.DT_LNK
+				case p9.TypeDir:
+					dirent.Type = linux.DT_DIR
+				default:
+					dirent.Type = linux.DT_REG
+				}
+				dirents = append(dirents, dirent)
+				if realChildren != nil {
+					realChildren[p9d.Name] = struct{}{}
+				}
+			}
+			off = p9ds[len(p9ds)-1].Offset
 		}
-		for _, p9d := range p9ds {
-			if p9d.Name == "." || p9d.Name == ".." {
+	}
+	// Emit entries for synthetic children.
+	if d.syntheticChildren != 0 {
+		for _, child := range d.children {
+			if child == nil || !child.isSynthetic() {
 				continue
 			}
-			dirent := vfs.Dirent{
-				Name:    p9d.Name,
-				Ino:     p9d.QID.Path,
-				NextOff: int64(len(dirents) + 1),
-			}
-			// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
-			// DMSOCKET.
-			switch p9d.Type {
-			case p9.TypeSymlink:
-				dirent.Type = linux.DT_LNK
-			case p9.TypeDir:
-				dirent.Type = linux.DT_DIR
-			default:
-				dirent.Type = linux.DT_REG
+			if _, ok := realChildren[child.name]; ok {
+				continue
 			}
-			dirents = append(dirents, dirent)
+			dirents = append(dirents, vfs.Dirent{
+				Name:    child.name,
+				Type:    uint8(atomic.LoadUint32(&child.mode) >> 12),
+				Ino:     child.ino,
+				NextOff: int64(len(dirents) + 1),
+			})
 		}
-		off = p9ds[len(p9ds)-1].Offset
 	}
+	// Cache dirents for future directoryFDs if permitted.
+	if d.cachedMetadataAuthoritative() {
+		d.dirents = dirents
+	}
+	return dirents, nil
 }
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 138adb9f7..36e0e1856 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -21,20 +21,28 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Sync implements vfs.FilesystemImpl.Sync.
 func (fs *filesystem) Sync(ctx context.Context) error {
-	// Snapshot current dentries and special files.
+	// Snapshot current syncable dentries and special files.
 	fs.syncMu.Lock()
-	ds := make([]*dentry, 0, len(fs.dentries))
-	for d := range fs.dentries {
+	ds := make([]*dentry, 0, len(fs.syncableDentries))
+	for d := range fs.syncableDentries {
+		d.IncRef()
 		ds = append(ds, d)
 	}
 	sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs))
 	for sffd := range fs.specialFileFDs {
+		sffd.vfsfd.IncRef()
 		sffds = append(sffds, sffd)
 	}
 	fs.syncMu.Unlock()
@@ -45,9 +53,6 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 
 	// Sync regular files.
 	for _, d := range ds {
-		if !d.TryIncRef() {
-			continue
-		}
 		err := d.syncSharedHandle(ctx)
 		d.DecRef()
 		if err != nil && retErr == nil {
@@ -58,9 +63,6 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 	// Sync special files, which may be writable but do not use dentry shared
 	// handles (so they won't be synced by the above).
 	for _, sffd := range sffds {
-		if !sffd.vfsfd.TryIncRef() {
-			continue
-		}
 		err := sffd.Sync(ctx)
 		sffd.vfsfd.DecRef()
 		if err != nil && retErr == nil {
@@ -112,13 +114,15 @@ func putDentrySlice(ds *[]*dentry) {
 // to *ds.
 //
 // Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached
-// metadata must be up to date.
+// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
+// must be up to date.
+//
+// Postconditions: The returned dentry's cached metadata is up to date.
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
 	}
-	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 afterSymlink:
@@ -128,39 +132,42 @@ afterSymlink:
 		return d, nil
 	}
 	if name == ".." {
-		parentVFSD, err := rp.ResolveParent(&d.vfsd)
-		if err != nil {
+		if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
 			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
 		}
-		parent := parentVFSD.Impl().(*dentry)
-		if fs.opts.interop == InteropModeShared {
-			// We must assume that parentVFSD is correct, because if d has been
-			// moved elsewhere in the remote filesystem so that its parent has
-			// changed, we have no way of determining its new parent's location
-			// in the filesystem. Get updated metadata for parentVFSD.
-			_, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask())
+		// We must assume that d.parent is correct, because if d has been moved
+		// elsewhere in the remote filesystem so that its parent has changed,
+		// we have no way of determining its new parent's location in the
+		// filesystem.
+		//
+		// Call rp.CheckMount() before updating d.parent's metadata, since if
+		// we traverse to another mount then d.parent's metadata is irrelevant.
+		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		if d != d.parent && !d.cachedMetadataAuthoritative() {
+			_, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
 			if err != nil {
 				return nil, err
 			}
-			parent.updateFromP9Attrs(attrMask, &attr)
+			d.parent.updateFromP9Attrs(attrMask, &attr)
 		}
 		rp.Advance()
-		return parent, nil
+		return d.parent, nil
 	}
-	childVFSD, err := rp.ResolveChild(&d.vfsd, name)
-	if err != nil {
-		return nil, err
-	}
-	// FIXME(jamieliu): Linux performs revalidation before mount lookup
-	// (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(),
-	// __follow_mount_rcu()).
-	child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds)
+	child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds)
 	if err != nil {
 		return nil, err
 	}
 	if child == nil {
 		return nil, syserror.ENOENT
 	}
+	if err := rp.CheckMount(&child.vfsd); err != nil {
+		return nil, err
+	}
 	if child.isSymlink() && rp.ShouldFollowSymlink() {
 		target, err := child.readlink(ctx, rp.Mount())
 		if err != nil {
@@ -175,38 +182,37 @@ afterSymlink:
 	return child, nil
 }
 
-// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
-// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
-// nil) to verify that the returned child (or lack thereof) is correct. If no file
-// exists at name, revalidateChildLocked returns (nil, nil).
+// getChildLocked returns a dentry representing the child of parent with the
+// given name. If no such child exists, getChildLocked returns (nil, nil).
 //
 // Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
 // parent.isDir(). name is not "." or "..".
 //
-// Postconditions: If revalidateChildLocked returns a non-nil dentry, its
-// cached metadata is up to date.
-func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) {
-	if childVFSD != nil && fs.opts.interop != InteropModeShared {
-		// We have a cached dentry that is assumed to be correct.
-		return childVFSD.Impl().(*dentry), nil
-	}
-	// We either don't have a cached dentry or need to verify that it's still
-	// correct, either of which requires a remote lookup. Check if this name is
-	// valid before performing the lookup.
+// Postconditions: If getChildLocked returns a non-nil dentry, its cached
+// metadata is up to date.
+func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
 	if len(name) > maxFilenameLen {
 		return nil, syserror.ENAMETOOLONG
 	}
-	// Check if we've already cached this lookup with a negative result.
-	if _, ok := parent.negativeChildren[name]; ok {
-		return nil, nil
+	child, ok := parent.children[name]
+	if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() {
+		// Whether child is nil or not, it is cached information that is
+		// assumed to be correct.
+		return child, nil
 	}
-	// Perform the remote lookup.
+	// We either don't have cached information or need to verify that it's
+	// still correct, either of which requires a remote lookup. Check if this
+	// name is valid before performing the lookup.
+	return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
+}
+
+// Preconditions: As for getChildLocked. !parent.isSynthetic().
+func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
 	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
 	if err != nil && err != syserror.ENOENT {
 		return nil, err
 	}
-	if childVFSD != nil {
-		child := childVFSD.Impl().(*dentry)
+	if child != nil {
 		if !file.isNil() && qid.Path == child.ino {
 			// The file at this path hasn't changed. Just update cached
 			// metadata.
@@ -214,29 +220,44 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 			child.updateFromP9Attrs(attrMask, &attr)
 			return child, nil
 		}
-		// The file at this path has changed or no longer exists. Remove
-		// the stale dentry from the tree, and re-evaluate its caching
-		// status (i.e. if it has 0 references, drop it).
-		vfsObj.ForceDeleteDentry(childVFSD)
+		if file.isNil() && child.isSynthetic() {
+			// We have a synthetic file, and no remote file has arisen to
+			// replace it.
+			return child, nil
+		}
+		// The file at this path has changed or no longer exists. Mark the
+		// dentry invalidated, and re-evaluate its caching status (i.e. if it
+		// has 0 references, drop it). Wait to update parent.children until we
+		// know what to replace the existing dentry with (i.e. one of the
+		// returns below), to avoid a redundant map access.
+		vfsObj.InvalidateDentry(&child.vfsd)
+		if child.isSynthetic() {
+			// Normally we don't mark invalidated dentries as deleted since
+			// they may still exist (but at a different path), and also for
+			// consistency with Linux. However, synthetic files are guaranteed
+			// to become unreachable if their dentries are invalidated, so
+			// treat their invalidation as deletion.
+			child.setDeleted()
+			parent.syntheticChildren--
+			child.decRefLocked()
+			parent.dirents = nil
+		}
 		*ds = appendDentry(*ds, child)
-		childVFSD = nil
 	}
 	if file.isNil() {
 		// No file exists at this path now. Cache the negative lookup if
 		// allowed.
-		if fs.opts.interop != InteropModeShared {
-			parent.cacheNegativeChildLocked(name)
-		}
+		parent.cacheNegativeLookupLocked(name)
 		return nil, nil
 	}
 	// Create a new dentry representing the file.
-	child, err := fs.newDentry(ctx, file, qid, attrMask, &attr)
+	child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
 	if err != nil {
 		file.close(ctx)
+		delete(parent.children, name)
 		return nil, err
 	}
-	parent.IncRef() // reference held by child on its parent
-	parent.vfsd.InsertChild(&child.vfsd, name)
+	parent.cacheNewChildLocked(child, name)
 	// For now, child has 0 references, so our caller should call
 	// child.checkCachingLocked().
 	*ds = appendDentry(*ds, child)
@@ -248,8 +269,9 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop ==
-// InteropModeShared, then d's cached metadata must be up to date.
+// Preconditions: fs.renameMu must be locked. !rp.Done(). If
+// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
+// date.
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -271,7 +293,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
 // Preconditions: fs.renameMu must be locked.
 func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
 	d := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !d.cachedMetadataAuthoritative() {
 		// Get updated metadata for rp.Start() as required by fs.stepLocked().
 		if err := d.updateFromGetattr(ctx); err != nil {
 			return nil, err
@@ -293,16 +315,17 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 }
 
 // doCreateAt checks that creating a file at rp is permitted, then invokes
-// create to do so.
+// createInRemoteDir (if the parent directory is a real remote directory) or
+// createInSyntheticDir (if the parent directory is synthetic) to do so.
 //
 // Preconditions: !rp.Done(). For the final path component in rp,
 // !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	start := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !start.cachedMetadataAuthoritative() {
 		// Get updated metadata for start as required by
 		// fs.walkParentDirLocked().
 		if err := start.updateFromGetattr(ctx); err != nil {
@@ -313,12 +336,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err != nil {
 		return err
 	}
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
-	if parent.isDeleted() {
-		return syserror.ENOENT
-	}
 	name := rp.Component()
 	if name == "." || name == ".." {
 		return syserror.EEXIST
@@ -329,6 +349,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if !dir && rp.MustBeDir() {
 		return syserror.ENOENT
 	}
+	if parent.isDeleted() {
+		return syserror.ENOENT
+	}
 	mnt := rp.Mount()
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return err
@@ -336,6 +359,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	defer mnt.EndWrite()
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
+	if parent.isSynthetic() {
+		if child := parent.children[name]; child != nil {
+			return syserror.EEXIST
+		}
+		if createInSyntheticDir == nil {
+			return syserror.EPERM
+		}
+		if err := createInSyntheticDir(parent, name); err != nil {
+			return err
+		}
+		parent.touchCMtime()
+		parent.dirents = nil
+		return nil
+	}
 	if fs.opts.interop == InteropModeShared {
 		// The existence of a dentry at name would be inconclusive because the
 		// file it represents may have been deleted from the remote filesystem,
@@ -344,18 +381,21 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		// will fail with EEXIST like we would have. If the RPC succeeds, and a
 		// stale dentry exists, the dentry will fail revalidation next time
 		// it's used.
-		return create(parent, name)
+		return createInRemoteDir(parent, name)
 	}
-	if parent.vfsd.Child(name) != nil {
+	if child := parent.children[name]; child != nil {
 		return syserror.EEXIST
 	}
 	// No cached dentry exists; however, there might still be an existing file
 	// at name. As above, we attempt the file creation RPC anyway.
-	if err := create(parent, name); err != nil {
+	if err := createInRemoteDir(parent, name); err != nil {
 		return err
 	}
-	parent.touchCMtime(ctx)
-	delete(parent.negativeChildren, name)
+	if child, ok := parent.children[name]; ok && child == nil {
+		// Delete the now-stale negative dentry.
+		delete(parent.children, name)
+	}
+	parent.touchCMtime()
 	parent.dirents = nil
 	return nil
 }
@@ -366,7 +406,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	start := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !start.cachedMetadataAuthoritative() {
 		// Get updated metadata for start as required by
 		// fs.walkParentDirLocked().
 		if err := start.updateFromGetattr(ctx); err != nil {
@@ -377,7 +417,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	if err != nil {
 		return err
 	}
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -400,66 +440,110 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	}
 	vfsObj := rp.VirtualFilesystem()
 	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
 	parent.dirMu.Lock()
 	defer parent.dirMu.Unlock()
-	childVFSD := parent.vfsd.Child(name)
-	var child *dentry
+	child, ok := parent.children[name]
+	if ok && child == nil {
+		return syserror.ENOENT
+	}
 	// We only need a dentry representing the file at name if it can be a mount
-	// point. If childVFSD is nil, then it can't be a mount point. If childVFSD
-	// is non-nil but stale, the actual file can't be a mount point either; we
+	// point. If child is nil, then it can't be a mount point. If child is
+	// non-nil but stale, the actual file can't be a mount point either; we
 	// detect this case by just speculatively calling PrepareDeleteDentry and
 	// only revalidating the dentry if that fails (indicating that the existing
 	// dentry is a mount point).
-	if childVFSD != nil {
-		child = childVFSD.Impl().(*dentry)
-		if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
-			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds)
+	if child != nil {
+		child.dirMu.Lock()
+		defer child.dirMu.Unlock()
+		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+			if parent.cachedMetadataAuthoritative() {
+				return err
+			}
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
 			if err != nil {
 				return err
 			}
 			if child != nil {
-				childVFSD = &child.vfsd
-				if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+				if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 					return err
 				}
-			} else {
-				childVFSD = nil
 			}
 		}
-	} else if _, ok := parent.negativeChildren[name]; ok {
-		return syserror.ENOENT
 	}
 	flags := uint32(0)
+	// If a dentry exists, use it for best-effort checks on its deletability.
 	if dir {
-		if child != nil && !child.isDir() {
-			return syserror.ENOTDIR
+		if child != nil {
+			// child must be an empty directory.
+			if child.syntheticChildren != 0 {
+				// This is definitely not an empty directory, irrespective of
+				// fs.opts.interop.
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+				return syserror.ENOTEMPTY
+			}
+			// If InteropModeShared is in effect and the first call to
+			// PrepareDeleteDentry above succeeded, then child wasn't
+			// revalidated (so we can't expect its file type to be correct) and
+			// individually revalidating its children (to confirm that they
+			// still exist) would be a waste of time.
+			if child.cachedMetadataAuthoritative() {
+				if !child.isDir() {
+					vfsObj.AbortDeleteDentry(&child.vfsd)
+					return syserror.ENOTDIR
+				}
+				for _, grandchild := range child.children {
+					if grandchild != nil {
+						vfsObj.AbortDeleteDentry(&child.vfsd)
+						return syserror.ENOTEMPTY
+					}
+				}
+			}
 		}
 		flags = linux.AT_REMOVEDIR
 	} else {
+		// child must be a non-directory file.
 		if child != nil && child.isDir() {
+			vfsObj.AbortDeleteDentry(&child.vfsd)
 			return syserror.EISDIR
 		}
 		if rp.MustBeDir() {
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+			}
 			return syserror.ENOTDIR
 		}
 	}
-	err = parent.file.unlinkAt(ctx, name, flags)
-	if err != nil {
-		if childVFSD != nil {
-			vfsObj.AbortDeleteDentry(childVFSD)
+	if parent.isSynthetic() {
+		if child == nil {
+			return syserror.ENOENT
+		}
+	} else {
+		err = parent.file.unlinkAt(ctx, name, flags)
+		if err != nil {
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+			}
+			return err
 		}
-		return err
-	}
-	if fs.opts.interop != InteropModeShared {
-		parent.touchCMtime(ctx)
-		parent.cacheNegativeChildLocked(name)
-		parent.dirents = nil
 	}
 	if child != nil {
+		vfsObj.CommitDeleteDentry(&child.vfsd)
 		child.setDeleted()
-		vfsObj.CommitDeleteDentry(childVFSD)
+		if child.isSynthetic() {
+			parent.syntheticChildren--
+			child.decRefLocked()
+		}
 		ds = appendDentry(ds, child)
 	}
+	parent.cacheNegativeLookupLocked(name)
+	if parent.cachedMetadataAuthoritative() {
+		parent.dirents = nil
+		parent.touchCMtime()
+		if dir {
+			parent.decLinks()
+		}
+	}
 	return nil
 }
 
@@ -498,6 +582,18 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) {
 	putDentrySlice(*ds)
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.checkPermissions(creds, ats)
+}
+
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	var ds *[]*dentry
@@ -511,7 +607,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 		if !d.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
@@ -525,7 +621,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 	start := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !start.cachedMetadataAuthoritative() {
 		// Get updated metadata for start as required by
 		// fs.walkParentDirLocked().
 		if err := start.updateFromGetattr(ctx); err != nil {
@@ -548,15 +644,42 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		}
 		// 9P2000.L supports hard links, but we don't.
 		return syserror.EPERM
-	})
+	}, nil)
 }
 
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	creds := rp.Credentials()
 	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
-		creds := rp.Credentials()
-		_, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
-		return err
+		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
+			if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+				return err
+			}
+			ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name: name,
+				mode: linux.S_IFDIR | opts.Mode,
+				kuid: creds.EffectiveKUID,
+				kgid: creds.EffectiveKGID,
+			})
+		}
+		if fs.opts.interop != InteropModeShared {
+			parent.incLinks()
+		}
+		return nil
+	}, func(parent *dentry, name string) error {
+		if !opts.ForSyntheticMountpoint {
+			// Can't create non-synthetic files in synthetic directories.
+			return syserror.EPERM
+		}
+		parent.createSyntheticChildLocked(&createSyntheticOpts{
+			name: name,
+			mode: linux.S_IFDIR | opts.Mode,
+			kuid: creds.EffectiveKUID,
+			kgid: creds.EffectiveKGID,
+		})
+		parent.incLinks()
+		return nil
 	})
 }
 
@@ -565,8 +688,32 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
 		creds := rp.Credentials()
 		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		// If the gofer does not allow creating a socket or pipe, create a
+		// synthetic one, i.e. one that is kept entirely in memory.
+		if err == syserror.EPERM {
+			switch opts.Mode.FileType() {
+			case linux.S_IFSOCK:
+				parent.createSyntheticChildLocked(&createSyntheticOpts{
+					name:     name,
+					mode:     opts.Mode,
+					kuid:     creds.EffectiveKUID,
+					kgid:     creds.EffectiveKGID,
+					endpoint: opts.Endpoint,
+				})
+				return nil
+			case linux.S_IFIFO:
+				parent.createSyntheticChildLocked(&createSyntheticOpts{
+					name: name,
+					mode: opts.Mode,
+					kuid: creds.EffectiveKUID,
+					kgid: creds.EffectiveKGID,
+					pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+				})
+				return nil
+			}
+		}
 		return err
-	})
+	}, nil)
 }
 
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
@@ -586,7 +733,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
 
 	start := rp.Start().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !start.cachedMetadataAuthoritative() {
 		// Get updated metadata for start as required by fs.stepLocked().
 		if err := start.updateFromGetattr(ctx); err != nil {
 			return nil, err
@@ -602,14 +749,18 @@ afterTrailingSymlink:
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Determine whether or not we need to create a file.
 	parent.dirMu.Lock()
 	child, err := fs.stepLocked(ctx, rp, parent, &ds)
 	if err == syserror.ENOENT && mayCreate {
-		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
+		if parent.isSynthetic() {
+			parent.dirMu.Unlock()
+			return nil, syserror.EPERM
+		}
+		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds)
 		parent.dirMu.Unlock()
 		return fd, err
 	}
@@ -639,24 +790,25 @@ afterTrailingSymlink:
 // Preconditions: fs.renameMu must be locked.
 func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
-	if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
+	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
 	mnt := rp.Mount()
-	filetype := d.fileType()
-	switch {
-	case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD:
-		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
-			return nil, err
-		}
-		fd := &regularFileFD{}
-		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
-			AllowDirectIO: true,
-		}); err != nil {
-			return nil, err
+	switch d.fileType() {
+	case linux.S_IFREG:
+		if !d.fs.opts.regularFilesUseSpecialFileFD {
+			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
+				return nil, err
+			}
+			fd := &regularFileFD{}
+			if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+				AllowDirectIO: true,
+			}); err != nil {
+				return nil, err
+			}
+			return &fd.vfsfd, nil
 		}
-		return &fd.vfsfd, nil
-	case filetype == linux.S_IFDIR:
+	case linux.S_IFDIR:
 		// Can't open directories with O_CREAT.
 		if opts.Flags&linux.O_CREAT != 0 {
 			return nil, syserror.EISDIR
@@ -668,39 +820,100 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 		if opts.Flags&linux.O_DIRECT != 0 {
 			return nil, syserror.EINVAL
 		}
-		if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
-			return nil, err
+		if !d.isSynthetic() {
+			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
+				return nil, err
+			}
 		}
 		fd := &directoryFD{}
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
-	case filetype == linux.S_IFLNK:
+	case linux.S_IFLNK:
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
-	default:
-		if opts.Flags&linux.O_DIRECT != 0 {
-			return nil, syserror.EINVAL
+	case linux.S_IFSOCK:
+		if d.isSynthetic() {
+			return nil, syserror.ENXIO
 		}
-		h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0)
-		if err != nil {
-			return nil, err
+		if d.fs.iopts.OpenSocketsByConnecting {
+			return d.connectSocketLocked(ctx, opts)
 		}
-		fd := &specialFileFD{
-			handle: h,
+	case linux.S_IFIFO:
+		if d.isSynthetic() {
+			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags)
 		}
-		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+	}
+	return d.openSpecialFileLocked(ctx, mnt, opts)
+}
+
+func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if opts.Flags&linux.O_DIRECT != 0 {
+		return nil, syserror.EINVAL
+	}
+	fdObj, err := d.file.connect(ctx, p9.AnonymousSocket)
+	if err != nil {
+		return nil, err
+	}
+	fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{
+		HaveFlags: true,
+		Flags:     opts.Flags,
+	})
+	if err != nil {
+		fdObj.Close()
+		return nil, err
+	}
+	fdObj.Release()
+	return fd, nil
+}
+
+func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
+	if opts.Flags&linux.O_DIRECT != 0 {
+		return nil, syserror.EINVAL
+	}
+	// We assume that the server silently inserts O_NONBLOCK in the open flags
+	// for all named pipes (because all existing gofers do this).
+	//
+	// NOTE(b/133875563): This makes named pipe opens racy, because the
+	// mechanisms for translating nonblocking to blocking opens can only detect
+	// the instantaneous presence of a peer holding the other end of the pipe
+	// open, not whether the pipe was *previously* opened by a peer that has
+	// since closed its end.
+	isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
+retry:
+	h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
+	if err != nil {
+		if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO {
+			// An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
+			// with ENXIO if opening the same named pipe with O_WRONLY would
+			// block because there are no readers of the pipe.
+			if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+				return nil, err
+			}
+			goto retry
+		}
+		return nil, err
+	}
+	if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 {
+		if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil {
 			h.close(ctx)
 			return nil, err
 		}
-		return &fd.vfsfd, nil
 	}
+	fd, err := newSpecialFileFD(h, mnt, d, opts.Flags)
+	if err != nil {
+		h.close(ctx)
+		return nil, err
+	}
+	return &fd.vfsfd, nil
 }
 
 // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
-func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
-	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+// !d.isSynthetic().
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
 	}
 	if d.isDeleted() {
@@ -721,7 +934,11 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	}
 	creds := rp.Credentials()
 	name := rp.Component()
-	fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+	// Filter file creation flags and O_LARGEFILE out; the create RPC already
+	// has the semantics of O_CREAT|O_EXCL, while some servers will choke on
+	// O_LARGEFILE.
+	createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE))
+	fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 	if err != nil {
 		dirfile.close(ctx)
 		return nil, err
@@ -729,7 +946,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	// Then we need to walk to the file we just created to get a non-open fid
 	// representing it, and to get its metadata. This must use d.file since, as
 	// explained above, dirfile was invalidated by dirfile.Create().
-	walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
+	_, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
 	if err != nil {
 		openFile.close(ctx)
 		if fdobj != nil {
@@ -737,17 +954,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		return nil, err
 	}
-	// Sanity-check that we walked to the file we created.
-	if createQID.Path != walkQID.Path {
-		// Probably due to concurrent remote filesystem mutation?
-		ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop)
-		nonOpenFile.close(ctx)
-		openFile.close(ctx)
-		if fdobj != nil {
-			fdobj.Close()
-		}
-		return nil, syserror.EAGAIN
-	}
 
 	// Construct the new dentry.
 	child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
@@ -759,6 +965,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		return nil, err
 	}
+	*ds = appendDentry(*ds, child)
 	// Incorporate the fid that was opened by lcreate.
 	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
 	if useRegularFileFD {
@@ -771,22 +978,15 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
 		child.handleMu.Unlock()
 	}
-	// Take a reference on the new dentry to be held by the new file
-	// description. (This reference also means that the new dentry is not
-	// eligible for caching yet, so we don't need to append to a dentry slice.)
-	child.refs = 1
 	// Insert the dentry into the tree.
-	d.IncRef() // reference held by child on its parent d
-	d.vfsd.InsertChild(&child.vfsd, name)
-	if d.fs.opts.interop != InteropModeShared {
-		d.touchCMtime(ctx)
-		delete(d.negativeChildren, name)
+	d.cacheNewChildLocked(child, name)
+	if d.cachedMetadataAuthoritative() {
+		d.touchCMtime()
 		d.dirents = nil
 	}
 
 	// Finally, construct a file description representing the created file.
 	var childVFSFD *vfs.FileDescription
-	mnt.IncRef()
 	if useRegularFileFD {
 		fd := &regularFileFD{}
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
@@ -796,17 +996,16 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		childVFSFD = &fd.vfsfd
 	} else {
-		fd := &specialFileFD{
-			handle: handle{
-				file: openFile,
-				fd:   -1,
-			},
+		h := handle{
+			file: openFile,
+			fd:   -1,
 		}
 		if fdobj != nil {
-			fd.handle.fd = int32(fdobj.Release())
+			h.fd = int32(fdobj.Release())
 		}
-		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
-			fd.handle.close(ctx)
+		fd, err := newSpecialFileFD(h, mnt, child, opts.Flags)
+		if err != nil {
+			h.close(ctx)
 			return nil, err
 		}
 		childVFSFD = &fd.vfsfd
@@ -857,12 +1056,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	defer mnt.EndWrite()
 
 	oldParent := oldParentVD.Dentry().Impl().(*dentry)
-	if fs.opts.interop == InteropModeShared {
+	if !oldParent.cachedMetadataAuthoritative() {
 		if err := oldParent.updateFromGetattr(ctx); err != nil {
 			return err
 		}
 	}
-	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+	if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	vfsObj := rp.VirtualFilesystem()
@@ -870,7 +1069,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	// directory, we need to check for write permission on it.
 	oldParent.dirMu.Lock()
 	defer oldParent.dirMu.Unlock()
-	renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds)
+	renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds)
 	if err != nil {
 		return err
 	}
@@ -878,11 +1077,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		return syserror.ENOENT
 	}
 	if renamed.isDir() {
-		if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) {
+		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
 			return syserror.EINVAL
 		}
 		if oldParent != newParent {
-			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+			if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 				return err
 			}
 		}
@@ -893,7 +1092,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	if oldParent != newParent {
-		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 			return err
 		}
 		newParent.dirMu.Lock()
@@ -902,52 +1101,99 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if newParent.isDeleted() {
 		return syserror.ENOENT
 	}
-	replacedVFSD := newParent.vfsd.Child(newName)
-	var replaced *dentry
-	// This is similar to unlinkAt, except:
-	//
-	// - We revalidate the replaced dentry unconditionally for simplicity.
-	//
-	// - If rp.MustBeDir(), then we need a dentry representing the replaced
-	// file regardless to confirm that it's a directory.
-	if replacedVFSD != nil || rp.MustBeDir() {
-		replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds)
-		if err != nil {
-			return err
-		}
-		if replaced != nil {
-			if replaced.isDir() {
-				if !renamed.isDir() {
-					return syserror.EISDIR
-				}
-			} else {
-				if rp.MustBeDir() || renamed.isDir() {
-					return syserror.ENOTDIR
-				}
+	replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
+	if err != nil {
+		return err
+	}
+	var replacedVFSD *vfs.Dentry
+	if replaced != nil {
+		replacedVFSD = &replaced.vfsd
+		if replaced.isDir() {
+			if !renamed.isDir() {
+				return syserror.EISDIR
 			}
-			replacedVFSD = &replaced.vfsd
 		} else {
-			replacedVFSD = nil
+			if rp.MustBeDir() || renamed.isDir() {
+				return syserror.ENOTDIR
+			}
 		}
 	}
 
 	if oldParent == newParent && oldName == newName {
 		return nil
 	}
-	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
 		return err
 	}
-	if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
-		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
-		return err
+
+	// Update the remote filesystem.
+	if !renamed.isSynthetic() {
+		if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+			return err
+		}
+	} else if replaced != nil && !replaced.isSynthetic() {
+		// We are replacing an existing real file with a synthetic one, so we
+		// need to unlink the former.
+		flags := uint32(0)
+		if replaced.isDir() {
+			flags = linux.AT_REMOVEDIR
+		}
+		if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil {
+			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+			return err
+		}
+	}
+
+	// Update the dentry tree.
+	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+	if replaced != nil {
+		replaced.setDeleted()
+		if replaced.isSynthetic() {
+			newParent.syntheticChildren--
+			replaced.decRefLocked()
+		}
+		ds = appendDentry(ds, replaced)
 	}
-	if fs.opts.interop != InteropModeShared {
-		oldParent.cacheNegativeChildLocked(oldName)
+	oldParent.cacheNegativeLookupLocked(oldName)
+	// We don't use newParent.cacheNewChildLocked() since we don't want to mess
+	// with reference counts and queue oldParent for checkCachingLocked if the
+	// parent isn't actually changing.
+	if oldParent != newParent {
+		ds = appendDentry(ds, oldParent)
+		newParent.IncRef()
+		if renamed.isSynthetic() {
+			oldParent.syntheticChildren--
+			newParent.syntheticChildren++
+		}
+	}
+	renamed.parent = newParent
+	renamed.name = newName
+	if newParent.children == nil {
+		newParent.children = make(map[string]*dentry)
+	}
+	newParent.children[newName] = renamed
+
+	// Update metadata.
+	if renamed.cachedMetadataAuthoritative() {
+		renamed.touchCtime()
+	}
+	if oldParent.cachedMetadataAuthoritative() {
 		oldParent.dirents = nil
-		delete(newParent.negativeChildren, newName)
+		oldParent.touchCMtime()
+		if renamed.isDir() {
+			oldParent.decLinks()
+		}
+	}
+	if newParent.cachedMetadataAuthoritative() {
 		newParent.dirents = nil
+		newParent.touchCMtime()
+		if renamed.isDir() {
+			newParent.incLinks()
+		}
 	}
-	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
 	return nil
 }
 
@@ -994,6 +1240,10 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if err != nil {
 		return linux.Statfs{}, err
 	}
+	// If d is synthetic, invoke statfs on the first ancestor of d that isn't.
+	for d.isSynthetic() {
+		d = d.parent
+	}
 	fsstat, err := d.file.statFS(ctx)
 	if err != nil {
 		return linux.Statfs{}, err
@@ -1023,7 +1273,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		creds := rp.Credentials()
 		_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 		return err
-	})
+	}, nil)
 }
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
@@ -1031,8 +1281,34 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return fs.unlinkAt(ctx, rp, false /* dir */)
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(&ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	if d.isSocket() {
+		if !d.isSynthetic() {
+			d.IncRef()
+			return &endpoint{
+				dentry: d,
+				file:   d.file.file,
+				path:   opts.Addr,
+			}, nil
+		}
+		return d.endpoint, nil
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
@@ -1040,11 +1316,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 	if err != nil {
 		return nil, err
 	}
-	return d.listxattr(ctx)
+	return d.listxattr(ctx, rp.Credentials(), size)
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
@@ -1052,7 +1328,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam
 	if err != nil {
 		return "", err
 	}
-	return d.getxattr(ctx, name)
+	return d.getxattr(ctx, rp.Credentials(), &opts)
 }
 
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
@@ -1064,7 +1340,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	if err != nil {
 		return err
 	}
-	return d.setxattr(ctx, &opts)
+	return d.setxattr(ctx, rp.Credentials(), &opts)
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
@@ -1076,12 +1352,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 	if err != nil {
 		return err
 	}
-	return d.removexattr(ctx, name)
+	return d.removexattr(ctx, rp.Credentials(), name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.renameMu.RLock()
 	defer fs.renameMu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index d0552bd99..3f3bd56f0 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -27,13 +27,15 @@
 //             dentry.handleMu
 //               dentry.dataMu
 //
-// Locking dentry.dirMu in multiple dentries requires holding
-// filesystem.renameMu for writing.
+// Locking dentry.dirMu in multiple dentries requires that either ancestor
+// dentries are locked before descendant dentries, or that filesystem.renameMu
+// is locked for writing.
 package gofer
 
 import (
 	"fmt"
 	"strconv"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"syscall"
@@ -44,14 +46,20 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// Name is the default filesystem name.
+const Name = "9p"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
@@ -64,16 +72,17 @@ type filesystem struct {
 	mfp pgalloc.MemoryFileProvider
 
 	// Immutable options.
-	opts filesystemOptions
+	opts  filesystemOptions
+	iopts InternalFilesystemOptions
 
 	// client is the client used by this filesystem. client is immutable.
 	client *p9.Client
 
-	// uid and gid are the effective KUID and KGID of the filesystem's creator,
-	// and are used as the owner and group for files that don't specify one.
-	// uid and gid are immutable.
-	uid auth.KUID
-	gid auth.KGID
+	// clock is a realtime clock used to set timestamps in file operations.
+	clock ktime.Clock
+
+	// devMinor is the filesystem's minor device number. devMinor is immutable.
+	devMinor uint32
 
 	// renameMu serves two purposes:
 	//
@@ -94,11 +103,12 @@ type filesystem struct {
 	cachedDentries    dentryList
 	cachedDentriesLen uint64
 
-	// dentries contains all dentries in this filesystem. specialFileFDs
-	// contains all open specialFileFDs. These fields are protected by syncMu.
-	syncMu         sync.Mutex
-	dentries       map[*dentry]struct{}
-	specialFileFDs map[*specialFileFD]struct{}
+	// syncableDentries contains all dentries in this filesystem for which
+	// !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
+	// These fields are protected by syncMu.
+	syncMu           sync.Mutex
+	syncableDentries map[*dentry]struct{}
+	specialFileFDs   map[*specialFileFD]struct{}
 }
 
 type filesystemOptions struct {
@@ -106,6 +116,8 @@ type filesystemOptions struct {
 	fd      int
 	aname   string
 	interop InteropMode // derived from the "cache" mount option
+	dfltuid auth.KUID
+	dfltgid auth.KGID
 	msize   uint32
 	version string
 
@@ -127,9 +139,12 @@ type filesystemOptions struct {
 
 	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
 	// filesystem may not be coherent with writable host FDs opened later, so
-	// mappings of the former must be replaced by mappings of the latter. This
-	// is usually only the case when the remote filesystem is an overlayfs
-	// mount on Linux < 4.19.
+	// all uses of the former must be replaced by uses of the latter. This is
+	// usually only the case when the remote filesystem is a Linux overlayfs
+	// mount. (Prior to Linux 4.18, patch series centered on commit
+	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
+	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
+	// series, only memory mappings are incoherent.)
 	overlayfsStaleRead bool
 
 	// If regularFilesUseSpecialFileFD is true, application FDs representing
@@ -179,7 +194,8 @@ const (
 	// InteropModeShared is appropriate when there are users of the remote
 	// filesystem that may mutate its state other than the client.
 	//
-	// - The client must verify cached filesystem state before using it.
+	// - The client must verify ("revalidate") cached filesystem state before
+	// using it.
 	//
 	// - Client changes to filesystem state must be sent to the remote
 	// filesystem synchronously.
@@ -196,6 +212,34 @@ const (
 	InteropModeShared
 )
 
+// InternalFilesystemOptions may be passed as
+// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+type InternalFilesystemOptions struct {
+	// If LeakConnection is true, do not close the connection to the server
+	// when the Filesystem is released. This is necessary for deployments in
+	// which servers can handle only a single client and report failure if that
+	// client disconnects.
+	LeakConnection bool
+
+	// If OpenSocketsByConnecting is true, silently translate attempts to open
+	// files identifying as sockets to connect RPCs.
+	OpenSocketsByConnecting bool
+}
+
+// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
+// UIDs and GIDs used for files that do not provide a specific owner or group
+// respectively.
+const (
+	// uint32(-2) doesn't work in Go.
+	_V9FS_DEFUID = auth.KUID(4294967294)
+	_V9FS_DEFGID = auth.KGID(4294967294)
+)
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -276,6 +320,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		}
 	}
 
+	// Parse the default UID and GID.
+	fsopts.dfltuid = _V9FS_DEFUID
+	if dfltuidstr, ok := mopts["dfltuid"]; ok {
+		delete(mopts, "dfltuid")
+		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
+		// in the caller's user namespace, but goferfs isn't
+		// application-mountable.
+		fsopts.dfltuid = auth.KUID(dfltuid)
+	}
+	fsopts.dfltgid = _V9FS_DEFGID
+	if dfltgidstr, ok := mopts["dfltgid"]; ok {
+		delete(mopts, "dfltgid")
+		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.dfltgid = auth.KGID(dfltgid)
+	}
+
 	// Parse the 9P message size.
 	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
 	if msizestr, ok := mopts["msize"]; ok {
@@ -329,6 +398,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		return nil, nil, syserror.EINVAL
 	}
 
+	// Handle internal options.
+	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+	if opts.InternalData != nil && !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData)
+		return nil, nil, syserror.EINVAL
+	}
+	// If !ok, iopts being the zero value is correct.
+
 	// Establish a connection with the server.
 	conn, err := unet.NewSocket(fsopts.fd)
 	if err != nil {
@@ -362,16 +439,23 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	}
 
 	// Construct the filesystem object.
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		attachFile.close(ctx)
+		client.Close()
+		return nil, nil, err
+	}
 	fs := &filesystem{
-		mfp:            mfp,
-		opts:           fsopts,
-		uid:            creds.EffectiveKUID,
-		gid:            creds.EffectiveKGID,
-		client:         client,
-		dentries:       make(map[*dentry]struct{}),
-		specialFileFDs: make(map[*specialFileFD]struct{}),
+		mfp:              mfp,
+		opts:             fsopts,
+		iopts:            iopts,
+		client:           client,
+		clock:            ktime.RealtimeClockFromContext(ctx),
+		devMinor:         devMinor,
+		syncableDentries: make(map[*dentry]struct{}),
+		specialFileFDs:   make(map[*specialFileFD]struct{}),
 	}
-	fs.vfsfs.Init(vfsObj, fs)
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
 	// Construct the root dentry.
 	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
@@ -395,7 +479,7 @@ func (fs *filesystem) Release() {
 	mf := fs.mfp.MemoryFile()
 
 	fs.syncMu.Lock()
-	for d := range fs.dentries {
+	for d := range fs.syncableDentries {
 		d.handleMu.Lock()
 		d.dataMu.Lock()
 		if d.handleWritable {
@@ -421,8 +505,12 @@ func (fs *filesystem) Release() {
 	// fs.
 	fs.syncMu.Unlock()
 
-	// Close the connection to the server. This implicitly clunks all fids.
-	fs.client.Close()
+	if !fs.iopts.LeakConnection {
+		// Close the connection to the server. This implicitly clunks all fids.
+		fs.client.Close()
+	}
+
+	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
 }
 
 // dentry implements vfs.DentryImpl.
@@ -430,16 +518,33 @@ type dentry struct {
 	vfsd vfs.Dentry
 
 	// refs is the reference count. Each dentry holds a reference on its
-	// parent, even if disowned. refs is accessed using atomic memory
-	// operations.
+	// parent, even if disowned. An additional reference is held on all
+	// synthetic dentries until they are unlinked or invalidated. When refs
+	// reaches 0, the dentry may be added to the cache or destroyed. If refs ==
+	// -1, the dentry has already been destroyed. refs is accessed using atomic
+	// memory operations.
 	refs int64
 
 	// fs is the owning filesystem. fs is immutable.
 	fs *filesystem
 
+	// parent is this dentry's parent directory. Each dentry holds a reference
+	// on its parent. If this dentry is a filesystem root, parent is nil.
+	// parent is protected by filesystem.renameMu.
+	parent *dentry
+
+	// name is the name of this dentry in its parent. If this dentry is a
+	// filesystem root, name is the empty string. name is protected by
+	// filesystem.renameMu.
+	name string
+
 	// We don't support hard links, so each dentry maps 1:1 to an inode.
 
 	// file is the unopened p9.File that backs this dentry. file is immutable.
+	//
+	// If file.isNil(), this dentry represents a synthetic file, i.e. a file
+	// that does not exist on the remote filesystem. As of this writing, the
+	// only files that can be synthetic are sockets, pipes, and directories.
 	file p9file
 
 	// If deleted is non-zero, the file represented by this dentry has been
@@ -454,15 +559,26 @@ type dentry struct {
 
 	dirMu sync.Mutex
 
-	// If this dentry represents a directory, and InteropModeShared is not in
-	// effect, negativeChildren is a set of child names in this directory that
-	// are known not to exist. negativeChildren is protected by dirMu.
-	negativeChildren map[string]struct{}
-
-	// If this dentry represents a directory, InteropModeShared is not in
-	// effect, and dirents is not nil, it is a cache of all entries in the
-	// directory, in the order they were returned by the server. dirents is
-	// protected by dirMu.
+	// If this dentry represents a directory, children contains:
+	//
+	// - Mappings of child filenames to dentries representing those children.
+	//
+	// - Mappings of child filenames that are known not to exist to nil
+	// dentries (only if InteropModeShared is not in effect and the directory
+	// is not synthetic).
+	//
+	// children is protected by dirMu.
+	children map[string]*dentry
+
+	// If this dentry represents a directory, syntheticChildren is the number
+	// of child dentries for which dentry.isSynthetic() == true.
+	// syntheticChildren is protected by dirMu.
+	syntheticChildren int
+
+	// If this dentry represents a directory,
+	// dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it
+	// is a cache of all entries in the directory, in the order they were
+	// returned by the server. dirents is protected by dirMu.
 	dirents []vfs.Dirent
 
 	// Cached metadata; protected by metadataMu and accessed using atomic
@@ -482,6 +598,11 @@ type dentry struct {
 	// locked to mutate it).
 	size uint64
 
+	// nlink counts the number of hard links to this dentry. It's updated and
+	// accessed using atomic operations. It's not protected by metadataMu like the
+	// other metadata fields.
+	nlink uint32
+
 	mapsMu sync.Mutex
 
 	// If this dentry represents a regular file, mappings tracks mappings of
@@ -533,6 +654,14 @@ type dentry struct {
 	// and target are protected by dataMu.
 	haveTarget bool
 	target     string
+
+	// If this dentry represents a synthetic socket file, endpoint is the
+	// transport endpoint bound to this file.
+	endpoint transport.BoundEndpoint
+
+	// If this dentry represents a synthetic named pipe, pipe is the pipe
+	// endpoint bound to this file.
+	pipe *pipe.VFSPipe
 }
 
 // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
@@ -554,6 +683,8 @@ func dentryAttrMask() p9.AttrMask {
 // initially has no references, but is not cached; it is the caller's
 // responsibility to set the dentry's reference count and/or call
 // dentry.checkCachingLocked() as appropriate.
+//
+// Preconditions: !file.isNil().
 func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
 	if !mask.Mode {
 		ctx.Warningf("can't create gofer.dentry without file type")
@@ -569,8 +700,8 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 		file:      file,
 		ino:       qid.Path,
 		mode:      uint32(attr.Mode),
-		uid:       uint32(fs.uid),
-		gid:       uint32(fs.gid),
+		uid:       uint32(fs.opts.dfltuid),
+		gid:       uint32(fs.opts.dfltgid),
 		blockSize: usermem.PageSize,
 		handle: handle{
 			fd: -1,
@@ -578,10 +709,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	}
 	d.pf.dentry = d
 	if mask.UID {
-		d.uid = uint32(attr.UID)
+		d.uid = dentryUIDFromP9UID(attr.UID)
 	}
 	if mask.GID {
-		d.gid = uint32(attr.GID)
+		d.gid = dentryGIDFromP9GID(attr.GID)
 	}
 	if mask.Size {
 		d.size = attr.Size
@@ -601,14 +732,25 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	if mask.BTime {
 		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
 	}
+	if mask.NLink {
+		d.nlink = uint32(attr.NLink)
+	}
 	d.vfsd.Init(d)
 
 	fs.syncMu.Lock()
-	fs.dentries[d] = struct{}{}
+	fs.syncableDentries[d] = struct{}{}
 	fs.syncMu.Unlock()
 	return d, nil
 }
 
+func (d *dentry) isSynthetic() bool {
+	return d.file.isNil()
+}
+
+func (d *dentry) cachedMetadataAuthoritative() bool {
+	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
+}
+
 // updateFromP9Attrs is called to update d's metadata after an update from the
 // remote filesystem.
 func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
@@ -621,10 +763,10 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 		atomic.StoreUint32(&d.mode, uint32(attr.Mode))
 	}
 	if mask.UID {
-		atomic.StoreUint32(&d.uid, uint32(attr.UID))
+		atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID))
 	}
 	if mask.GID {
-		atomic.StoreUint32(&d.gid, uint32(attr.GID))
+		atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID))
 	}
 	// There is no P9_GETATTR_* bit for I/O block size.
 	if attr.BlockSize != 0 {
@@ -642,6 +784,9 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 	if mask.BTime {
 		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
 	}
+	if mask.NLink {
+		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
+	}
 	if mask.Size {
 		d.dataMu.Lock()
 		atomic.StoreUint64(&d.size, attr.Size)
@@ -650,6 +795,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 	d.metadataMu.Unlock()
 }
 
+// Preconditions: !d.isSynthetic()
 func (d *dentry) updateFromGetattr(ctx context.Context) error {
 	// Use d.handle.file, which represents a 9P fid that has been opened, in
 	// preference to d.file, which represents a 9P fid that has not. This may
@@ -684,10 +830,7 @@ func (d *dentry) fileType() uint32 {
 func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
 	stat.Blksize = atomic.LoadUint32(&d.blockSize)
-	stat.Nlink = 1
-	if d.isDir() {
-		stat.Nlink = 2
-	}
+	stat.Nlink = atomic.LoadUint32(&d.nlink)
 	stat.UID = atomic.LoadUint32(&d.uid)
 	stat.GID = atomic.LoadUint32(&d.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
@@ -700,7 +843,8 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
 	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
 	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
-	// TODO(jamieliu): device number
+	stat.DevMajor = linux.UNNAMED_MAJOR
+	stat.DevMinor = d.fs.devMinor
 }
 
 func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
@@ -710,7 +854,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
-	if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
 		return err
 	}
 	if err := mnt.CheckBeginWrite(); err != nil {
@@ -719,53 +864,63 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	defer mnt.EndWrite()
 	setLocalAtime := false
 	setLocalMtime := false
-	if d.fs.opts.interop != InteropModeShared {
+	if d.cachedMetadataAuthoritative() {
 		// Timestamp updates will be handled locally.
 		setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
 		setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
 		stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME
-		if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) {
-			// Truncate updates mtime.
-			setLocalMtime = true
-			stat.Mtime.Nsec = linux.UTIME_NOW
+
+		// Prepare for truncate.
+		if stat.Mask&linux.STATX_SIZE != 0 {
+			switch d.mode & linux.S_IFMT {
+			case linux.S_IFREG:
+				if !setLocalMtime {
+					// Truncate updates mtime.
+					setLocalMtime = true
+					stat.Mtime.Nsec = linux.UTIME_NOW
+				}
+			case linux.S_IFDIR:
+				return syserror.EISDIR
+			default:
+				return syserror.EINVAL
+			}
 		}
 	}
 	d.metadataMu.Lock()
 	defer d.metadataMu.Unlock()
-	if stat.Mask != 0 {
-		if err := d.file.setAttr(ctx, p9.SetAttrMask{
-			Permissions:        stat.Mask&linux.STATX_MODE != 0,
-			UID:                stat.Mask&linux.STATX_UID != 0,
-			GID:                stat.Mask&linux.STATX_GID != 0,
-			Size:               stat.Mask&linux.STATX_SIZE != 0,
-			ATime:              stat.Mask&linux.STATX_ATIME != 0,
-			MTime:              stat.Mask&linux.STATX_MTIME != 0,
-			ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW,
-			MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW,
-		}, p9.SetAttr{
-			Permissions:      p9.FileMode(stat.Mode),
-			UID:              p9.UID(stat.UID),
-			GID:              p9.GID(stat.GID),
-			Size:             stat.Size,
-			ATimeSeconds:     uint64(stat.Atime.Sec),
-			ATimeNanoSeconds: uint64(stat.Atime.Nsec),
-			MTimeSeconds:     uint64(stat.Mtime.Sec),
-			MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
-		}); err != nil {
-			return err
+	if !d.isSynthetic() {
+		if stat.Mask != 0 {
+			if err := d.file.setAttr(ctx, p9.SetAttrMask{
+				Permissions:        stat.Mask&linux.STATX_MODE != 0,
+				UID:                stat.Mask&linux.STATX_UID != 0,
+				GID:                stat.Mask&linux.STATX_GID != 0,
+				Size:               stat.Mask&linux.STATX_SIZE != 0,
+				ATime:              stat.Mask&linux.STATX_ATIME != 0,
+				MTime:              stat.Mask&linux.STATX_MTIME != 0,
+				ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW,
+				MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW,
+			}, p9.SetAttr{
+				Permissions:      p9.FileMode(stat.Mode),
+				UID:              p9.UID(stat.UID),
+				GID:              p9.GID(stat.GID),
+				Size:             stat.Size,
+				ATimeSeconds:     uint64(stat.Atime.Sec),
+				ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+				MTimeSeconds:     uint64(stat.Mtime.Sec),
+				MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+			}); err != nil {
+				return err
+			}
+		}
+		if d.fs.opts.interop == InteropModeShared {
+			// There's no point to updating d's metadata in this case since
+			// it'll be overwritten by revalidation before the next time it's
+			// used anyway. (InteropModeShared inhibits client caching of
+			// regular file data, so there's no cache to truncate either.)
+			return nil
 		}
 	}
-	if d.fs.opts.interop == InteropModeShared {
-		// There's no point to updating d's metadata in this case since it'll
-		// be overwritten by revalidation before the next time it's used
-		// anyway. (InteropModeShared inhibits client caching of regular file
-		// data, so there's no cache to truncate either.)
-		return nil
-	}
-	now, haveNow := nowFromContext(ctx)
-	if !haveNow {
-		ctx.Warningf("gofer.dentry.setStat: current time not available")
-	}
+	now := d.fs.clock.Now().Nanoseconds()
 	if stat.Mask&linux.STATX_MODE != 0 {
 		atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
 	}
@@ -777,25 +932,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	}
 	if setLocalAtime {
 		if stat.Atime.Nsec == linux.UTIME_NOW {
-			if haveNow {
-				atomic.StoreInt64(&d.atime, now)
-			}
+			atomic.StoreInt64(&d.atime, now)
 		} else {
 			atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
 		}
 	}
 	if setLocalMtime {
 		if stat.Mtime.Nsec == linux.UTIME_NOW {
-			if haveNow {
-				atomic.StoreInt64(&d.mtime, now)
-			}
+			atomic.StoreInt64(&d.mtime, now)
 		} else {
 			atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
 		}
 	}
-	if haveNow {
-		atomic.StoreInt64(&d.ctime, now)
-	}
+	atomic.StoreInt64(&d.ctime, now)
 	if stat.Mask&linux.STATX_SIZE != 0 {
 		d.dataMu.Lock()
 		oldSize := d.size
@@ -807,8 +956,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		// so we can't race with Write or another truncate.)
 		d.dataMu.Unlock()
 		if d.size < oldSize {
-			oldpgend := pageRoundUp(oldSize)
-			newpgend := pageRoundUp(d.size)
+			oldpgend, _ := usermem.PageRoundUp(oldSize)
+			newpgend, _ := usermem.PageRoundUp(d.size)
 			if oldpgend != newpgend {
 				d.mapsMu.Lock()
 				d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
@@ -832,8 +981,22 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 	return nil
 }
 
-func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+func dentryUIDFromP9UID(uid p9.UID) uint32 {
+	if !uid.Ok() {
+		return uint32(auth.OverflowUID)
+	}
+	return uint32(uid)
+}
+
+func dentryGIDFromP9GID(gid p9.GID) uint32 {
+	if !gid.Ok() {
+		return uint32(auth.OverflowGID)
+	}
+	return uint32(gid)
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
@@ -847,7 +1010,7 @@ func (d *dentry) IncRef() {
 func (d *dentry) TryIncRef() bool {
 	for {
 		refs := atomic.LoadInt64(&d.refs)
-		if refs == 0 {
+		if refs <= 0 {
 			return false
 		}
 		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
@@ -867,16 +1030,44 @@ func (d *dentry) DecRef() {
 	}
 }
 
+// decRefLocked decrements d's reference count without calling
+// d.checkCachingLocked, even if d's reference count reaches 0; callers are
+// responsible for ensuring that d.checkCachingLocked will be called later.
+func (d *dentry) decRefLocked() {
+	if refs := atomic.AddInt64(&d.refs, -1); refs < 0 {
+		panic("gofer.dentry.decRefLocked() called without holding a reference")
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+	return nil
+}
+
 // checkCachingLocked should be called after d's reference count becomes 0 or it
 // becomes disowned.
 //
+// It may be called on a destroyed dentry. For example,
+// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
+// for the same dentry when the dentry is visited more than once in the same
+// operation. One of the calls may destroy the dentry, so subsequent calls will
+// do nothing.
+//
 // Preconditions: d.fs.renameMu must be locked for writing.
 func (d *dentry) checkCachingLocked() {
 	// Dentries with a non-zero reference count must be retained. (The only way
 	// to obtain a reference on a dentry with zero references is via path
 	// resolution, which requires renameMu, so if d.refs is zero then it will
 	// remain zero while we hold renameMu for writing.)
-	if atomic.LoadInt64(&d.refs) != 0 {
+	refs := atomic.LoadInt64(&d.refs)
+	if refs > 0 {
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
 			d.fs.cachedDentriesLen--
@@ -884,9 +1075,13 @@ func (d *dentry) checkCachingLocked() {
 		}
 		return
 	}
-	// Non-child dentries with zero references are no longer reachable by path
-	// resolution and should be dropped immediately.
-	if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() {
+	if refs == -1 {
+		// Dentry has already been destroyed.
+		return
+	}
+	// Deleted and invalidated dentries with zero references are no longer
+	// reachable by path resolution and should be dropped immediately.
+	if d.vfsd.IsDead() {
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
 			d.fs.cachedDentriesLen--
@@ -911,34 +1106,45 @@ func (d *dentry) checkCachingLocked() {
 		d.fs.cachedDentries.Remove(victim)
 		d.fs.cachedDentriesLen--
 		victim.cached = false
-		// victim.refs may have become non-zero from an earlier path
-		// resolution since it was inserted into fs.cachedDentries; see
-		// dentry.incRefLocked(). Either way, we brought
-		// fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so
-		// we don't loop.
+		// victim.refs may have become non-zero from an earlier path resolution
+		// since it was inserted into fs.cachedDentries.
 		if atomic.LoadInt64(&victim.refs) == 0 {
-			if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil {
-				victimParent := victimParentVFSD.Impl().(*dentry)
-				victimParent.dirMu.Lock()
-				if !victim.vfsd.IsDisowned() {
-					// victim can't be a mount point (in any mount
-					// namespace), since VFS holds references on mount
-					// points.
-					d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd)
+			if victim.parent != nil {
+				victim.parent.dirMu.Lock()
+				if !victim.vfsd.IsDead() {
+					// Note that victim can't be a mount point (in any mount
+					// namespace), since VFS holds references on mount points.
+					d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd)
+					delete(victim.parent.children, victim.name)
 					// We're only deleting the dentry, not the file it
 					// represents, so we don't need to update
 					// victimParent.dirents etc.
 				}
-				victimParent.dirMu.Unlock()
+				victim.parent.dirMu.Unlock()
 			}
 			victim.destroyLocked()
 		}
+		// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
+		// back down to fs.opts.maxCachedDentries, so we don't loop.
 	}
 }
 
+// destroyLocked destroys the dentry. It may flushes dirty pages from cache,
+// close p9 file and remove reference on parent dentry.
+//
 // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
 // not a child dentry.
 func (d *dentry) destroyLocked() {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("dentry.destroyLocked() called with references on the dentry")
+	}
+
 	ctx := context.Background()
 	d.handleMu.Lock()
 	if !d.handle.file.isNil() {
@@ -958,17 +1164,20 @@ func (d *dentry) destroyLocked() {
 		d.handle.close(ctx)
 	}
 	d.handleMu.Unlock()
-	d.file.close(ctx)
-	// Remove d from the set of all dentries.
-	d.fs.syncMu.Lock()
-	delete(d.fs.dentries, d)
-	d.fs.syncMu.Unlock()
-	// Drop the reference held by d on its parent.
-	if parentVFSD := d.vfsd.Parent(); parentVFSD != nil {
-		parent := parentVFSD.Impl().(*dentry)
-		// This is parent.DecRef() without recursive locking of d.fs.renameMu.
-		if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 {
-			parent.checkCachingLocked()
+
+	if !d.file.isNil() {
+		d.file.close(ctx)
+		d.file = p9file{}
+		// Remove d from the set of syncable dentries.
+		d.fs.syncMu.Lock()
+		delete(d.fs.syncableDentries, d)
+		d.fs.syncMu.Unlock()
+	}
+	// Drop the reference held by d on its parent without recursively locking
+	// d.fs.renameMu.
+	if d.parent != nil {
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.checkCachingLocked()
 		} else if refs < 0 {
 			panic("gofer.dentry.DecRef() called without holding a reference")
 		}
@@ -983,24 +1192,65 @@ func (d *dentry) setDeleted() {
 	atomic.StoreUint32(&d.deleted, 1)
 }
 
-func (d *dentry) listxattr(ctx context.Context) ([]string, error) {
-	return nil, syserror.ENOTSUP
+// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+// there is no need to expose any other xattrs through a gofer.
+func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+	if d.file.isNil() {
+		return nil, nil
+	}
+	xattrMap, err := d.file.listXattr(ctx, size)
+	if err != nil {
+		return nil, err
+	}
+	xattrs := make([]string, 0, len(xattrMap))
+	for x := range xattrMap {
+		if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+			xattrs = append(xattrs, x)
+		}
+	}
+	return xattrs, nil
 }
 
-func (d *dentry) getxattr(ctx context.Context, name string) (string, error) {
-	// TODO(jamieliu): add vfs.GetxattrOptions.Size
-	return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX)
+func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if d.file.isNil() {
+		return "", syserror.ENODATA
+	}
+	if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+		return "", err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
-func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if d.file.isNil() {
+		return syserror.EPERM
+	}
+	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
-func (d *dentry) removexattr(ctx context.Context, name string) error {
-	return syserror.ENOTSUP
+func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+	if d.file.isNil() {
+		return syserror.EPERM
+	}
+	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	return d.file.removeXattr(ctx, name)
 }
 
-// Preconditions: d.isRegularFile() || d.isDirectory().
+// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory().
 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
 	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
 	// O_TRUNC).
@@ -1042,13 +1292,13 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 				// using the old file descriptor, preventing us from safely
 				// closing it. We could handle this by invalidating existing
 				// memmap.Translations, but this is expensive. Instead, use
-				// dup2() to make the old file descriptor refer to the new file
+				// dup3 to make the old file descriptor refer to the new file
 				// description, then close the new file descriptor (which is no
 				// longer needed). Racing callers may use the old or new file
 				// description, but this doesn't matter since they refer to the
 				// same file (unless d.fs.opts.overlayfsStaleRead is true,
 				// which we handle separately).
-				if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil {
+				if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil {
 					d.handleMu.Unlock()
 					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
 					h.close(ctx)
@@ -1091,6 +1341,26 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 	return nil
 }
 
+// incLinks increments link count.
+//
+// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32.
+func (d *dentry) incLinks() {
+	v := atomic.AddUint32(&d.nlink, 1)
+	if v < 2 {
+		panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v))
+	}
+}
+
+// decLinks decrements link count.
+//
+// Preconditions: d.nlink > 1.
+func (d *dentry) decLinks() {
+	v := atomic.AddUint32(&d.nlink, ^uint32(0))
+	if v == 0 {
+		panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v))
+	}
+}
+
 // fileDescription is embedded by gofer implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
@@ -1109,7 +1379,8 @@ func (fd *fileDescription) dentry() *dentry {
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	d := fd.dentry()
-	if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
+	if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
 		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
 		// available?
 		if err := d.updateFromGetattr(ctx); err != nil {
@@ -1127,21 +1398,21 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 }
 
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) {
-	return fd.dentry().listxattr(ctx)
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
 }
 
 // Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) {
-	return fd.dentry().getxattr(ctx, name)
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
 // Setxattr implements vfs.FileDescriptionImpl.Setxattr.
 func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
-	return fd.dentry().setxattr(ctx, &opts)
+	return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
 // Removexattr implements vfs.FileDescriptionImpl.Removexattr.
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
-	return fd.dentry().removexattr(ctx, name)
+	return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
 }
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
new file mode 100644
index 000000000..adff39490
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync/atomic"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+)
+
+func TestDestroyIdempotent(t *testing.T) {
+	fs := filesystem{
+		syncableDentries: make(map[*dentry]struct{}),
+		opts: filesystemOptions{
+			// Test relies on no dentry being held in the cache.
+			maxCachedDentries: 0,
+		},
+	}
+
+	ctx := contexttest.Context(t)
+	attr := &p9.Attr{
+		Mode: p9.ModeRegular,
+	}
+	mask := p9.AttrMask{
+		Mode: true,
+		Size: true,
+	}
+	parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+	if err != nil {
+		t.Fatalf("fs.newDentry(): %v", err)
+	}
+
+	child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+	if err != nil {
+		t.Fatalf("fs.newDentry(): %v", err)
+	}
+	parent.cacheNewChildLocked(child, "child")
+
+	child.checkCachingLocked()
+	if got := atomic.LoadInt64(&child.refs); got != -1 {
+		t.Fatalf("child.refs=%d, want: -1", got)
+	}
+	// Parent will also be destroyed when child reference is removed.
+	if got := atomic.LoadInt64(&parent.refs); got != -1 {
+		t.Fatalf("parent.refs=%d, want: -1", got)
+	}
+	child.checkCachingLocked()
+	child.checkCachingLocked()
+}
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index cfe66f797..724a3f1f7 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
 )
 
 // handle represents a remote "open file descriptor", consisting of an opened
@@ -77,7 +78,7 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs
 	}
 	if h.fd >= 0 {
 		ctx.UninterruptibleSleepStart(false)
-		n, err := hostPreadv(h.fd, dsts, int64(offset))
+		n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */)
 		ctx.UninterruptibleSleepFinish(false)
 		return n, err
 	}
@@ -103,7 +104,7 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
 	}
 	if h.fd >= 0 {
 		ctx.UninterruptibleSleepStart(false)
-		n, err := hostPwritev(h.fd, srcs, int64(offset))
+		n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */)
 		ctx.UninterruptibleSleepFinish(false)
 		return n, err
 	}
diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go
deleted file mode 100644
index 19560ab26..000000000
--- a/pkg/sentry/fsimpl/gofer/handle_unsafe.go
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package gofer
-
-import (
-	"syscall"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/safemem"
-)
-
-// Preconditions: !dsts.IsEmpty().
-func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) {
-	// No buffering is necessary regardless of safecopy; host syscalls will
-	// return EFAULT if appropriate, instead of raising SIGBUS.
-	if dsts.NumBlocks() == 1 {
-		// Use pread() instead of preadv() to avoid iovec allocation and
-		// copying.
-		dst := dsts.Head()
-		n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0)
-		if e != 0 {
-			return 0, e
-		}
-		return uint64(n), nil
-	}
-	iovs := safemem.IovecsFromBlockSeq(dsts)
-	n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
-	if e != 0 {
-		return 0, e
-	}
-	return uint64(n), nil
-}
-
-// Preconditions: !srcs.IsEmpty().
-func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) {
-	// No buffering is necessary regardless of safecopy; host syscalls will
-	// return EFAULT if appropriate, instead of raising SIGBUS.
-	if srcs.NumBlocks() == 1 {
-		// Use pwrite() instead of pwritev() to avoid iovec allocation and
-		// copying.
-		src := srcs.Head()
-		n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0)
-		if e != 0 {
-			return 0, e
-		}
-		return uint64(n), nil
-	}
-	iovs := safemem.IovecsFromBlockSeq(srcs)
-	n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0)
-	if e != 0 {
-		return 0, e
-	}
-	return uint64(n), nil
-}
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
new file mode 100644
index 000000000..7294de7d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create
+// pipes after sentry initialization due to syscall filters.
+var (
+	tempPipeMu      sync.Mutex
+	tempPipeReadFD  int
+	tempPipeWriteFD int
+	tempPipeBuf     [1]byte
+)
+
+func init() {
+	var pipeFDs [2]int
+	if err := unix.Pipe(pipeFDs[:]); err != nil {
+		panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err))
+	}
+	tempPipeReadFD = pipeFDs[0]
+	tempPipeWriteFD = pipeFDs[1]
+}
+
+func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
+	for {
+		ok, err := nonblockingPipeHasWriter(fd)
+		if err != nil {
+			return err
+		}
+		if ok {
+			return nil
+		}
+		if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+			return err
+		}
+	}
+}
+
+func nonblockingPipeHasWriter(fd int32) (bool, error) {
+	tempPipeMu.Lock()
+	defer tempPipeMu.Unlock()
+	// Copy 1 byte from fd into the temporary pipe.
+	n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK)
+	if err == syserror.EAGAIN {
+		// The pipe represented by fd is empty, but has a writer.
+		return true, nil
+	}
+	if err != nil {
+		return false, err
+	}
+	if n == 0 {
+		// The pipe represented by fd is empty and has no writer.
+		return false, nil
+	}
+	// The pipe represented by fd is non-empty, so it either has, or has
+	// previously had, a writer. Remove the byte copied to the temporary pipe
+	// before returning.
+	if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 {
+		panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err))
+	}
+	return true, nil
+}
+
+func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error {
+	t := time.NewTimer(100 * time.Millisecond)
+	defer t.Stop()
+	cancel := ctx.SleepStart()
+	select {
+	case <-t.C:
+		ctx.SleepFinish(true)
+		return nil
+	case <-cancel:
+		ctx.SleepFinish(false)
+		return syserror.ErrInterrupted
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 755ac2985..87f0b877f 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -85,6 +85,13 @@ func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAt
 	return err
 }
 
+func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+	ctx.UninterruptibleSleepStart(false)
+	xattrs, err := f.file.ListXattr(size)
+	ctx.UninterruptibleSleepFinish(false)
+	return xattrs, err
+}
+
 func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
 	ctx.UninterruptibleSleepStart(false)
 	val, err := f.file.GetXattr(name, size)
@@ -99,6 +106,13 @@ func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32)
 	return err
 }
 
+func (f p9file) removeXattr(ctx context.Context, name string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.RemoveXattr(name)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
 	ctx.UninterruptibleSleepStart(false)
 	err := f.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 54c1031a7..0d10cf7ac 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -104,7 +104,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	putDentryReadWriter(rw)
 	if d.fs.opts.interop != InteropModeShared {
 		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
-		d.touchAtime(ctx, fd.vfsfd.Mount())
+		d.touchAtime(fd.vfsfd.Mount())
 	}
 	return n, err
 }
@@ -126,6 +126,11 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 	if opts.Flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
+	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+	if err != nil {
+		return 0, err
+	}
+	src = src.TakeFirst64(limit)
 
 	d := fd.dentry()
 	d.metadataMu.Lock()
@@ -134,10 +139,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
 		// file_update_time(). This is d.touchCMtime(), but without locking
 		// d.metadataMu (recursively).
-		if now, ok := nowFromContext(ctx); ok {
-			atomic.StoreInt64(&d.mtime, now)
-			atomic.StoreInt64(&d.ctime, now)
-		}
+		d.touchCMtimeLocked()
 	}
 	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
 		// Write dirty cached pages that will be touched by the write back to
@@ -146,9 +148,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 			return 0, err
 		}
 		// Remove touched pages from the cache.
-		pgstart := pageRoundDown(uint64(offset))
-		pgend := pageRoundUp(uint64(offset + src.NumBytes()))
-		if pgend < pgstart {
+		pgstart := usermem.PageRoundDown(uint64(offset))
+		pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
+		if !ok {
 			return 0, syserror.EINVAL
 		}
 		mr := memmap.MappableRange{pgstart, pgend}
@@ -304,9 +306,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
 			if fillCache {
 				// Read into the cache, then re-enter the loop to read from the
 				// cache.
+				gapEnd, _ := usermem.PageRoundUp(gapMR.End)
 				reqMR := memmap.MappableRange{
-					Start: pageRoundDown(gapMR.Start),
-					End:   pageRoundUp(gapMR.End),
+					Start: usermem.PageRoundDown(gapMR.Start),
+					End:   gapEnd,
 				}
 				optMR := gap.Range()
 				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
@@ -361,8 +364,15 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
 	rw.d.handleMu.RLock()
 	if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
 		n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
-		rw.d.handleMu.RUnlock()
 		rw.off += n
+		rw.d.dataMu.Lock()
+		if rw.off > rw.d.size {
+			atomic.StoreUint64(&rw.d.size, rw.off)
+			// The remote file's size will implicitly be extended to the correct
+			// value when we write back to it.
+		}
+		rw.d.dataMu.Unlock()
+		rw.d.handleMu.RUnlock()
 		return n, err
 	}
 
@@ -662,7 +672,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
 
 	// Constrain translations to d.size (rounded up) to prevent translation to
 	// pages that may be concurrently truncated.
-	pgend := pageRoundUp(d.size)
+	pgend, _ := usermem.PageRoundUp(d.size)
 	var beyondEOF bool
 	if required.End > pgend {
 		if required.Start >= pgend {
@@ -809,43 +819,15 @@ type dentryPlatformFile struct {
 // IncRef implements platform.File.IncRef.
 func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
 	d.dataMu.Lock()
-	seg, gap := d.fdRefs.Find(fr.Start)
-	for {
-		switch {
-		case seg.Ok() && seg.Start() < fr.End:
-			seg = d.fdRefs.Isolate(seg, fr)
-			seg.SetValue(seg.Value() + 1)
-			seg, gap = seg.NextNonEmpty()
-		case gap.Ok() && gap.Start() < fr.End:
-			newRange := gap.Range().Intersect(fr)
-			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
-			seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
-		default:
-			d.fdRefs.MergeAdjacent(fr)
-			d.dataMu.Unlock()
-			return
-		}
-	}
+	d.fdRefs.IncRefAndAccount(fr)
+	d.dataMu.Unlock()
 }
 
 // DecRef implements platform.File.DecRef.
 func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
 	d.dataMu.Lock()
-	seg := d.fdRefs.FindSegment(fr.Start)
-
-	for seg.Ok() && seg.Start() < fr.End {
-		seg = d.fdRefs.Isolate(seg, fr)
-		if old := seg.Value(); old == 1 {
-			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
-			seg = d.fdRefs.Remove(seg).NextSegment()
-		} else {
-			seg.SetValue(old - 1)
-			seg = seg.NextSegment()
-		}
-	}
-	d.fdRefs.MergeAdjacent(fr)
+	d.fdRefs.DecRefAndAccount(fr)
 	d.dataMu.Unlock()
-
 }
 
 // MapInternal implements platform.File.MapInternal.
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
new file mode 100644
index 000000000..d6dbe9092
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -0,0 +1,146 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func (d *dentry) isSocket() bool {
+	return d.fileType() == linux.S_IFSOCK
+}
+
+// endpoint is a Gofer-backed transport.BoundEndpoint.
+//
+// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
+// is called and either BoundEndpoint.BidirectionalConnect or
+// BoundEndpoint.UnidirectionalConnect is called.
+type endpoint struct {
+	// dentry is the filesystem dentry which produced this endpoint.
+	dentry *dentry
+
+	// file is the p9 file that contains a single unopened fid.
+	file p9.File
+
+	// path is the sentry path where this endpoint is bound.
+	path string
+}
+
+func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) {
+	switch t {
+	case linux.SOCK_STREAM:
+		return p9.StreamSocket, true
+	case linux.SOCK_SEQPACKET:
+		return p9.SeqpacketSocket, true
+	case linux.SOCK_DGRAM:
+		return p9.DgramSocket, true
+	}
+	return 0, false
+}
+
+// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect.
+func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error {
+	cf, ok := sockTypeToP9(ce.Type())
+	if !ok {
+		return syserr.ErrConnectionRefused
+	}
+
+	// No lock ordering required as only the ConnectingEndpoint has a mutex.
+	ce.Lock()
+
+	// Check connecting state.
+	if ce.Connected() {
+		ce.Unlock()
+		return syserr.ErrAlreadyConnected
+	}
+	if ce.Listening() {
+		ce.Unlock()
+		return syserr.ErrInvalidEndpointState
+	}
+
+	c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue())
+	if err != nil {
+		ce.Unlock()
+		return err
+	}
+
+	returnConnect(c, c)
+	ce.Unlock()
+	if err := c.Init(); err != nil {
+		return syserr.FromError(err)
+	}
+
+	return nil
+}
+
+// UnidirectionalConnect implements
+// transport.BoundEndpoint.UnidirectionalConnect.
+func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) {
+	c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{})
+	if err != nil {
+		return nil, err
+	}
+
+	if err := c.Init(); err != nil {
+		return nil, syserr.FromError(err)
+	}
+
+	// We don't need the receiver.
+	c.CloseRecv()
+	c.Release()
+
+	return c, nil
+}
+
+func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) {
+	hostFile, err := e.file.Connect(flags)
+	if err != nil {
+		return nil, syserr.ErrConnectionRefused
+	}
+	// Dup the fd so that the new endpoint can manage its lifetime.
+	hostFD, err := syscall.Dup(hostFile.FD())
+	if err != nil {
+		log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err)
+		return nil, syserr.FromError(err)
+	}
+	// After duplicating, we no longer need hostFile.
+	hostFile.Close()
+
+	c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path)
+	if serr != nil {
+		log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr)
+		return nil, serr
+	}
+	return c, nil
+}
+
+// Release implements transport.BoundEndpoint.Release.
+func (e *endpoint) Release() {
+	e.dentry.DecRef()
+}
+
+// Passcred implements transport.BoundEndpoint.Passcred.
+func (e *endpoint) Passcred() bool {
+	return false
+}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 08c691c47..ff6126b87 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -19,33 +19,69 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// specialFileFD implements vfs.FileDescriptionImpl for files other than
-// regular files, directories, and symlinks: pipes, sockets, etc. It is also
-// used for regular files when filesystemOptions.specialRegularFiles is in
-// effect. specialFileFD differs from regularFileFD by using per-FD handles
-// instead of shared per-dentry handles, and never buffering I/O.
+// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
+// special files, and (when filesystemOptions.specialRegularFiles is in effect)
+// regular files. specialFileFD differs from regularFileFD by using per-FD
+// handles instead of shared per-dentry handles, and never buffering I/O.
 type specialFileFD struct {
 	fileDescription
 
-	// handle is immutable.
+	// handle is used for file I/O. handle is immutable.
 	handle handle
 
-	// off is the file offset. off is protected by mu. (POSIX 2.9.7 only
-	// requires operations using the file offset to be atomic for regular files
-	// and symlinks; however, since specialFileFD may be used for regular
-	// files, we apply this atomicity unconditionally.)
+	// seekable is true if this file description represents a file for which
+	// file offset is significant, i.e. a regular file. seekable is immutable.
+	seekable bool
+
+	// mayBlock is true if this file description represents a file for which
+	// queue may send I/O readiness events. mayBlock is immutable.
+	mayBlock bool
+	queue    waiter.Queue
+
+	// If seekable is true, off is the file offset. off is protected by mu.
 	mu  sync.Mutex
 	off int64
 }
 
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) {
+	ftype := d.fileType()
+	seekable := ftype == linux.S_IFREG
+	mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK
+	fd := &specialFileFD{
+		handle:   h,
+		seekable: seekable,
+		mayBlock: mayBlock,
+	}
+	if mayBlock && h.fd >= 0 {
+		if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
+			return nil, err
+		}
+	}
+	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:  !seekable,
+		DenyPWrite: !seekable,
+	}); err != nil {
+		if mayBlock && h.fd >= 0 {
+			fdnotifier.RemoveFD(h.fd)
+		}
+		return nil, err
+	}
+	return fd, nil
+}
+
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *specialFileFD) Release() {
+	if fd.mayBlock && fd.handle.fd >= 0 {
+		fdnotifier.RemoveFD(fd.handle.fd)
+	}
 	fd.handle.close(context.Background())
 	fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 	fs.syncMu.Lock()
@@ -61,9 +97,35 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
 	return fd.handle.file.flush(ctx)
 }
 
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if fd.mayBlock {
+		return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
+	}
+	return fd.fileDescription.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	if fd.mayBlock {
+		fd.queue.EventRegister(e, mask)
+		return
+	}
+	fd.fileDescription.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
+	if fd.mayBlock {
+		fd.queue.EventUnregister(e)
+		return
+	}
+	fd.fileDescription.EventUnregister(e)
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	if offset < 0 {
+	if fd.seekable && offset < 0 {
 		return 0, syserror.EINVAL
 	}
 	if opts.Flags != 0 {
@@ -76,10 +138,13 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	// hold here since specialFileFD doesn't client-cache data. Just buffer the
 	// read instead.
 	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
-		d.touchAtime(ctx, fd.vfsfd.Mount())
+		d.touchAtime(fd.vfsfd.Mount())
 	}
 	buf := make([]byte, dst.NumBytes())
 	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
 	if n == 0 {
 		return 0, err
 	}
@@ -91,6 +156,10 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 
 // Read implements vfs.FileDescriptionImpl.Read.
 func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	if !fd.seekable {
+		return fd.PRead(ctx, dst, -1, opts)
+	}
+
 	fd.mu.Lock()
 	n, err := fd.PRead(ctx, dst, fd.off, opts)
 	fd.off += n
@@ -100,16 +169,24 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	if offset < 0 {
+	if fd.seekable && offset < 0 {
 		return 0, syserror.EINVAL
 	}
 	if opts.Flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
+	if fd.seekable {
+		limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+		if err != nil {
+			return 0, err
+		}
+		src = src.TakeFirst64(limit)
+	}
+
 	// Do a buffered write. See rationale in PRead.
 	if d := fd.dentry(); d.fs.opts.interop != InteropModeShared {
-		d.touchCMtime(ctx)
+		d.touchCMtime()
 	}
 	buf := make([]byte, src.NumBytes())
 	// Don't do partial writes if we get a partial read from src.
@@ -117,11 +194,18 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		return 0, err
 	}
 	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
 	return int64(n), err
 }
 
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	if !fd.seekable {
+		return fd.PWrite(ctx, src, -1, opts)
+	}
+
 	fd.mu.Lock()
 	n, err := fd.PWrite(ctx, src, fd.off, opts)
 	fd.off += n
@@ -131,6 +215,9 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if !fd.seekable {
+		return 0, syserror.ESPIPE
+	}
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
 	switch whence {
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
index adf43be60..2ec819f86 100644
--- a/pkg/sentry/fsimpl/gofer/symlink.go
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -27,7 +27,7 @@ func (d *dentry) isSymlink() bool {
 // Precondition: d.isSymlink().
 func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
 	if d.fs.opts.interop != InteropModeShared {
-		d.touchAtime(ctx, mnt)
+		d.touchAtime(mnt)
 		d.dataMu.Lock()
 		if d.haveTarget {
 			target := d.target
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 7598ec6a8..1d5aa82dc 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -18,8 +18,6 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
@@ -38,23 +36,15 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
 	}
 }
 
-func nowFromContext(ctx context.Context) (int64, bool) {
-	if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
-		return clock.Now().Nanoseconds(), true
-	}
-	return 0, false
-}
-
 // Preconditions: fs.interop != InteropModeShared.
-func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
-	if err := mnt.CheckBeginWrite(); err != nil {
+func (d *dentry) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
 		return
 	}
-	now, ok := nowFromContext(ctx)
-	if !ok {
-		mnt.EndWrite()
+	if err := mnt.CheckBeginWrite(); err != nil {
 		return
 	}
+	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
 	atomic.StoreInt64(&d.atime, now)
 	d.metadataMu.Unlock()
@@ -63,13 +53,25 @@ func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) {
 
 // Preconditions: fs.interop != InteropModeShared. The caller has successfully
 // called vfs.Mount.CheckBeginWrite().
-func (d *dentry) touchCMtime(ctx context.Context) {
-	now, ok := nowFromContext(ctx)
-	if !ok {
-		return
-	}
+func (d *dentry) touchCtime() {
+	now := d.fs.clock.Now().Nanoseconds()
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.ctime, now)
+	d.metadataMu.Unlock()
+}
+
+// Preconditions: fs.interop != InteropModeShared. The caller has successfully
+// called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCMtime() {
+	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
 	atomic.StoreInt64(&d.mtime, now)
 	atomic.StoreInt64(&d.ctime, now)
 	d.metadataMu.Unlock()
 }
+
+func (d *dentry) touchCMtimeLocked() {
+	now := d.fs.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&d.mtime, now)
+	atomic.StoreInt64(&d.ctime, now)
+}
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
new file mode 100644
index 000000000..ca0fe6d2b
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -0,0 +1,51 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "host",
+    srcs = [
+        "control.go",
+        "host.go",
+        "ioctl_unsafe.go",
+        "mmap.go",
+        "socket.go",
+        "socket_iovec.go",
+        "socket_unsafe.go",
+        "tty.go",
+        "util.go",
+        "util_unsafe.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fdnotifier",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/safemem",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/hostfd",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/unimpl",
+        "//pkg/sentry/uniqueid",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserr",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/unet",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go
new file mode 100644
index 000000000..b9082a20f
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/control.go
@@ -0,0 +1,96 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+type scmRights struct {
+	fds []int
+}
+
+func newSCMRights(fds []int) control.SCMRightsVFS2 {
+	return &scmRights{fds}
+}
+
+// Files implements control.SCMRights.Files.
+func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFilesVFS2, bool) {
+	n := max
+	var trunc bool
+	if l := len(c.fds); n > l {
+		n = l
+	} else if n < l {
+		trunc = true
+	}
+
+	rf := control.RightsFilesVFS2(fdsToFiles(ctx, c.fds[:n]))
+
+	// Only consume converted FDs (fdsToFiles may convert fewer than n FDs).
+	c.fds = c.fds[len(rf):]
+	return rf, trunc
+}
+
+// Clone implements transport.RightsControlMessage.Clone.
+func (c *scmRights) Clone() transport.RightsControlMessage {
+	// Host rights never need to be cloned.
+	return nil
+}
+
+// Release implements transport.RightsControlMessage.Release.
+func (c *scmRights) Release() {
+	for _, fd := range c.fds {
+		syscall.Close(fd)
+	}
+	c.fds = nil
+}
+
+// If an error is encountered, only files created before the error will be
+// returned. This is what Linux does.
+func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription {
+	files := make([]*vfs.FileDescription, 0, len(fds))
+	for _, fd := range fds {
+		// Get flags. We do it here because they may be modified
+		// by subsequent functions.
+		fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0)
+		if errno != 0 {
+			ctx.Warningf("Error retrieving host FD flags: %v", error(errno))
+			break
+		}
+
+		// Create the file backed by hostFD.
+		file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */)
+		if err != nil {
+			ctx.Warningf("Error creating file from host FD: %v", err)
+			break
+		}
+
+		if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil {
+			ctx.Warningf("Error setting flags on host FD file: %v", err)
+			break
+		}
+
+		files = append(files, file)
+	}
+	return files
+}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
new file mode 100644
index 000000000..18b127521
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -0,0 +1,714 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host provides a filesystem implementation for host files imported as
+// file descriptors.
+package host
+
+import (
+	"fmt"
+	"math"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// NewFDOptions contains options to NewFD.
+type NewFDOptions struct {
+	// If IsTTY is true, the file descriptor is a TTY.
+	IsTTY bool
+
+	// If HaveFlags is true, use Flags for the new file description. Otherwise,
+	// the new file description will inherit flags from hostFD.
+	HaveFlags bool
+	Flags     uint32
+}
+
+// NewFD returns a vfs.FileDescription representing the given host file
+// descriptor. mnt must be Kernel.HostMount().
+func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) {
+	fs, ok := mnt.Filesystem().Impl().(*filesystem)
+	if !ok {
+		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
+	}
+
+	// Retrieve metadata.
+	var s unix.Stat_t
+	if err := unix.Fstat(hostFD, &s); err != nil {
+		return nil, err
+	}
+
+	flags := opts.Flags
+	if !opts.HaveFlags {
+		// Get flags for the imported FD.
+		flagsInt, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0)
+		if err != nil {
+			return nil, err
+		}
+		flags = uint32(flagsInt)
+	}
+
+	fileMode := linux.FileMode(s.Mode)
+	fileType := fileMode.FileType()
+
+	// Determine if hostFD is seekable. If not, this syscall will return ESPIPE
+	// (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
+	// devices.
+	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+	seekable := err != syserror.ESPIPE
+
+	i := &inode{
+		hostFD:     hostFD,
+		ino:        fs.NextIno(),
+		isTTY:      opts.IsTTY,
+		wouldBlock: wouldBlock(uint32(fileType)),
+		seekable:   seekable,
+		canMap:     canMap(uint32(fileType)),
+	}
+	i.pf.inode = i
+
+	// Non-seekable files can't be memory mapped, assert this.
+	if !i.seekable && i.canMap {
+		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
+	}
+
+	// If the hostFD would block, we must set it to non-blocking and handle
+	// blocking behavior in the sentry.
+	if i.wouldBlock {
+		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+			return nil, err
+		}
+		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+			return nil, err
+		}
+	}
+
+	d := &kernfs.Dentry{}
+	d.Init(i)
+
+	// i.open will take a reference on d.
+	defer d.DecRef()
+
+	// For simplicity, fileDescription.offset is set to 0. Technically, we
+	// should only set to 0 on files that are not seekable (sockets, pipes,
+	// etc.), and use the offset from the host fd otherwise when importing.
+	return i.open(ctx, d.VFSDentry(), mnt, flags)
+}
+
+// ImportFD sets up and returns a vfs.FileDescription from a donated fd.
+func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
+	return NewFD(ctx, mnt, hostFD, &NewFDOptions{
+		IsTTY: isTTY,
+	})
+}
+
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("host.filesystemType.GetFilesystem should never be called")
+}
+
+// Name implements FilesystemType.Name.
+func (filesystemType) Name() string {
+	return "none"
+}
+
+// NewFilesystem sets up and returns a new hostfs filesystem.
+//
+// Note that there should only ever be one instance of host.filesystem,
+// a global mount for host fds.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, err
+	}
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
+	return fs.VFSFilesystem(), nil
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+func (fs *filesystem) Release() {
+	fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release()
+}
+
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	d := vd.Dentry().Impl().(*kernfs.Dentry)
+	inode := d.Inode().(*inode)
+	b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino))
+	return vfs.PrependPathSyntheticError{}
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	// When the reference count reaches zero, the host fd is closed.
+	refs.AtomicRefCount
+
+	// hostFD contains the host fd that this file was originally created from,
+	// which must be available at time of restore.
+	//
+	// This field is initialized at creation time and is immutable.
+	hostFD int
+
+	// ino is an inode number unique within this filesystem.
+	//
+	// This field is initialized at creation time and is immutable.
+	ino uint64
+
+	// isTTY is true if this file represents a TTY.
+	//
+	// This field is initialized at creation time and is immutable.
+	isTTY bool
+
+	// seekable is false if the host fd points to a file representing a stream,
+	// e.g. a socket or a pipe. Such files are not seekable and can return
+	// EWOULDBLOCK for I/O operations.
+	//
+	// This field is initialized at creation time and is immutable.
+	seekable bool
+
+	// wouldBlock is true if the host FD would return EWOULDBLOCK for
+	// operations that would block.
+	//
+	// This field is initialized at creation time and is immutable.
+	wouldBlock bool
+
+	// Event queue for blocking operations.
+	queue waiter.Queue
+
+	// canMap specifies whether we allow the file to be memory mapped.
+	//
+	// This field is initialized at creation time and is immutable.
+	canMap bool
+
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex
+
+	// If canMap is true, mappings tracks mappings of hostFD into
+	// memmap.MappingSpaces.
+	mappings memmap.MappingSet
+
+	// pf implements platform.File for mappings of hostFD.
+	pf inodePlatformFile
+}
+
+// CheckPermissions implements kernfs.Inode.
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
+		return err
+	}
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
+}
+
+// Mode implements kernfs.Inode.
+func (i *inode) Mode() linux.FileMode {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
+		// Retrieving the mode from the host fd using fstat(2) should not fail.
+		// If the syscall does not succeed, something is fundamentally wrong.
+		panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err))
+	}
+	return linux.FileMode(s.Mode)
+}
+
+// Stat implements kernfs.Inode.
+func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	if opts.Mask&linux.STATX__RESERVED != 0 {
+		return linux.Statx{}, syserror.EINVAL
+	}
+	if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
+		return linux.Statx{}, syserror.EINVAL
+	}
+
+	fs := vfsfs.Impl().(*filesystem)
+
+	// Limit our host call only to known flags.
+	mask := opts.Mask & linux.STATX_ALL
+	var s unix.Statx_t
+	err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s)
+	if err == syserror.ENOSYS {
+		// Fallback to fstat(2), if statx(2) is not supported on the host.
+		//
+		// TODO(b/151263641): Remove fallback.
+		return i.fstat(fs)
+	}
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	// Unconditionally fill blksize, attributes, and device numbers, as
+	// indicated by /include/uapi/linux/stat.h. Inode number is always
+	// available, since we use our own rather than the host's.
+	ls := linux.Statx{
+		Mask:           linux.STATX_INO,
+		Blksize:        s.Blksize,
+		Attributes:     s.Attributes,
+		Ino:            i.ino,
+		AttributesMask: s.Attributes_mask,
+		DevMajor:       linux.UNNAMED_MAJOR,
+		DevMinor:       fs.devMinor,
+	}
+
+	// Copy other fields that were returned by the host. RdevMajor/RdevMinor
+	// are never copied (and therefore left as zero), so as not to expose host
+	// device numbers.
+	ls.Mask |= s.Mask & linux.STATX_ALL
+	if s.Mask&linux.STATX_TYPE != 0 {
+		ls.Mode |= s.Mode & linux.S_IFMT
+	}
+	if s.Mask&linux.STATX_MODE != 0 {
+		ls.Mode |= s.Mode &^ linux.S_IFMT
+	}
+	if s.Mask&linux.STATX_NLINK != 0 {
+		ls.Nlink = s.Nlink
+	}
+	if s.Mask&linux.STATX_UID != 0 {
+		ls.UID = s.Uid
+	}
+	if s.Mask&linux.STATX_GID != 0 {
+		ls.GID = s.Gid
+	}
+	if s.Mask&linux.STATX_ATIME != 0 {
+		ls.Atime = unixToLinuxStatxTimestamp(s.Atime)
+	}
+	if s.Mask&linux.STATX_BTIME != 0 {
+		ls.Btime = unixToLinuxStatxTimestamp(s.Btime)
+	}
+	if s.Mask&linux.STATX_CTIME != 0 {
+		ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime)
+	}
+	if s.Mask&linux.STATX_MTIME != 0 {
+		ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime)
+	}
+	if s.Mask&linux.STATX_SIZE != 0 {
+		ls.Size = s.Size
+	}
+	if s.Mask&linux.STATX_BLOCKS != 0 {
+		ls.Blocks = s.Blocks
+	}
+
+	return ls, nil
+}
+
+// fstat is a best-effort fallback for inode.Stat() if the host does not
+// support statx(2).
+//
+// We ignore the mask and sync flags in opts and simply supply
+// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification
+// of a mask or sync flags. fstat(2) does not provide any metadata
+// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so
+// those fields remain empty.
+func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
+	var s unix.Stat_t
+	if err := unix.Fstat(i.hostFD, &s); err != nil {
+		return linux.Statx{}, err
+	}
+
+	// As with inode.Stat(), we always use internal device and inode numbers,
+	// and never expose the host's represented device numbers.
+	return linux.Statx{
+		Mask:     linux.STATX_BASIC_STATS,
+		Blksize:  uint32(s.Blksize),
+		Nlink:    uint32(s.Nlink),
+		UID:      s.Uid,
+		GID:      s.Gid,
+		Mode:     uint16(s.Mode),
+		Ino:      i.ino,
+		Size:     uint64(s.Size),
+		Blocks:   uint64(s.Blocks),
+		Atime:    timespecToStatxTimestamp(s.Atim),
+		Ctime:    timespecToStatxTimestamp(s.Ctim),
+		Mtime:    timespecToStatxTimestamp(s.Mtim),
+		DevMajor: linux.UNNAMED_MAJOR,
+		DevMinor: fs.devMinor,
+	}, nil
+}
+
+// SetStat implements kernfs.Inode.
+func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	s := opts.Stat
+
+	m := s.Mask
+	if m == 0 {
+		return nil
+	}
+	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
+		return syserror.EPERM
+	}
+	var hostStat syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &hostStat); err != nil {
+		return err
+	}
+	if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
+		return err
+	}
+
+	if m&linux.STATX_MODE != 0 {
+		if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil {
+			return err
+		}
+	}
+	if m&linux.STATX_SIZE != 0 {
+		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
+			return err
+		}
+		oldSize := uint64(hostStat.Size)
+		if s.Size < oldSize {
+			oldpgend, _ := usermem.PageRoundUp(oldSize)
+			newpgend, _ := usermem.PageRoundUp(s.Size)
+			if oldpgend != newpgend {
+				i.mapsMu.Lock()
+				i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+					// Compare Linux's mm/truncate.c:truncate_setsize() =>
+					// truncate_pagecache() =>
+					// mm/memory.c:unmap_mapping_range(evencows=1).
+					InvalidatePrivate: true,
+				})
+				i.mapsMu.Unlock()
+			}
+		}
+	}
+	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
+		ts := [2]syscall.Timespec{
+			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
+			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
+		}
+		if err := setTimestamps(i.hostFD, &ts); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// DecRef implements kernfs.Inode.
+func (i *inode) DecRef() {
+	i.AtomicRefCount.DecRefWithDestructor(i.Destroy)
+}
+
+// Destroy implements kernfs.Inode.
+func (i *inode) Destroy() {
+	if i.wouldBlock {
+		fdnotifier.RemoveFD(int32(i.hostFD))
+	}
+	if err := unix.Close(i.hostFD); err != nil {
+		log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
+	}
+}
+
+// Open implements kernfs.Inode.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
+	if i.Mode().FileType() == linux.S_IFSOCK {
+		return nil, syserror.ENXIO
+	}
+	return i.open(ctx, vfsd, rp.Mount(), opts.Flags)
+}
+
+func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
+		return nil, err
+	}
+	fileType := s.Mode & linux.FileTypeMask
+
+	// Constrain flags to a subset we can handle.
+	// TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags.
+	flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
+
+	if fileType == syscall.S_IFSOCK {
+		if i.isTTY {
+			log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
+			return nil, syserror.ENOTTY
+		}
+
+		ep, err := newEndpoint(ctx, i.hostFD, &i.queue)
+		if err != nil {
+			return nil, err
+		}
+		// Currently, we only allow Unix sockets to be imported.
+		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d)
+	}
+
+	// TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
+	// we don't allow importing arbitrary file types without proper support.
+	if i.isTTY {
+		fd := &TTYFileDescription{
+			fileDescription: fileDescription{inode: i},
+			termios:         linux.DefaultSlaveTermios,
+		}
+		vfsfd := &fd.vfsfd
+		if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
+		return vfsfd, nil
+	}
+
+	fd := &fileDescription{inode: i}
+	vfsfd := &fd.vfsfd
+	if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return vfsfd, nil
+}
+
+// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
+	// cached to reduce indirections and casting. fileDescription does not hold
+	// a reference on the inode through the inode field (since one is already
+	// held via the Dentry).
+	//
+	// inode is immutable after fileDescription creation.
+	inode *inode
+
+	// offsetMu protects offset.
+	offsetMu sync.Mutex
+
+	// offset specifies the current file offset. It is only meaningful when
+	// inode.seekable is true.
+	offset int64
+}
+
+// SetStat implements vfs.FileDescriptionImpl.
+func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Release() {
+	// noop
+}
+
+// PRead implements FileDescriptionImpl.
+func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		return 0, syserror.ESPIPE
+	}
+
+	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
+}
+
+// Read implements FileDescriptionImpl.
+func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
+		if isBlockError(err) {
+			// If we got any data at all, return it as a "completed" partial read
+			// rather than retrying until complete.
+			if n != 0 {
+				err = nil
+			} else {
+				err = syserror.ErrWouldBlock
+			}
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	f.offsetMu.Lock()
+	n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
+	f.offset += n
+	f.offsetMu.Unlock()
+	return n, err
+}
+
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
+	if flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
+	n, err := dst.CopyOutFrom(ctx, reader)
+	hostfd.PutReadWriterAt(reader)
+	return int64(n), err
+}
+
+// PWrite implements FileDescriptionImpl.
+func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		return 0, syserror.ESPIPE
+	}
+
+	return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags)
+}
+
+// Write implements FileDescriptionImpl.
+func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags)
+		if isBlockError(err) {
+			err = syserror.ErrWouldBlock
+		}
+		return n, err
+	}
+	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
+	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
+	f.offsetMu.Lock()
+	n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags)
+	f.offset += n
+	f.offsetMu.Unlock()
+	return n, err
+}
+
+func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+	if flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+	writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
+	n, err := src.CopyInTo(ctx, writer)
+	hostfd.PutReadWriterAt(writer)
+	return int64(n), err
+}
+
+// Seek implements FileDescriptionImpl.
+//
+// Note that we do not support seeking on directories, since we do not even
+// allow directory fds to be imported at all.
+func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		return 0, syserror.ESPIPE
+	}
+
+	f.offsetMu.Lock()
+	defer f.offsetMu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset = offset
+
+	case linux.SEEK_CUR:
+		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
+		if offset > math.MaxInt64-f.offset {
+			return f.offset, syserror.EOVERFLOW
+		}
+		if f.offset+offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset += offset
+
+	case linux.SEEK_END:
+		var s syscall.Stat_t
+		if err := syscall.Fstat(i.hostFD, &s); err != nil {
+			return f.offset, err
+		}
+		size := s.Size
+
+		// Check for overflow. Note that underflow cannot occur, since size >= 0.
+		if offset > math.MaxInt64-size {
+			return f.offset, syserror.EOVERFLOW
+		}
+		if size+offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset = size + offset
+
+	case linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Modifying the offset in the host file table should not matter, since
+		// this is the only place where we use it.
+		//
+		// For reading and writing, we always rely on our internal offset.
+		n, err := unix.Seek(i.hostFD, offset, int(whence))
+		if err != nil {
+			return f.offset, err
+		}
+		f.offset = n
+
+	default:
+		// Invalid whence.
+		return f.offset, syserror.EINVAL
+	}
+
+	return f.offset, nil
+}
+
+// Sync implements FileDescriptionImpl.
+func (f *fileDescription) Sync(context.Context) error {
+	// TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything.
+	return unix.Fsync(f.inode.hostFD)
+}
+
+// ConfigureMMap implements FileDescriptionImpl.
+func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
+	if !f.inode.canMap {
+		return syserror.ENODEV
+	}
+	i := f.inode
+	i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init)
+	return vfs.GenericConfigureMMap(&f.vfsfd, i, opts)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	f.inode.queue.EventRegister(e, mask)
+	fdnotifier.UpdateFD(int32(f.inode.hostFD))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (f *fileDescription) EventUnregister(e *waiter.Entry) {
+	f.inode.queue.EventUnregister(e)
+	fdnotifier.UpdateFD(int32(f.inode.hostFD))
+}
+
+// Readiness uses the poll() syscall to check the status of the underlying FD.
+func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
+}
diff --git a/pkg/sentry/fsimpl/host/ioctl_unsafe.go b/pkg/sentry/fsimpl/host/ioctl_unsafe.go
new file mode 100644
index 000000000..0983bf7d8
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/ioctl_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+func ioctlGetTermios(fd int) (*linux.Termios, error) {
+	var t linux.Termios
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &t, nil
+}
+
+func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+func ioctlGetWinsize(fd int) (*linux.Winsize, error) {
+	var w linux.Winsize
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &w, nil
+}
+
+func ioctlSetWinsize(fd int, w *linux.Winsize) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
new file mode 100644
index 000000000..8545a82f0
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -0,0 +1,132 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// inodePlatformFile implements platform.File. It exists solely because inode
+// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef.
+//
+// inodePlatformFile should only be used if inode.canMap is true.
+type inodePlatformFile struct {
+	*inode
+
+	// fdRefsMu protects fdRefs.
+	fdRefsMu sync.Mutex
+
+	// fdRefs counts references on platform.File offsets. It is used solely for
+	// memory accounting.
+	fdRefs fsutil.FrameRefSet
+
+	// fileMapper caches mappings of the host file represented by this inode.
+	fileMapper fsutil.HostFileMapper
+
+	// fileMapperInitOnce is used to lazily initialize fileMapper.
+	fileMapperInitOnce sync.Once
+}
+
+// IncRef implements platform.File.IncRef.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) IncRef(fr platform.FileRange) {
+	i.fdRefsMu.Lock()
+	i.fdRefs.IncRefAndAccount(fr)
+	i.fdRefsMu.Unlock()
+}
+
+// DecRef implements platform.File.DecRef.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) DecRef(fr platform.FileRange) {
+	i.fdRefsMu.Lock()
+	i.fdRefs.DecRefAndAccount(fr)
+	i.fdRefsMu.Unlock()
+}
+
+// MapInternal implements platform.File.MapInternal.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+	return i.fileMapper.MapInternal(fr, i.hostFD, at.Write)
+}
+
+// FD implements platform.File.FD.
+func (i *inodePlatformFile) FD() int {
+	return i.hostFD
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	i.mapsMu.Lock()
+	mapped := i.mappings.AddMapping(ms, ar, offset, writable)
+	for _, r := range mapped {
+		i.pf.fileMapper.IncRefOn(r)
+	}
+	i.mapsMu.Unlock()
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	i.mapsMu.Lock()
+	unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable)
+	for _, r := range unmapped {
+		i.pf.fileMapper.DecRefOn(r)
+	}
+	i.mapsMu.Unlock()
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return i.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	mr := optional
+	return []memmap.Translation{
+		{
+			Source: mr,
+			File:   &i.pf,
+			Offset: mr.Start,
+			Perms:  usermem.AnyAccess,
+		},
+	}, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) InvalidateUnsavable(ctx context.Context) error {
+	// We expect the same host fd across save/restore, so all translations
+	// should be valid.
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
new file mode 100644
index 000000000..38f1fbfba
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -0,0 +1,394 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Create a new host-backed endpoint from the given fd and its corresponding
+// notification queue.
+func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) {
+	// Set up an external transport.Endpoint using the host fd.
+	addr := fmt.Sprintf("hostfd:[%d]", hostFD)
+	e, err := NewConnectedEndpoint(ctx, hostFD, addr, true /* saveable */)
+	if err != nil {
+		return nil, err.ToError()
+	}
+	ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), queue, e, e)
+	return ep, nil
+}
+
+// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
+//
+// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
+const maxSendBufferSize = 8 << 20
+
+// ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and
+// transport.Receiver. It is backed by a host fd that was imported at sentry
+// startup. This fd is shared with a hostfs inode, which retains ownership of
+// it.
+//
+// ConnectedEndpoint is saveable, since we expect that the host will provide
+// the same fd upon restore.
+//
+// As of this writing, we only allow Unix sockets to be imported.
+//
+// +stateify savable
+type ConnectedEndpoint struct {
+	// ref keeps track of references to a ConnectedEndpoint.
+	ref refs.AtomicRefCount
+
+	// mu protects fd below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// fd is the host fd backing this endpoint.
+	fd int
+
+	// addr is the address at which this endpoint is bound.
+	addr string
+
+	// sndbuf is the size of the send buffer.
+	//
+	// N.B. When this is smaller than the host size, we present it via
+	// GetSockOpt and message splitting/rejection in SendMsg, but do not
+	// prevent lots of small messages from filling the real send buffer
+	// size on the host.
+	sndbuf int64 `state:"nosave"`
+
+	// stype is the type of Unix socket.
+	stype linux.SockType
+}
+
+// init performs initialization required for creating new ConnectedEndpoints and
+// for restoring them.
+func (c *ConnectedEndpoint) init() *syserr.Error {
+	family, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+
+	if family != syscall.AF_UNIX {
+		// We only allow Unix sockets.
+		return syserr.ErrInvalidEndpointState
+	}
+
+	stype, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_TYPE)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+
+	if err := syscall.SetNonblock(c.fd, true); err != nil {
+		return syserr.FromError(err)
+	}
+
+	sndbuf, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+	if sndbuf > maxSendBufferSize {
+		log.Warningf("Socket send buffer too large: %d", sndbuf)
+		return syserr.ErrInvalidEndpointState
+	}
+
+	c.stype = linux.SockType(stype)
+	c.sndbuf = int64(sndbuf)
+
+	return nil
+}
+
+// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host fd
+// imported at sentry startup,
+//
+// The caller is responsible for calling Init(). Additionaly, Release needs to
+// be called twice because ConnectedEndpoint is both a transport.Receiver and
+// transport.ConnectedEndpoint.
+func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable bool) (*ConnectedEndpoint, *syserr.Error) {
+	e := ConnectedEndpoint{
+		fd:   hostFD,
+		addr: addr,
+	}
+
+	if err := e.init(); err != nil {
+		return nil, err
+	}
+
+	// AtomicRefCounters start off with a single reference. We need two.
+	e.ref.IncRef()
+	e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+	return &e, nil
+}
+
+// Send implements transport.ConnectedEndpoint.Send.
+func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	if !controlMessages.Empty() {
+		return 0, false, syserr.ErrInvalidEndpointState
+	}
+
+	// Since stream sockets don't preserve message boundaries, we can write
+	// only as much of the message as fits in the send buffer.
+	truncate := c.stype == linux.SOCK_STREAM
+
+	n, totalLen, err := fdWriteVec(c.fd, data, c.sndbuf, truncate)
+	if n < totalLen && err == nil {
+		// The host only returns a short write if it would otherwise
+		// block (and only for stream sockets).
+		err = syserror.EAGAIN
+	}
+	if n > 0 && err != syserror.EAGAIN {
+		// The caller may need to block to send more data, but
+		// otherwise there isn't anything that can be done about an
+		// error with a partial write.
+		err = nil
+	}
+
+	// There is no need for the callee to call SendNotify because fdWriteVec
+	// uses the host's sendmsg(2) and the host kernel's queue.
+	return n, false, syserr.FromError(err)
+}
+
+// SendNotify implements transport.ConnectedEndpoint.SendNotify.
+func (c *ConnectedEndpoint) SendNotify() {}
+
+// CloseSend implements transport.ConnectedEndpoint.CloseSend.
+func (c *ConnectedEndpoint) CloseSend() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if err := syscall.Shutdown(c.fd, syscall.SHUT_WR); err != nil {
+		// A well-formed UDS shutdown can't fail. See
+		// net/unix/af_unix.c:unix_shutdown.
+		panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err))
+	}
+}
+
+// CloseNotify implements transport.ConnectedEndpoint.CloseNotify.
+func (c *ConnectedEndpoint) CloseNotify() {}
+
+// Writable implements transport.ConnectedEndpoint.Writable.
+func (c *ConnectedEndpoint) Writable() bool {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.EventOut)&waiter.EventOut != 0
+}
+
+// Passcred implements transport.ConnectedEndpoint.Passcred.
+func (c *ConnectedEndpoint) Passcred() bool {
+	// We don't support credential passing for host sockets.
+	return false
+}
+
+// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress.
+func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, nil
+}
+
+// EventUpdate implements transport.ConnectedEndpoint.EventUpdate.
+func (c *ConnectedEndpoint) EventUpdate() {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.fd != -1 {
+		fdnotifier.UpdateFD(int32(c.fd))
+	}
+}
+
+// Recv implements transport.Receiver.Recv.
+func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	var cm unet.ControlMessage
+	if numRights > 0 {
+		cm.EnableFDs(int(numRights))
+	}
+
+	// N.B. Unix sockets don't have a receive buffer, the send buffer
+	// serves both purposes.
+	rl, ml, cl, cTrunc, err := fdReadVec(c.fd, data, []byte(cm), peek, c.sndbuf)
+	if rl > 0 && err != nil {
+		// We got some data, so all we need to do on error is return
+		// the data that we got. Short reads are fine, no need to
+		// block.
+		err = nil
+	}
+	if err != nil {
+		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
+	}
+
+	// There is no need for the callee to call RecvNotify because fdReadVec uses
+	// the host's recvmsg(2) and the host kernel's queue.
+
+	// Trim the control data if we received less than the full amount.
+	if cl < uint64(len(cm)) {
+		cm = cm[:cl]
+	}
+
+	// Avoid extra allocations in the case where there isn't any control data.
+	if len(cm) == 0 {
+		return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
+	}
+
+	fds, err := cm.ExtractFDs()
+	if err != nil {
+		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
+	}
+
+	if len(fds) == 0 {
+		return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
+	}
+	return rl, ml, control.NewVFS2(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
+}
+
+// RecvNotify implements transport.Receiver.RecvNotify.
+func (c *ConnectedEndpoint) RecvNotify() {}
+
+// CloseRecv implements transport.Receiver.CloseRecv.
+func (c *ConnectedEndpoint) CloseRecv() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if err := syscall.Shutdown(c.fd, syscall.SHUT_RD); err != nil {
+		// A well-formed UDS shutdown can't fail. See
+		// net/unix/af_unix.c:unix_shutdown.
+		panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err))
+	}
+}
+
+// Readable implements transport.Receiver.Readable.
+func (c *ConnectedEndpoint) Readable() bool {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.EventIn)&waiter.EventIn != 0
+}
+
+// SendQueuedSize implements transport.Receiver.SendQueuedSize.
+func (c *ConnectedEndpoint) SendQueuedSize() int64 {
+	// TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host
+	// sockets because we don't allow the sentry to call ioctl(2).
+	return -1
+}
+
+// RecvQueuedSize implements transport.Receiver.RecvQueuedSize.
+func (c *ConnectedEndpoint) RecvQueuedSize() int64 {
+	// TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host
+	// sockets because we don't allow the sentry to call ioctl(2).
+	return -1
+}
+
+// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize.
+func (c *ConnectedEndpoint) SendMaxQueueSize() int64 {
+	return int64(c.sndbuf)
+}
+
+// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize.
+func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 {
+	// N.B. Unix sockets don't use the receive buffer. We'll claim it is
+	// the same size as the send buffer.
+	return int64(c.sndbuf)
+}
+
+func (c *ConnectedEndpoint) destroyLocked() {
+	c.fd = -1
+}
+
+// Release implements transport.ConnectedEndpoint.Release and
+// transport.Receiver.Release.
+func (c *ConnectedEndpoint) Release() {
+	c.ref.DecRefWithDestructor(func() {
+		c.mu.Lock()
+		c.destroyLocked()
+		c.mu.Unlock()
+	})
+}
+
+// CloseUnread implements transport.ConnectedEndpoint.CloseUnread.
+func (c *ConnectedEndpoint) CloseUnread() {}
+
+// SCMConnectedEndpoint represents an endpoint backed by a host fd that was
+// passed through a gofer Unix socket. It resembles ConnectedEndpoint, with the
+// following differences:
+// - SCMConnectedEndpoint is not saveable, because the host cannot guarantee
+// the same descriptor number across S/R.
+// - SCMConnectedEndpoint holds ownership of its fd and notification queue.
+type SCMConnectedEndpoint struct {
+	ConnectedEndpoint
+
+	queue *waiter.Queue
+}
+
+// Init will do the initialization required without holding other locks.
+func (e *SCMConnectedEndpoint) Init() error {
+	return fdnotifier.AddFD(int32(e.fd), e.queue)
+}
+
+// Release implements transport.ConnectedEndpoint.Release and
+// transport.Receiver.Release.
+func (e *SCMConnectedEndpoint) Release() {
+	e.ref.DecRefWithDestructor(func() {
+		e.mu.Lock()
+		if err := syscall.Close(e.fd); err != nil {
+			log.Warningf("Failed to close host fd %d: %v", err)
+		}
+		fdnotifier.RemoveFD(int32(e.fd))
+		e.destroyLocked()
+		e.mu.Unlock()
+	})
+}
+
+// NewSCMEndpoint creates a new SCMConnectedEndpoint backed by a host fd that
+// was passed through a Unix socket.
+//
+// The caller is responsible for calling Init(). Additionaly, Release needs to
+// be called twice because ConnectedEndpoint is both a transport.Receiver and
+// transport.ConnectedEndpoint.
+func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string) (*SCMConnectedEndpoint, *syserr.Error) {
+	e := SCMConnectedEndpoint{
+		ConnectedEndpoint: ConnectedEndpoint{
+			fd:   hostFD,
+			addr: addr,
+		},
+		queue: queue,
+	}
+
+	if err := e.init(); err != nil {
+		return nil, err
+	}
+
+	// AtomicRefCounters start off with a single reference. We need two.
+	e.ref.IncRef()
+	e.ref.EnableLeakCheck("host.SCMConnectedEndpoint")
+	return &e, nil
+}
diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go
new file mode 100644
index 000000000..584c247d2
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/socket_iovec.go
@@ -0,0 +1,113 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// maxIovs is the maximum number of iovecs to pass to the host.
+var maxIovs = linux.UIO_MAXIOV
+
+// copyToMulti copies as many bytes from src to dst as possible.
+func copyToMulti(dst [][]byte, src []byte) {
+	for _, d := range dst {
+		done := copy(d, src)
+		src = src[done:]
+		if len(src) == 0 {
+			break
+		}
+	}
+}
+
+// copyFromMulti copies as many bytes from src to dst as possible.
+func copyFromMulti(dst []byte, src [][]byte) {
+	for _, s := range src {
+		done := copy(dst, s)
+		dst = dst[done:]
+		if len(dst) == 0 {
+			break
+		}
+	}
+}
+
+// buildIovec builds an iovec slice from the given []byte slice.
+//
+// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error.
+//
+// If length < the total length of bufs, err indicates why, even when returning
+// a truncated iovec.
+//
+// If intermediate != nil, iovecs references intermediate rather than bufs and
+// the caller must copy to/from bufs as necessary.
+func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovecs []syscall.Iovec, intermediate []byte, err error) {
+	var iovsRequired int
+	for _, b := range bufs {
+		length += int64(len(b))
+		if len(b) > 0 {
+			iovsRequired++
+		}
+	}
+
+	stopLen := length
+	if length > maxlen {
+		if truncate {
+			stopLen = maxlen
+			err = syserror.EAGAIN
+		} else {
+			return 0, nil, nil, syserror.EMSGSIZE
+		}
+	}
+
+	if iovsRequired > maxIovs {
+		// The kernel will reject our call if we pass this many iovs.
+		// Use a single intermediate buffer instead.
+		b := make([]byte, stopLen)
+
+		return stopLen, []syscall.Iovec{{
+			Base: &b[0],
+			Len:  uint64(stopLen),
+		}}, b, err
+	}
+
+	var total int64
+	iovecs = make([]syscall.Iovec, 0, iovsRequired)
+	for i := range bufs {
+		l := len(bufs[i])
+		if l == 0 {
+			continue
+		}
+
+		stop := int64(l)
+		if total+stop > stopLen {
+			stop = stopLen - total
+		}
+
+		iovecs = append(iovecs, syscall.Iovec{
+			Base: &bufs[i][0],
+			Len:  uint64(stop),
+		})
+
+		total += stop
+		if total >= stopLen {
+			break
+		}
+	}
+
+	return total, iovecs, nil, err
+}
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
new file mode 100644
index 000000000..35ded24bc
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -0,0 +1,101 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// fdReadVec receives from fd to bufs.
+//
+// If the total length of bufs is > maxlen, fdReadVec will do a partial read
+// and err will indicate why the message was truncated.
+func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (readLen int64, msgLen int64, controlLen uint64, controlTrunc bool, err error) {
+	flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC)
+	if peek {
+		flags |= syscall.MSG_PEEK
+	}
+
+	// Always truncate the receive buffer. All socket types will truncate
+	// received messages.
+	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true)
+	if err != nil && len(iovecs) == 0 {
+		// No partial write to do, return error immediately.
+		return 0, 0, 0, false, err
+	}
+
+	var msg syscall.Msghdr
+	if len(control) != 0 {
+		msg.Control = &control[0]
+		msg.Controllen = uint64(len(control))
+	}
+
+	if len(iovecs) != 0 {
+		msg.Iov = &iovecs[0]
+		msg.Iovlen = uint64(len(iovecs))
+	}
+
+	rawN, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
+	if e != 0 {
+		// N.B. prioritize the syscall error over the buildIovec error.
+		return 0, 0, 0, false, e
+	}
+	n := int64(rawN)
+
+	// Copy data back to bufs.
+	if intermediate != nil {
+		copyToMulti(bufs, intermediate)
+	}
+
+	controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
+
+	if n > length {
+		return length, n, msg.Controllen, controlTrunc, err
+	}
+
+	return n, n, msg.Controllen, controlTrunc, err
+}
+
+// fdWriteVec sends from bufs to fd.
+//
+// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a
+// partial write and err will indicate why the message was truncated.
+func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int64, error) {
+	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate)
+	if err != nil && len(iovecs) == 0 {
+		// No partial write to do, return error immediately.
+		return 0, length, err
+	}
+
+	// Copy data to intermediate buf.
+	if intermediate != nil {
+		copyFromMulti(intermediate, bufs)
+	}
+
+	var msg syscall.Msghdr
+	if len(iovecs) > 0 {
+		msg.Iov = &iovecs[0]
+		msg.Iovlen = uint64(len(iovecs))
+	}
+
+	n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL)
+	if e != 0 {
+		// N.B. prioritize the syscall error over the buildIovec error.
+		return 0, length, e
+	}
+
+	return int64(n), length, err
+}
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
new file mode 100644
index 000000000..68af6e5af
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -0,0 +1,379 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/unimpl"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// TTYFileDescription implements vfs.FileDescriptionImpl for a host file
+// descriptor that wraps a TTY FD.
+type TTYFileDescription struct {
+	fileDescription
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// session is the session attached to this TTYFileDescription.
+	session *kernel.Session
+
+	// fgProcessGroup is the foreground process group that is currently
+	// connected to this TTY.
+	fgProcessGroup *kernel.ProcessGroup
+
+	// termios contains the terminal attributes for this TTY.
+	termios linux.KernelTermios
+}
+
+// InitForegroundProcessGroup sets the foreground process group and session for
+// the TTY. This should only be called once, after the foreground process group
+// has been created, but before it has started running.
+func (t *TTYFileDescription) InitForegroundProcessGroup(pg *kernel.ProcessGroup) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.fgProcessGroup != nil {
+		panic("foreground process group is already set")
+	}
+	t.fgProcessGroup = pg
+	t.session = pg.Session()
+}
+
+// ForegroundProcessGroup returns the foreground process for the TTY.
+func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.fgProcessGroup
+}
+
+// Release implements fs.FileOperations.Release.
+func (t *TTYFileDescription) Release() {
+	t.mu.Lock()
+	t.fgProcessGroup = nil
+	t.mu.Unlock()
+
+	t.fileDescription.Release()
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileDescription.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileDescription.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+	}
+	return t.fileDescription.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+	}
+	return t.fileDescription.Write(ctx, src, opts)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	// Ignore arg[0]. This is the real FD:
+	fd := t.inode.hostFD
+	ioctl := args[1].Uint64()
+	switch ioctl {
+	case linux.TCGETS:
+		termios, err := ioctlGetTermios(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+
+		var termios linux.Termios
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetTermios(fd, ioctl, &termios)
+		if err == nil {
+			t.termios.FromTermios(termios)
+		}
+		return 0, err
+
+	case linux.TIOCGPGRP:
+		// Args: pid_t *argp
+		// When successful, equivalent to *argp = tcgetpgrp(fd).
+		// Get the process group ID of the foreground process group on this
+		// terminal.
+
+		pidns := kernel.PIDNamespaceFromContext(ctx)
+		if pidns == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
+		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSPGRP:
+		// Args: const pid_t *argp
+		// Equivalent to tcsetpgrp(fd, *argp).
+		// Set the foreground process group ID of this terminal.
+
+		task := kernel.TaskFromContext(ctx)
+		if task == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Check that we are allowed to set the process group.
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			// drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change()
+			// to -ENOTTY.
+			if err == syserror.EIO {
+				return 0, syserror.ENOTTY
+			}
+			return 0, err
+		}
+
+		// Check that calling task's process group is in the TTY session.
+		if task.ThreadGroup().Session() != t.session {
+			return 0, syserror.ENOTTY
+		}
+
+		var pgID kernel.ProcessGroupID
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+
+		// pgID must be non-negative.
+		if pgID < 0 {
+			return 0, syserror.EINVAL
+		}
+
+		// Process group with pgID must exist in this PID namespace.
+		pidns := task.PIDNamespace()
+		pg := pidns.ProcessGroupWithID(pgID)
+		if pg == nil {
+			return 0, syserror.ESRCH
+		}
+
+		// Check that new process group is in the TTY session.
+		if pg.Session() != t.session {
+			return 0, syserror.EPERM
+		}
+
+		t.fgProcessGroup = pg
+		return 0, nil
+
+	case linux.TIOCGWINSZ:
+		// Args: struct winsize *argp
+		// Get window size.
+		winsize, err := ioctlGetWinsize(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSWINSZ:
+		// Args: const struct winsize *argp
+		// Set window size.
+
+		// Unlike setting the termios, any process group (even background ones) can
+		// set the winsize.
+
+		var winsize linux.Winsize
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetWinsize(fd, &winsize)
+		return 0, err
+
+	// Unimplemented commands.
+	case linux.TIOCSETD,
+		linux.TIOCSBRK,
+		linux.TIOCCBRK,
+		linux.TCSBRK,
+		linux.TCSBRKP,
+		linux.TIOCSTI,
+		linux.TIOCCONS,
+		linux.FIONBIO,
+		linux.TIOCEXCL,
+		linux.TIOCNXCL,
+		linux.TIOCGEXCL,
+		linux.TIOCNOTTY,
+		linux.TIOCSCTTY,
+		linux.TIOCGSID,
+		linux.TIOCGETD,
+		linux.TIOCVHANGUP,
+		linux.TIOCGDEV,
+		linux.TIOCMGET,
+		linux.TIOCMSET,
+		linux.TIOCMBIC,
+		linux.TIOCMBIS,
+		linux.TIOCGICOUNT,
+		linux.TCFLSH,
+		linux.TIOCSSERIAL,
+		linux.TIOCGPTPEER:
+
+		unimpl.EmitUnimplementedEvent(ctx)
+		fallthrough
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// checkChange checks that the process group is allowed to read, write, or
+// change the state of the TTY.
+//
+// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic
+// is a bit convoluted, but documented inline.
+//
+// Preconditions: t.mu must be held.
+func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		// No task? Linux does not have an analog for this case, but
+		// tty_check_change is more of a blacklist of cases than a
+		// whitelist, and is surprisingly permissive. Allowing the
+		// change seems most appropriate.
+		return nil
+	}
+
+	tg := task.ThreadGroup()
+	pg := tg.ProcessGroup()
+
+	// If the session for the task is different than the session for the
+	// controlling TTY, then the change is allowed. Seems like a bad idea,
+	// but that's exactly what linux does.
+	if tg.Session() != t.fgProcessGroup.Session() {
+		return nil
+	}
+
+	// If we are the foreground process group, then the change is allowed.
+	if pg == t.fgProcessGroup {
+		return nil
+	}
+
+	// We are not the foreground process group.
+
+	// Is the provided signal blocked or ignored?
+	if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) {
+		// If the signal is SIGTTIN, then we are attempting to read
+		// from the TTY. Don't send the signal and return EIO.
+		if sig == linux.SIGTTIN {
+			return syserror.EIO
+		}
+
+		// Otherwise, we are writing or changing terminal state. This is allowed.
+		return nil
+	}
+
+	// If the process group is an orphan, return EIO.
+	if pg.IsOrphan() {
+		return syserror.EIO
+	}
+
+	// Otherwise, send the signal to the process group and return ERESTARTSYS.
+	//
+	// Note that Linux also unconditionally sets TIF_SIGPENDING on current,
+	// but this isn't necessary in gVisor because the rationale given in
+	// 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't
+	// apply: the sentry will handle -ERESTARTSYS in
+	// kernel.runApp.execute() even if the kernel.Task isn't interrupted.
+	//
+	// Linux ignores the result of kill_pgrp().
+	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
+	return kernel.ERESTARTSYS
+}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
new file mode 100644
index 000000000..2bc757b1a
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func toTimespec(ts linux.StatxTimestamp, omit bool) syscall.Timespec {
+	if omit {
+		return syscall.Timespec{
+			Sec:  0,
+			Nsec: unix.UTIME_OMIT,
+		}
+	}
+	return syscall.Timespec{
+		Sec:  ts.Sec,
+		Nsec: int64(ts.Nsec),
+	}
+}
+
+func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp {
+	return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec}
+}
+
+func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp {
+	return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)}
+}
+
+// wouldBlock returns true for file types that can return EWOULDBLOCK
+// for blocking operations, e.g. pipes, character devices, and sockets.
+func wouldBlock(fileType uint32) bool {
+	return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
+}
+
+// canMap returns true if a file with fileType is allowed to be memory mapped.
+// This is ported over from VFS1, but it's probably not the best way for us
+// to check if a file can be memory mapped.
+func canMap(fileType uint32) bool {
+	// TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()).
+	//
+	// TODO(b/38213152): Some obscure character devices can be mapped.
+	return fileType == syscall.S_IFREG
+}
+
+// isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
+// If so, they can be transformed into syserror.ErrWouldBlock.
+func isBlockError(err error) bool {
+	return err == syserror.EAGAIN || err == syserror.EWOULDBLOCK
+}
diff --git a/pkg/sentry/fsimpl/host/util_unsafe.go b/pkg/sentry/fsimpl/host/util_unsafe.go
new file mode 100644
index 000000000..5136ac844
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/util_unsafe.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+func setTimestamps(fd int, ts *[2]syscall.Timespec) error {
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_UTIMENSAT,
+		uintptr(fd),
+		0, /* path */
+		uintptr(unsafe.Pointer(ts)),
+		0, /* flags */
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index e73f1f857..ef34cb28a 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -4,6 +4,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
 licenses(["notice"])
 
 go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "kernfs",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "Dentry",
+    },
+)
+
+go_template_instance(
     name = "slot_list",
     out = "slot_list.go",
     package = "kernfs",
@@ -21,6 +32,7 @@ go_library(
         "dynamic_bytes_file.go",
         "fd_impl_util.go",
         "filesystem.go",
+        "fstree.go",
         "inode_impl_util.go",
         "kernfs.go",
         "slot_list.go",
@@ -35,6 +47,7 @@ go_library(
         "//pkg/refs",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index d092ccb2a..1568a9d49 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -44,16 +44,16 @@ type DynamicBytesFile struct {
 var _ Inode = (*DynamicBytesFile)(nil)
 
 // Init initializes a dynamic bytes file.
-func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
-	f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm)
+	f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
 	f.data = data
 }
 
 // Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
 	if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
 		return nil, err
@@ -61,9 +61,10 @@ func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vf
 	return &fd.vfsfd, nil
 }
 
-// SetStat implements Inode.SetStat.
-func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
-	// DynamicBytesFiles are immutable.
+// SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow
+// inode attributes to be changed. Override SetStat() making it call
+// f.InodeAttrs to allow it.
+func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
@@ -122,7 +123,7 @@ func (fd *DynamicBytesFD) Release() {}
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return fd.inode.Stat(fs), nil
+	return fd.inode.Stat(fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index eda781155..8284e76a7 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -15,10 +15,14 @@
 package kernfs
 
 import (
+	"math"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -33,24 +37,43 @@ import (
 // inode.
 //
 // Must be initialize with Init before first use.
+//
+// Lock ordering: mu => children.mu.
 type GenericDirectoryFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DirectoryFileDescriptionDefaultImpl
 
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
-	off      int64
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// off is the current directory offset. Protected by "mu".
+	off int64
+}
+
+// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
+// dentry.
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+	fd := &GenericDirectoryFD{}
+	if err := fd.Init(children, opts); err != nil {
+		return nil, err
+	}
+	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return fd, nil
 }
 
-// Init initializes a GenericDirectoryFD.
-func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error {
+// Init initializes a GenericDirectoryFD. Use it when overriding
+// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
+// correct implementation.
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error {
 	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
-	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
-		return err
-	}
 	fd.children = children
 	return nil
 }
@@ -100,40 +123,44 @@ func (fd *GenericDirectoryFD) inode() Inode {
 // IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds
 // o.mu when calling cb.
 func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
-	vfsFS := fd.filesystem()
-	fs := vfsFS.Impl().(*Filesystem)
-	vfsd := fd.vfsfd.VirtualDentry().Dentry()
-
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
 
+	opts := vfs.StatOptions{Mask: linux.STATX_INO}
 	// Handle ".".
 	if fd.off == 0 {
-		stat := fd.inode().Stat(vfsFS)
+		stat, err := fd.inode().Stat(fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    ".",
 			Type:    linux.DT_DIR,
 			Ino:     stat.Ino,
 			NextOff: 1,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
 
 	// Handle "..".
 	if fd.off == 1 {
-		parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode
-		stat := parentInode.Stat(vfsFS)
+		vfsd := fd.vfsfd.VirtualDentry().Dentry()
+		parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
+		stat, err := parentInode.Stat(fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    "..",
 			Type:    linux.FileMode(stat.Mode).DirentType(),
 			Ino:     stat.Ino,
 			NextOff: 2,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -146,15 +173,18 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	childIdx := fd.off - 2
 	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
 		inode := it.Dentry.Impl().(*Dentry).inode
-		stat := inode.Stat(vfsFS)
+		stat, err := inode.Stat(fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
 		dirent := vfs.Dirent{
 			Name:    it.Name,
 			Type:    linux.FileMode(stat.Mode).DirentType(),
 			Ino:     stat.Ino,
 			NextOff: fd.off + 1,
 		}
-		if !cb.Handle(dirent) {
-			return nil
+		if err := cb.Handle(dirent); err != nil {
+			return err
 		}
 		fd.off++
 	}
@@ -167,15 +197,18 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 
 // Seek implements vfs.FileDecriptionImpl.Seek.
 func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fs := fd.filesystem().Impl().(*Filesystem)
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
 
 	switch whence {
 	case linux.SEEK_SET:
 		// Use offset as given.
 	case linux.SEEK_CUR:
 		offset += fd.off
+	case linux.SEEK_END:
+		// TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
+		// if they are added after SEEK_END.
+		offset = math.MaxInt64
 	default:
 		return 0, syserror.EINVAL
 	}
@@ -190,12 +223,12 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.filesystem()
 	inode := fd.inode()
-	return inode.Stat(fs), nil
+	return inode.Stat(fs, opts)
 }
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	fs := fd.filesystem()
+	creds := auth.CredentialsFromContext(ctx)
 	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
-	return inode.SetStat(fs, opts)
+	return inode.SetStat(ctx, fd.filesystem(), creds, opts)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index ee98eb66a..4a12ae245 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -12,16 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// This file implements vfs.FilesystemImpl for kernfs.
-
 package kernfs
 
+// This file implements vfs.FilesystemImpl for kernfs.
+
 import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -54,36 +56,48 @@ afterSymlink:
 		return vfsd, nil
 	}
 	if name == ".." {
-		nextVFSD, err := rp.ResolveParent(vfsd)
-		if err != nil {
+		if isRoot, err := rp.CheckRoot(vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return vfsd, nil
+		}
+		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
 			return nil, err
 		}
 		rp.Advance()
-		return nextVFSD, nil
+		return &d.parent.vfsd, nil
+	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
 	}
 	d.dirMu.Lock()
-	nextVFSD, err := rp.ResolveChild(vfsd, name)
+	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name])
+	d.dirMu.Unlock()
 	if err != nil {
-		d.dirMu.Unlock()
 		return nil, err
 	}
-	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD)
-	d.dirMu.Unlock()
-	if err != nil {
+	if err := rp.CheckMount(&next.vfsd); err != nil {
 		return nil, err
 	}
 	// Resolve any symlink at current path component.
 	if rp.ShouldFollowSymlink() && next.isSymlink() {
-		// TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
-		target, err := next.inode.Readlink(ctx)
+		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
 		if err != nil {
 			return nil, err
 		}
-		if err := rp.HandleSymlink(target); err != nil {
-			return nil, err
+		if targetVD.Ok() {
+			err := rp.HandleJump(targetVD)
+			targetVD.DecRef()
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			if err := rp.HandleSymlink(targetPathname); err != nil {
+				return nil, err
+			}
 		}
 		goto afterSymlink
-
 	}
 	rp.Advance()
 	return &next.vfsd, nil
@@ -97,17 +111,17 @@ afterSymlink:
 // parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) {
-	if childVFSD != nil {
+func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
+	if child != nil {
 		// Cached dentry exists, revalidate.
-		child := childVFSD.Impl().(*Dentry)
 		if !child.inode.Valid(ctx) {
-			vfsObj.ForceDeleteDentry(childVFSD)
-			fs.deferDecRef(childVFSD) // Reference from Lookup.
-			childVFSD = nil
+			delete(parent.children, name)
+			vfsObj.InvalidateDentry(&child.vfsd)
+			fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+			child = nil
 		}
 	}
-	if childVFSD == nil {
+	if child == nil {
 		// Dentry isn't cached; it either doesn't exist or failed
 		// revalidation. Attempt to resolve it via Lookup.
 		//
@@ -115,15 +129,15 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		// *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
 		// that all dentries in the filesystem are (kernfs.)Dentry and performs
 		// vfs.DentryImpl casts accordingly.
-		var err error
-		childVFSD, err = parent.inode.Lookup(ctx, name)
+		childVFSD, err := parent.inode.Lookup(ctx, name)
 		if err != nil {
 			return nil, err
 		}
 		// Reference on childVFSD dropped by a corresponding Valid.
-		parent.insertChildLocked(name, childVFSD)
+		child = childVFSD.Impl().(*Dentry)
+		parent.insertChildLocked(name, child)
 	}
-	return childVFSD.Impl().(*Dentry), nil
+	return child, nil
 }
 
 // walkExistingLocked resolves rp to an existing file.
@@ -189,14 +203,14 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 	if pc == "." || pc == ".." {
 		return "", syserror.EEXIST
 	}
-	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
-	if err != nil {
-		return "", err
+	if len(pc) > linux.NAME_MAX {
+		return "", syserror.ENAMETOOLONG
 	}
-	if childVFSD != nil {
+	// FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
+	if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok {
 		return "", syserror.EEXIST
 	}
-	if parentVFSD.IsDisowned() {
+	if parentVFSD.IsDead() {
 		return "", syserror.ENOENT
 	}
 	return pc, nil
@@ -206,14 +220,14 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 //
 // Preconditions: Filesystem.mu must be locked for at least reading.
 func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
-	parentVFSD := vfsd.Parent()
-	if parentVFSD == nil {
+	parent := vfsd.Impl().(*Dentry).parent
+	if parent == nil {
 		return syserror.EBUSY
 	}
-	if parentVFSD.IsDisowned() {
+	if parent.vfsd.IsDead() {
 		return syserror.ENOENT
 	}
-	if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	return nil
@@ -229,6 +243,19 @@ func (fs *Filesystem) Sync(ctx context.Context) error {
 	return nil
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs()
+	defer fs.mu.RUnlock()
+
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	if err != nil {
+		return err
+	}
+	return inode.CheckPermissions(ctx, creds, ats)
+}
+
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	fs.mu.RLock()
@@ -294,11 +321,11 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		return syserror.EPERM
 	}
 
-	child, err := parentInode.NewLink(ctx, pc, d.inode)
+	childVFSD, err := parentInode.NewLink(ctx, pc, d.inode)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
 	return nil
 }
 
@@ -322,11 +349,11 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	child, err := parentInode.NewDir(ctx, pc, opts)
+	childVFSD, err := parentInode.NewDir(ctx, pc, opts)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
 	return nil
 }
 
@@ -350,11 +377,11 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	new, err := parentInode.NewNode(ctx, pc, opts)
+	newVFSD, err := parentInode.NewNode(ctx, pc, opts)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, new)
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry))
 	return nil
 }
 
@@ -364,7 +391,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
 	// appropriate bits in rp), but are returned by
 	// FileDescriptionImpl.StatusFlags().
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
 	ats := vfs.AccessTypesForOpenFlags(&opts)
 
 	// Do not create new file.
@@ -379,7 +406,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		return inode.Open(rp, vfsd, opts)
+		return inode.Open(ctx, rp, vfsd, opts)
 	}
 
 	// May create new file.
@@ -398,7 +425,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		return inode.Open(rp, vfsd, opts)
+		return inode.Open(ctx, rp, vfsd, opts)
 	}
 afterTrailingSymlink:
 	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
@@ -418,12 +445,12 @@ afterTrailingSymlink:
 	if pc == "." || pc == ".." {
 		return nil, syserror.EISDIR
 	}
-	// Determine whether or not we need to create a file.
-	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
-	if err != nil {
-		return nil, err
+	if len(pc) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
 	}
-	if childVFSD == nil {
+	// Determine whether or not we need to create a file.
+	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD)
+	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
 		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
@@ -433,37 +460,46 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		child, err := parentInode.NewFile(ctx, pc, opts)
+		childVFSD, err = parentInode.NewFile(ctx, pc, opts)
 		if err != nil {
 			return nil, err
 		}
+		child := childVFSD.Impl().(*Dentry)
 		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
-		return child.Impl().(*Dentry).inode.Open(rp, child, opts)
+		return child.inode.Open(ctx, rp, childVFSD, opts)
+	}
+	if err != nil {
+		return nil, err
 	}
 	// Open existing file or follow symlink.
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
-	childDentry := childVFSD.Impl().(*Dentry)
-	childInode := childDentry.inode
-	if rp.ShouldFollowSymlink() {
-		if childDentry.isSymlink() {
-			target, err := childInode.Readlink(ctx)
+	child := childVFSD.Impl().(*Dentry)
+	if rp.ShouldFollowSymlink() && child.isSymlink() {
+		targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if targetVD.Ok() {
+			err := rp.HandleJump(targetVD)
+			targetVD.DecRef()
 			if err != nil {
 				return nil, err
 			}
-			if err := rp.HandleSymlink(target); err != nil {
+		} else {
+			if err := rp.HandleSymlink(targetPathname); err != nil {
 				return nil, err
 			}
-			// rp.Final() may no longer be true since we now need to resolve the
-			// symlink target.
-			goto afterTrailingSymlink
 		}
+		// rp.Final() may no longer be true since we now need to resolve the
+		// symlink target.
+		goto afterTrailingSymlink
 	}
-	if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
-	return childInode.Open(rp, childVFSD, opts)
+	return child.inode.Open(ctx, rp, &child.vfsd, opts)
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
@@ -490,15 +526,16 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
 
 	fs.mu.Lock()
-	defer fs.mu.Lock()
+	defer fs.processDeferredDecRefsLocked()
+	defer fs.mu.Unlock()
 
 	// Resolve the destination directory first to verify that it's on this
 	// Mount.
 	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked()
 	if err != nil {
 		return err
 	}
+	dstDir := dstDirVFSD.Impl().(*Dentry)
 	mnt := rp.Mount()
 	if mnt != oldParentVD.Mount() {
 		return syserror.EXDEV
@@ -511,9 +548,8 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	srcDirVFSD := oldParentVD.Dentry()
 	srcDir := srcDirVFSD.Impl().(*Dentry)
 	srcDir.dirMu.Lock()
-	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName))
+	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName])
 	srcDir.dirMu.Unlock()
-	fs.processDeferredDecRefsLocked()
 	if err != nil {
 		return err
 	}
@@ -525,7 +561,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 
 	// Can we create the dst dentry?
-	var dstVFSD *vfs.Dentry
+	var dst *Dentry
 	pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
 	switch err {
 	case nil:
@@ -535,37 +571,51 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			// Won't overwrite existing node since RENAME_NOREPLACE was requested.
 			return syserror.EEXIST
 		}
-		dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc)
-		if err != nil {
+		dst = dstDir.children[pc]
+		if dst == nil {
 			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
 		}
 	default:
 		return err
 	}
+	var dstVFSD *vfs.Dentry
+	if dst != nil {
+		dstVFSD = &dst.vfsd
+	}
 
 	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
 	virtfs := rp.VirtualFilesystem()
 
-	srcDirDentry := srcDirVFSD.Impl().(*Dentry)
-	dstDirDentry := dstDirVFSD.Impl().(*Dentry)
-
 	// We can't deadlock here due to lock ordering because we're protected from
 	// concurrent renames by fs.mu held for writing.
-	srcDirDentry.dirMu.Lock()
-	defer srcDirDentry.dirMu.Unlock()
-	dstDirDentry.dirMu.Lock()
-	defer dstDirDentry.dirMu.Unlock()
+	srcDir.dirMu.Lock()
+	defer srcDir.dirMu.Unlock()
+	if srcDir != dstDir {
+		dstDir.dirMu.Lock()
+		defer dstDir.dirMu.Unlock()
+	}
 
 	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
 		return err
 	}
-	srcDirInode := srcDirDentry.inode
-	replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD)
+	replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
 	if err != nil {
 		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
 		return err
 	}
-	virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced)
+	delete(srcDir.children, src.name)
+	if srcDir != dstDir {
+		fs.deferDecRef(srcDirVFSD)
+		dstDir.IncRef()
+	}
+	src.parent = dstDir
+	src.name = pc
+	if dstDir.children == nil {
+		dstDir.children = make(map[string]*Dentry)
+	}
+	dstDir.children[pc] = src
+	virtfs.CommitRenameReplaceDentry(srcVFSD, replaced)
 	return nil
 }
 
@@ -585,17 +635,21 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
 		return err
 	}
-	if !vfsd.Impl().(*Dentry).isDir() {
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
 		return syserror.ENOTDIR
 	}
 	if inode.HasChildren() {
 		return syserror.ENOTEMPTY
 	}
 	virtfs := rp.VirtualFilesystem()
-	parentDentry := vfsd.Parent().Impl().(*Dentry)
+	parentDentry := d.parent
 	parentDentry.dirMu.Lock()
 	defer parentDentry.dirMu.Unlock()
-	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
 	if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
@@ -618,7 +672,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	return inode.SetStat(fs.VFSFilesystem(), opts)
+	return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
@@ -630,7 +684,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if err != nil {
 		return linux.Statx{}, err
 	}
-	return inode.Stat(fs.VFSFilesystem()), nil
+	return inode.Stat(fs.VFSFilesystem(), opts)
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
@@ -642,7 +696,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO: actually implement statfs
+	// TODO(gvisor.dev/issue/1193): actually implement statfs.
 	return linux.Statfs{}, syserror.ENOSYS
 }
 
@@ -666,11 +720,11 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	child, err := parentInode.NewSymlink(ctx, pc, target)
+	childVFSD, err := parentInode.NewSymlink(ctx, pc, target)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
 	return nil
 }
 
@@ -690,14 +744,17 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
 		return err
 	}
-	if vfsd.Impl().(*Dentry).isDir() {
+	d := vfsd.Impl().(*Dentry)
+	if d.isDir() {
 		return syserror.EISDIR
 	}
 	virtfs := rp.VirtualFilesystem()
-	parentDentry := vfsd.Parent().Impl().(*Dentry)
+	parentDentry := d.parent
 	parentDentry.dirMu.Lock()
 	defer parentDentry.dirMu.Unlock()
-	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
 	if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
@@ -708,8 +765,23 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	fs.mu.RLock()
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return nil, err
+	}
+	if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
@@ -722,7 +794,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
@@ -764,5 +836,5 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 099d70a16..982daa2e6 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -36,20 +36,20 @@ type InodeNoopRefCount struct {
 }
 
 // IncRef implements Inode.IncRef.
-func (n *InodeNoopRefCount) IncRef() {
+func (InodeNoopRefCount) IncRef() {
 }
 
 // DecRef implements Inode.DecRef.
-func (n *InodeNoopRefCount) DecRef() {
+func (InodeNoopRefCount) DecRef() {
 }
 
 // TryIncRef implements Inode.TryIncRef.
-func (n *InodeNoopRefCount) TryIncRef() bool {
+func (InodeNoopRefCount) TryIncRef() bool {
 	return true
 }
 
 // Destroy implements Inode.Destroy.
-func (n *InodeNoopRefCount) Destroy() {
+func (InodeNoopRefCount) Destroy() {
 }
 
 // InodeDirectoryNoNewChildren partially implements the Inode interface.
@@ -58,27 +58,27 @@ func (n *InodeNoopRefCount) Destroy() {
 type InodeDirectoryNoNewChildren struct{}
 
 // NewFile implements Inode.NewFile.
-func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewDir implements Inode.NewDir.
-func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewLink implements Inode.NewLink.
-func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
 // NewNode implements Inode.NewNode.
-func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
 	return nil, syserror.EPERM
 }
 
@@ -90,62 +90,62 @@ type InodeNotDirectory struct {
 }
 
 // HasChildren implements Inode.HasChildren.
-func (*InodeNotDirectory) HasChildren() bool {
+func (InodeNotDirectory) HasChildren() bool {
 	return false
 }
 
 // NewFile implements Inode.NewFile.
-func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
 	panic("NewFile called on non-directory inode")
 }
 
 // NewDir implements Inode.NewDir.
-func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
 	panic("NewDir called on non-directory inode")
 }
 
 // NewLink implements Inode.NewLinkink.
-func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
 	panic("NewLink called on non-directory inode")
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
 	panic("NewSymlink called on non-directory inode")
 }
 
 // NewNode implements Inode.NewNode.
-func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
 	panic("NewNode called on non-directory inode")
 }
 
 // Unlink implements Inode.Unlink.
-func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
 	panic("Unlink called on non-directory inode")
 }
 
 // RmDir implements Inode.RmDir.
-func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
 	panic("RmDir called on non-directory inode")
 }
 
 // Rename implements Inode.Rename.
-func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
 	panic("Rename called on non-directory inode")
 }
 
 // Lookup implements Inode.Lookup.
-func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
 	panic("Lookup called on non-directory inode")
 }
 
 // IterDirents implements Inode.IterDirents.
-func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
 	panic("IterDirents called on non-directory inode")
 }
 
 // Valid implements Inode.Valid.
-func (*InodeNotDirectory) Valid(context.Context) bool {
+func (InodeNotDirectory) Valid(context.Context) bool {
 	return true
 }
 
@@ -157,17 +157,17 @@ func (*InodeNotDirectory) Valid(context.Context) bool {
 type InodeNoDynamicLookup struct{}
 
 // Lookup implements Inode.Lookup.
-func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
 	return nil, syserror.ENOENT
 }
 
 // IterDirents implements Inode.IterDirents.
-func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	return offset, nil
 }
 
 // Valid implements Inode.Valid.
-func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
+func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
 	return true
 }
 
@@ -177,25 +177,32 @@ func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
 type InodeNotSymlink struct{}
 
 // Readlink implements Inode.Readlink.
-func (*InodeNotSymlink) Readlink(context.Context) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context) (string, error) {
 	return "", syserror.EINVAL
 }
 
+// Getlink implements Inode.Getlink.
+func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	return vfs.VirtualDentry{}, "", syserror.EINVAL
+}
+
 // InodeAttrs partially implements the Inode interface, specifically the
 // inodeMetadata sub interface. InodeAttrs provides functionality related to
 // inode attributes.
 //
 // Must be initialized by Init prior to first use.
 type InodeAttrs struct {
-	ino   uint64
-	mode  uint32
-	uid   uint32
-	gid   uint32
-	nlink uint32
+	devMajor uint32
+	devMinor uint32
+	ino      uint64
+	mode     uint32
+	uid      uint32
+	gid      uint32
+	nlink    uint32
 }
 
 // Init initializes this InodeAttrs.
-func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) {
+func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
 	if mode.FileType() == 0 {
 		panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
 	}
@@ -204,6 +211,8 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo
 	if mode.FileType() == linux.ModeDirectory {
 		nlink = 2
 	}
+	a.devMajor = devMajor
+	a.devMinor = devMinor
 	atomic.StoreUint64(&a.ino, ino)
 	atomic.StoreUint32(&a.mode, uint32(mode))
 	atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
@@ -211,6 +220,21 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo
 	atomic.StoreUint32(&a.nlink, nlink)
 }
 
+// DevMajor returns the device major number.
+func (a *InodeAttrs) DevMajor() uint32 {
+	return a.devMajor
+}
+
+// DevMinor returns the device minor number.
+func (a *InodeAttrs) DevMinor() uint32 {
+	return a.devMinor
+}
+
+// Ino returns the inode id.
+func (a *InodeAttrs) Ino() uint64 {
+	return atomic.LoadUint64(&a.ino)
+}
+
 // Mode implements Inode.Mode.
 func (a *InodeAttrs) Mode() linux.FileMode {
 	return linux.FileMode(atomic.LoadUint32(&a.mode))
@@ -219,22 +243,34 @@ func (a *InodeAttrs) Mode() linux.FileMode {
 // Stat partially implements Inode.Stat. Note that this function doesn't provide
 // all the stat fields, and the embedder should consider extending the result
 // with filesystem-specific fields.
-func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx {
+func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+	stat.DevMajor = a.devMajor
+	stat.DevMinor = a.devMinor
 	stat.Ino = atomic.LoadUint64(&a.ino)
 	stat.Mode = uint16(a.Mode())
 	stat.UID = atomic.LoadUint32(&a.uid)
 	stat.GID = atomic.LoadUint32(&a.gid)
 	stat.Nlink = atomic.LoadUint32(&a.nlink)
 
-	// TODO: Implement other stat fields like timestamps.
+	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
 
-	return stat
+	return stat, nil
 }
 
 // SetStat implements Inode.SetStat.
-func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+		return syserror.EPERM
+	}
+	if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+		return err
+	}
+
 	stat := opts.Stat
 	if stat.Mask&linux.STATX_MODE != 0 {
 		for {
@@ -256,19 +292,17 @@ func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
 	// Note that not all fields are modifiable. For example, the file type and
 	// inode numbers are immutable after node creation.
 
-	// TODO: Implement other stat fields like timestamps.
+	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
 
 	return nil
 }
 
 // CheckPermissions implements Inode.CheckPermissions.
 func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
-	mode := a.Mode()
 	return vfs.GenericCheckPermissions(
 		creds,
 		ats,
-		mode.FileType() == linux.ModeDirectory,
-		uint16(mode),
+		a.Mode(),
 		auth.KUID(atomic.LoadUint32(&a.uid)),
 		auth.KGID(atomic.LoadUint32(&a.gid)),
 	)
@@ -346,8 +380,8 @@ func (o *OrderedChildren) Destroy() {
 // cache. Populate returns the number of directories inserted, which the caller
 // may use to update the link count for the parent directory.
 //
-// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory
-// inode. children must not contain any conflicting entries already in o.
+// Precondition: d must represent a directory inode. children must not contain
+// any conflicting entries already in o.
 func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
 	var links uint32
 	for name, child := range children {
@@ -357,7 +391,7 @@ func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint3
 		if err := o.Insert(name, child.VFSDentry()); err != nil {
 			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
 		}
-		d.InsertChild(name, child.VFSDentry())
+		d.InsertChild(name, child)
 	}
 	return links
 }
@@ -507,7 +541,7 @@ type InodeSymlink struct {
 }
 
 // Open implements Inode.Open.
-func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ELOOP
 }
 
@@ -526,9 +560,9 @@ type StaticDirectory struct {
 var _ Inode = (*StaticDirectory)(nil)
 
 // NewStaticDir creates a new static directory and returns its dentry.
-func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
 	inode := &StaticDirectory{}
-	inode.Init(creds, ino, perm)
+	inode.Init(creds, devMajor, devMinor, ino, perm)
 
 	dentry := &Dentry{}
 	dentry.Init(inode)
@@ -541,16 +575,31 @@ func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, chil
 }
 
 // Init initializes StaticDirectory.
-func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) {
+func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
-	s.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm)
+	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
 }
 
 // Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// AlwaysValid partially implements kernfs.inodeDynamicLookup.
+type AlwaysValid struct{}
+
+// Valid implements kernfs.inodeDynamicLookup.
+func (*AlwaysValid) Valid(context.Context) bool {
+	return true
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index c74fa999b..bbee8ccda 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -63,9 +63,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
-// FilesystemType implements vfs.FilesystemType.
-type FilesystemType struct{}
-
 // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
 // filesystem. Concrete implementations are expected to embed this in their own
 // Filesystem type.
@@ -135,13 +132,6 @@ func (fs *Filesystem) processDeferredDecRefsLocked() {
 	fs.droppedDentriesMu.Unlock()
 }
 
-// Init initializes a kernfs filesystem. This should be called from during
-// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding
-// kernfs.
-func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) {
-	fs.vfsfs.Init(vfsObj, fs)
-}
-
 // VFSFilesystem returns the generic vfs filesystem object.
 func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
 	return &fs.vfsfs
@@ -171,19 +161,22 @@ const (
 //
 // Must be initialized by Init prior to first use.
 type Dentry struct {
-	refs.AtomicRefCount
-
-	vfsd  vfs.Dentry
-	inode Inode
+	vfsd vfs.Dentry
 
-	refs uint64
+	refs.AtomicRefCount
 
 	// flags caches useful information about the dentry from the inode. See the
 	// dflags* consts above. Must be accessed by atomic ops.
 	flags uint32
 
-	// dirMu protects vfsd.children for directory dentries.
-	dirMu sync.Mutex
+	parent *Dentry
+	name   string
+
+	// dirMu protects children and the names of child Dentries.
+	dirMu    sync.Mutex
+	children map[string]*Dentry
+
+	inode Inode
 }
 
 // Init initializes this dentry.
@@ -227,18 +220,30 @@ func (d *Dentry) DecRef() {
 func (d *Dentry) destroy() {
 	d.inode.DecRef() // IncRef from Init.
 	d.inode = nil
-	if parent := d.vfsd.Parent(); parent != nil {
-		parent.DecRef() // IncRef from Dentry.InsertChild.
+	if d.parent != nil {
+		d.parent.DecRef() // IncRef from Dentry.InsertChild.
 	}
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *Dentry) Watches() *vfs.Watches {
+	return nil
+}
+
 // InsertChild inserts child into the vfs dentry cache with the given name under
 // this dentry. This does not update the directory inode, so calling this on
-// it's own isn't sufficient to insert a child into a directory. InsertChild
+// its own isn't sufficient to insert a child into a directory. InsertChild
 // updates the link count on d if required.
 //
 // Precondition: d must represent a directory inode.
-func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
+func (d *Dentry) InsertChild(name string, child *Dentry) {
 	d.dirMu.Lock()
 	d.insertChildLocked(name, child)
 	d.dirMu.Unlock()
@@ -248,13 +253,22 @@ func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
 // preconditions.
 //
 // Precondition: d.dirMu must be locked.
-func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) {
+func (d *Dentry) insertChildLocked(name string, child *Dentry) {
 	if !d.isDir() {
 		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
 	}
-	vfsDentry := d.VFSDentry()
-	vfsDentry.IncRef() // DecRef in child's Dentry.destroy.
-	vfsDentry.InsertChild(child, name)
+	d.IncRef() // DecRef in child's Dentry.destroy.
+	child.parent = d
+	child.name = name
+	if d.children == nil {
+		d.children = make(map[string]*Dentry)
+	}
+	d.children[name] = child
+}
+
+// Inode returns the dentry's inode.
+func (d *Dentry) Inode() Inode {
+	return d.inode
 }
 
 // The Inode interface maps filesystem-level operations that operate on paths to
@@ -302,8 +316,9 @@ type Inode interface {
 	// this inode. The returned file description should hold a reference on the
 	// inode for its lifetime.
 	//
-	// Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
-	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
+	// the inode on which Open() is being called.
+	Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
 }
 
 type inodeRefs interface {
@@ -320,7 +335,7 @@ type inodeMetadata interface {
 	// CheckPermissions checks that creds may access this inode for the
 	// requested access type, per the the rules of
 	// fs/namei.c:generic_permission().
-	CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error
+	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error
 
 	// Mode returns the (struct stat)::st_mode value for this inode. This is
 	// separated from Stat for performance.
@@ -328,11 +343,13 @@ type inodeMetadata interface {
 
 	// Stat returns the metadata for this inode. This corresponds to
 	// vfs.FilesystemImpl.StatAt.
-	Stat(fs *vfs.Filesystem) linux.Statx
+	Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
 
 	// SetStat updates the metadata for this inode. This corresponds to
-	// vfs.FilesystemImpl.SetStatAt.
-	SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error
+	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
+	// if the operation can be performed (see vfs.CheckSetStat() for common
+	// checks).
+	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
 }
 
 // Precondition: All methods in this interface may only be called on directory
@@ -416,7 +433,21 @@ type inodeDynamicLookup interface {
 }
 
 type inodeSymlink interface {
-	// Readlink resolves the target of a symbolic link. If an inode is not a
+	// Readlink returns the target of a symbolic link. If an inode is not a
 	// symlink, the implementation should return EINVAL.
 	Readlink(ctx context.Context) (string, error)
+
+	// Getlink returns the target of a symbolic link, as used by path
+	// resolution:
+	//
+	// - If the inode is a "magic link" (a link whose target is most accurately
+	// represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
+	// nil). A reference is taken on the returned VirtualDentry.
+	//
+	// - If the inode is an ordinary symlink, Getlink returns (zero-value
+	// VirtualDentry, symlink target, nil).
+	//
+	// - If the inode is not a symlink, Getlink returns (zero-value
+	// VirtualDentry, "", EINVAL).
+	Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error)
 }
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 96a16e654..412cf6ac9 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -45,7 +45,10 @@ type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
 func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
-	v := vfs.New()
+	v := &vfs.VirtualFilesystem{}
+	if err := v.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
@@ -72,7 +75,7 @@ type file struct {
 func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
 	f := &file{}
 	f.content = content
-	f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777)
+	f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
 
 	d := &kernfs.Dentry{}
 	d.Init(f)
@@ -88,7 +91,7 @@ type attrs struct {
 	kernfs.InodeAttrs
 }
 
-func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error {
+func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
@@ -104,7 +107,7 @@ type readonlyDir struct {
 
 func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
 	dir := &readonlyDir{}
-	dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
 	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
 	dir.dentry.Init(dir)
 
@@ -113,9 +116,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 	return &dir.dentry
 }
 
-func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil {
+func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
@@ -134,7 +137,7 @@ type dir struct {
 func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
 	dir := &dir{}
 	dir.fs = fs
-	dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
 	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
 	dir.dentry.Init(dir)
 
@@ -143,9 +146,11 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	return &dir.dentry
 }
 
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
@@ -184,9 +189,13 @@ func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, err
 	return nil, syserror.EPERM
 }
 
-func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fsType) Name() string {
+	return "kernfs"
+}
+
+func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	fs := &filesystem{}
-	fs.Init(vfsObj)
+	fs.VFSFilesystem().Init(vfsObj, &fst, fs)
 	root := fst.rootFn(creds, fs)
 	return fs.VFSFilesystem(), root.VFSDentry(), nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 0ee7eb9b7..2ab3f53fd 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -18,6 +18,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // StaticSymlink provides an Inode implementation for symlinks that point to
@@ -33,9 +35,9 @@ type StaticSymlink struct {
 var _ Inode = (*StaticSymlink)(nil)
 
 // NewStaticSymlink creates a new symlink file pointing to 'target'.
-func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentry {
+func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry {
 	inode := &StaticSymlink{}
-	inode.Init(creds, ino, target)
+	inode.Init(creds, devMajor, devMinor, ino, target)
 
 	d := &Dentry{}
 	d.Init(inode)
@@ -43,12 +45,22 @@ func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentr
 }
 
 // Init initializes the instance.
-func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) {
+func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
 	s.target = target
-	s.InodeAttrs.Init(creds, ino, linux.ModeSymlink|0777)
+	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
 }
 
 // Readlink implements Inode.
 func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
 	return s.target, nil
 }
+
+// Getlink implements Inode.Getlink.
+func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	return vfs.VirtualDentry{}, s.target, nil
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD
new file mode 100644
index 000000000..5950a2d59
--- /dev/null
+++ b/pkg/sentry/fsimpl/pipefs/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "pipefs",
+    srcs = ["pipefs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
new file mode 100644
index 000000000..cab771211
--- /dev/null
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -0,0 +1,164 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipefs provides the filesystem implementation backing
+// Kernel.PipeMount.
+package pipefs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type filesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (filesystemType) Name() string {
+	return "pipefs"
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("pipefs.filesystemType.GetFilesystem should never be called")
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, err
+	}
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
+	return fs.Filesystem.VFSFilesystem(), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release()
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode)
+	b.PrependComponent(fmt.Sprintf("pipe:[%d]", inode.ino))
+	return vfs.PrependPathSyntheticError{}
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoopRefCount
+
+	pipe *pipe.VFSPipe
+
+	ino uint64
+	uid auth.KUID
+	gid auth.KGID
+	// We use the creation timestamp for all of atime, mtime, and ctime.
+	ctime ktime.Time
+}
+
+func newInode(ctx context.Context, fs *filesystem) *inode {
+	creds := auth.CredentialsFromContext(ctx)
+	return &inode{
+		pipe:  pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+		ino:   fs.Filesystem.NextIno(),
+		uid:   creds.EffectiveKUID,
+		gid:   creds.EffectiveKGID,
+		ctime: ktime.NowFromContext(ctx),
+	}
+}
+
+const pipeMode = 0600 | linux.S_IFIFO
+
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid)
+}
+
+// Mode implements kernfs.Inode.Mode.
+func (i *inode) Mode() linux.FileMode {
+	return pipeMode
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds())
+	return linux.Statx{
+		Mask:     linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+		Blksize:  usermem.PageSize,
+		Nlink:    1,
+		UID:      uint32(i.uid),
+		GID:      uint32(i.gid),
+		Mode:     pipeMode,
+		Ino:      i.ino,
+		Size:     0,
+		Blocks:   0,
+		Atime:    ts,
+		Ctime:    ts,
+		Mtime:    ts,
+		DevMajor: linux.UNNAMED_MAJOR,
+		DevMinor: vfsfs.Impl().(*filesystem).devMinor,
+	}, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return syserror.EPERM
+}
+
+// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement
+// statfs, from which we should indicate PIPEFS_MAGIC.
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags)
+}
+
+// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
+// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2).
+//
+// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
+func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
+	fs := mnt.Filesystem().Impl().(*filesystem)
+	inode := newInode(ctx, fs)
+	var d kernfs.Dentry
+	d.Init(inode)
+	defer d.DecRef()
+	return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
+}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 12aac2e6a..17c1342b5 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -8,18 +8,21 @@ go_library(
         "filesystem.go",
         "subtasks.go",
         "task.go",
+        "task_fds.go",
         "task_files.go",
+        "task_net.go",
         "tasks.go",
         "tasks_files.go",
-        "tasks_net.go",
         "tasks_sys.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/log",
+        "//pkg/refs",
         "//pkg/safemem",
-        "//pkg/sentry/fs",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
@@ -52,6 +55,7 @@ go_test(
         "//pkg/fspath",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 11477b6a9..609210253 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -26,15 +26,29 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
-// procFSType is the factory class for procfs.
+// Name is the default filesystem name.
+const Name = "proc"
+
+// FilesystemType is the factory class for procfs.
 //
 // +stateify savable
-type procFSType struct{}
+type FilesystemType struct{}
+
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+type filesystem struct {
+	kernfs.Filesystem
 
-var _ vfs.FilesystemType = (*procFSType)(nil)
+	devMinor uint32
+}
 
-// GetFilesystem implements vfs.FilesystemType.
-func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	k := kernel.KernelFromContext(ctx)
 	if k == nil {
 		return nil, nil, fmt.Errorf("procfs requires a kernel")
@@ -43,30 +57,42 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
 	if pidns == nil {
 		return nil, nil, fmt.Errorf("procfs requires a PID namespace")
 	}
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+	procfs := &filesystem{
+		devMinor: devMinor,
+	}
+	procfs.VFSFilesystem().Init(vfsObj, &ft, procfs)
 
-	procfs := &kernfs.Filesystem{}
-	procfs.VFSFilesystem().Init(vfsObj, procfs)
-
-	var data *InternalData
+	var cgroups map[string]string
 	if opts.InternalData != nil {
-		data = opts.InternalData.(*InternalData)
+		data := opts.InternalData.(*InternalData)
+		cgroups = data.Cgroups
 	}
 
-	_, dentry := newTasksInode(procfs, k, pidns, data.Cgroups)
+	_, dentry := procfs.newTasksInode(k, pidns, cgroups)
 	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
 }
 
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release()
+}
+
 // dynamicInode is an overfitted interface for common Inodes with
 // dynamicByteSource types used in procfs.
 type dynamicInode interface {
 	kernfs.Inode
 	vfs.DynamicBytesSource
 
-	Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
+	Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
 }
 
-func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
-	inode.Init(creds, ino, inode, perm)
+func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
 
 	d := &kernfs.Dentry{}
 	d.Init(inode)
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 102af0e93..36a911db4 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -34,24 +35,25 @@ type subtasksInode struct {
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeAttrs
 	kernfs.OrderedChildren
+	kernfs.AlwaysValid
 
+	fs                *filesystem
 	task              *kernel.Task
 	pidns             *kernel.PIDNamespace
-	inoGen            InoGenerator
 	cgroupControllers map[string]string
 }
 
 var _ kernfs.Inode = (*subtasksInode)(nil)
 
-func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator, cgroupControllers map[string]string) *kernfs.Dentry {
+func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry {
 	subInode := &subtasksInode{
+		fs:                fs,
 		task:              task,
 		pidns:             pidns,
-		inoGen:            inoGen,
 		cgroupControllers: cgroupControllers,
 	}
 	// Note: credentials are overridden by taskOwnedInode.
-	subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
+	subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
 	subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
 
 	inode := &taskOwnedInode{Inode: subInode, owner: task}
@@ -61,11 +63,6 @@ func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenera
 	return dentry
 }
 
-// Valid implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) Valid(ctx context.Context) bool {
-	return true
-}
-
 // Lookup implements kernfs.inodeDynamicLookup.
 func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
 	tid, err := strconv.ParseUint(name, 10, 32)
@@ -81,7 +78,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e
 		return nil, syserror.ENOENT
 	}
 
-	subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false, i.cgroupControllers)
+	subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
 	return subTaskDentry.VFSDentry(), nil
 }
 
@@ -91,6 +88,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 	if len(tasks) == 0 {
 		return offset, syserror.ENOENT
 	}
+	if relOffset >= int64(len(tasks)) {
+		return offset, nil
+	}
 
 	tids := make([]int, 0, len(tasks))
 	for _, tid := range tasks {
@@ -102,27 +102,79 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 		dirent := vfs.Dirent{
 			Name:    strconv.FormatUint(uint64(tid), 10),
 			Type:    linux.DT_DIR,
-			Ino:     i.inoGen.NextIno(),
+			Ino:     i.fs.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
 	return offset, nil
 }
 
+type subtasksFD struct {
+	kernfs.GenericDirectoryFD
+
+	task *kernel.Task
+}
+
+func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.IterDirents(ctx, cb)
+}
+
+// Seek implements vfs.FileDecriptionImpl.Seek.
+func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return 0, syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.Seek(ctx, offset, whence)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return linux.Statx{}, syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.Stat(ctx, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.SetStat(ctx, opts)
+}
+
 // Open implements kernfs.Inode.
-func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &subtasksFD{task: i.task}
+	if err := fd.Init(&i.OrderedChildren, &opts); err != nil {
+		return nil, err
+	}
+	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
 // Stat implements kernfs.Inode.
-func (i *subtasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
-	stat := i.InodeAttrs.Stat(vsfs)
-	stat.Nlink += uint32(i.task.ThreadGroup().Count())
-	return stat
+func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if opts.Mask&linux.STATX_NLINK != 0 {
+		stat.Nlink += uint32(i.task.ThreadGroup().Count())
+	}
+	return stat, nil
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
 }
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 2d814668a..482055db1 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -43,41 +43,45 @@ type taskInode struct {
 
 var _ kernfs.Inode = (*taskInode)(nil)
 
-func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
+func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
+	// TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited.
 	contents := map[string]*kernfs.Dentry{
-		"auxv":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}),
-		"cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
-		"comm":    newComm(task, inoGen.NextIno(), 0444),
-		"environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
-		//"exe":       newExe(t, msrc),
-		//"fd":        newFdDir(t, msrc),
-		//"fdinfo":    newFdInfoDir(t, msrc),
-		"gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}),
-		"io":      newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)),
-		"maps":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}),
-		//"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
-		//"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
-		"ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{
-			"net":  newNamespaceSymlink(task, inoGen.NextIno(), "net"),
-			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
-			"user": newNamespaceSymlink(task, inoGen.NextIno(), "user"),
+		"auxv":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}),
+		"cmdline":   fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
+		"comm":      fs.newComm(task, fs.NextIno(), 0444),
+		"environ":   fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
+		"exe":       fs.newExeSymlink(task, fs.NextIno()),
+		"fd":        fs.newFDDirInode(task),
+		"fdinfo":    fs.newFDInfoDirInode(task),
+		"gid_map":   fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
+		"io":        fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
+		"maps":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}),
+		"mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
+		"mounts":    fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}),
+		"net":       fs.newTaskNetDir(task),
+		"ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{
+			"net":  fs.newNamespaceSymlink(task, fs.NextIno(), "net"),
+			"pid":  fs.newNamespaceSymlink(task, fs.NextIno(), "pid"),
+			"user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"),
 		}),
-		"smaps":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
-		"stat":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
-		"status":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
-		"uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
+		"oom_score":     fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}),
+		"status":        fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
-		contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers)
+		contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers)
 	}
 	if len(cgroupControllers) > 0 {
-		contents["cgroup"] = newTaskOwnedFile(task, inoGen.NextIno(), 0444, newCgroupData(cgroupControllers))
+		contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
 	}
 
 	taskInode := &taskInode{task: task}
 	// Note: credentials are overridden by taskOwnedInode.
-	taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
+	taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
 
 	inode := &taskOwnedInode{Inode: taskInode, owner: task}
 	dentry := &kernfs.Dentry{}
@@ -98,19 +102,17 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 }
 
 // Open implements kernfs.Inode.
-func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
-// SetStat implements kernfs.Inode.
-func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
-	stat := opts.Stat
-	if stat.Mask&linux.STATX_MODE != 0 {
-		return syserror.EPERM
-	}
-	return nil
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
 }
 
 // taskOwnedInode implements kernfs.Inode and overrides inode owner with task
@@ -124,9 +126,9 @@ type taskOwnedInode struct {
 
 var _ kernfs.Inode = (*taskOwnedInode)(nil)
 
-func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
 	// Note: credentials are overridden by taskOwnedInode.
-	inode.Init(task.Credentials(), ino, inode, perm)
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
 
 	taskInode := &taskOwnedInode{Inode: inode, owner: task}
 	d := &kernfs.Dentry{}
@@ -134,11 +136,11 @@ func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode
 	return d
 }
 
-func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
 	dir := &kernfs.StaticDirectory{}
 
 	// Note: credentials are overridden by taskOwnedInode.
-	dir.Init(task.Credentials(), ino, perm)
+	dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
 
 	inode := &taskOwnedInode{Inode: dir, owner: task}
 	d := &kernfs.Dentry{}
@@ -152,26 +154,28 @@ func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, childre
 }
 
 // Stat implements kernfs.Inode.
-func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx {
-	stat := i.Inode.Stat(fs)
-	uid, gid := i.getOwner(linux.FileMode(stat.Mode))
-	stat.UID = uint32(uid)
-	stat.GID = uint32(gid)
-	return stat
+func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.Inode.Stat(fs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 {
+		uid, gid := i.getOwner(linux.FileMode(stat.Mode))
+		if opts.Mask&linux.STATX_UID != 0 {
+			stat.UID = uint32(uid)
+		}
+		if opts.Mask&linux.STATX_GID != 0 {
+			stat.GID = uint32(gid)
+		}
+	}
+	return stat, nil
 }
 
 // CheckPermissions implements kernfs.Inode.
 func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	mode := i.Mode()
 	uid, gid := i.getOwner(mode)
-	return vfs.GenericCheckPermissions(
-		creds,
-		ats,
-		mode.FileType() == linux.ModeDirectory,
-		uint16(mode),
-		uid,
-		gid,
-	)
+	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
 }
 
 func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
@@ -213,28 +217,12 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
 	return &ioData{ioUsage: t}
 }
 
-func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
-	// Namespace symlinks should contain the namespace name and the inode number
-	// for the namespace instance, so for example user:[123456]. We currently fake
-	// the inode number by sticking the symlink inode in its place.
-	target := fmt.Sprintf("%s:[%d]", ns, ino)
-
-	inode := &kernfs.StaticSymlink{}
-	// Note: credentials are overridden by taskOwnedInode.
-	inode.Init(task.Credentials(), ino, target)
-
-	taskInode := &taskOwnedInode{Inode: inode, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(taskInode)
-	return d
-}
-
 // newCgroupData creates inode that shows cgroup information.
 // From man 7 cgroups: "For each cgroup hierarchy of which the process is a
 // member, there is one entry containing three colon-separated fields:
 //   hierarchy-ID:controller-list:cgroup-path"
 func newCgroupData(controllers map[string]string) dynamicInode {
-	buf := bytes.Buffer{}
+	var buf bytes.Buffer
 
 	// The hierarchy ids must be positive integers (for cgroup v1), but the
 	// exact number does not matter, so long as they are unique. We can
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
new file mode 100644
index 000000000..44ccc9e4a
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -0,0 +1,306 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+	"sort"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) {
+	var (
+		file  *vfs.FileDescription
+		flags kernel.FDFlags
+	)
+	t.WithMuLocked(func(t *kernel.Task) {
+		if fdt := t.FDTable(); fdt != nil {
+			file, flags = fdt.GetVFS2(fd)
+		}
+	})
+	return file, flags
+}
+
+func taskFDExists(t *kernel.Task, fd int32) bool {
+	file, _ := getTaskFD(t, fd)
+	if file == nil {
+		return false
+	}
+	file.DecRef()
+	return true
+}
+
+type fdDir struct {
+	fs   *filesystem
+	task *kernel.Task
+
+	// When produceSymlinks is set, dirents produces for the FDs are reported
+	// as symlink. Otherwise, they are reported as regular files.
+	produceSymlink bool
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) {
+	var fds []int32
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		if fdTable := t.FDTable(); fdTable != nil {
+			fds = fdTable.GetFDs()
+		}
+	})
+
+	offset := absOffset + relOffset
+	typ := uint8(linux.DT_REG)
+	if i.produceSymlink {
+		typ = linux.DT_LNK
+	}
+
+	// Find the appropriate starting point.
+	idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) })
+	if idx >= len(fds) {
+		return offset, nil
+	}
+	for _, fd := range fds[idx:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(fd), 10),
+			Type:    typ,
+			Ino:     i.fs.NextIno(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return offset, nil
+}
+
+// fdDirInode represents the inode for /proc/[pid]/fd directory.
+//
+// +stateify savable
+type fdDirInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+	kernfs.AlwaysValid
+	fdDir
+}
+
+var _ kernfs.Inode = (*fdDirInode)(nil)
+
+func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
+	inode := &fdDirInode{
+		fdDir: fdDir{
+			fs:             fs,
+			task:           task,
+			produceSymlink: true,
+		},
+	}
+	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+
+	return dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	fdInt, err := strconv.ParseInt(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+	fd := int32(fdInt)
+	if !taskFDExists(i.task, fd) {
+		return nil, syserror.ENOENT
+	}
+	taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno())
+	return taskDentry.VFSDentry(), nil
+}
+
+// Open implements kernfs.Inode.
+func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// CheckPermissions implements kernfs.Inode.
+//
+// This is to match Linux, which uses a special permission handler to guarantee
+// that a process can still access /proc/self/fd after it has executed
+// setuid. See fs/proc/fd.c:proc_fd_permission.
+func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	err := i.InodeAttrs.CheckPermissions(ctx, creds, ats)
+	if err == nil {
+		// Access granted, no extra check needed.
+		return nil
+	}
+	if t := kernel.TaskFromContext(ctx); t != nil {
+		// Allow access if the task trying to access it is in the thread group
+		// corresponding to this directory.
+		if i.task.ThreadGroup() == t.ThreadGroup() {
+			// Access granted (overridden).
+			return nil
+		}
+	}
+	return err
+}
+
+// fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file.
+//
+// +stateify savable
+type fdSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	task *kernel.Task
+	fd   int32
+}
+
+var _ kernfs.Inode = (*fdSymlink)(nil)
+
+func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+	inode := &fdSymlink{
+		task: task,
+		fd:   fd,
+	}
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+	file, _ := getTaskFD(s.task, s.fd)
+	if file == nil {
+		return "", syserror.ENOENT
+	}
+	defer file.DecRef()
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef()
+	return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
+}
+
+func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	file, _ := getTaskFD(s.task, s.fd)
+	if file == nil {
+		return vfs.VirtualDentry{}, "", syserror.ENOENT
+	}
+	defer file.DecRef()
+	vd := file.VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
+// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
+//
+// +stateify savable
+type fdInfoDirInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+	kernfs.AlwaysValid
+	fdDir
+}
+
+var _ kernfs.Inode = (*fdInfoDirInode)(nil)
+
+func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
+	inode := &fdInfoDirInode{
+		fdDir: fdDir{
+			fs:   fs,
+			task: task,
+		},
+	}
+	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+
+	return dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	fdInt, err := strconv.ParseInt(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+	fd := int32(fdInt)
+	if !taskFDExists(i.task, fd) {
+		return nil, syserror.ENOENT
+	}
+	data := &fdInfoData{
+		task: i.task,
+		fd:   fd,
+	}
+	dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data)
+	return dentry.VFSDentry(), nil
+}
+
+// Open implements kernfs.Inode.
+func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
+//
+// +stateify savable
+type fdInfoData struct {
+	kernfs.DynamicBytesFile
+	refs.AtomicRefCount
+
+	task *kernel.Task
+	fd   int32
+}
+
+var _ dynamicInode = (*fdInfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	file, descriptorFlags := getTaskFD(d.task, d.fd)
+	if file == nil {
+		return syserror.ENOENT
+	}
+	defer file.DecRef()
+	// TODO(b/121266871): Include pos, locks, and other data. For now we only
+	// have flags.
+	// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
+	flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags()
+	fmt.Fprintf(buf, "flags:\t0%o\n", flags)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index efd3b3453..2f297e48a 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -63,6 +64,16 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
 	return m, nil
 }
 
+func checkTaskState(t *kernel.Task) error {
+	switch t.ExitState() {
+	case kernel.TaskExitZombie:
+		return syserror.EACCES
+	case kernel.TaskExitDead:
+		return syserror.ESRCH
+	}
+	return nil
+}
+
 type bufferWriter struct {
 	buf *bytes.Buffer
 }
@@ -100,17 +111,18 @@ func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	}
 	defer m.DecUsers(ctx)
 
-	// Space for buffer with AT_NULL (0) terminator at the end.
 	auxv := m.Auxv()
+	// Space for buffer with AT_NULL (0) terminator at the end.
 	buf.Grow((len(auxv) + 1) * 16)
 	for _, e := range auxv {
-		var tmp [8]byte
-		usermem.ByteOrder.PutUint64(tmp[:], e.Key)
-		buf.Write(tmp[:])
-
-		usermem.ByteOrder.PutUint64(tmp[:], uint64(e.Value))
+		var tmp [16]byte
+		usermem.ByteOrder.PutUint64(tmp[:8], e.Key)
+		usermem.ByteOrder.PutUint64(tmp[8:], uint64(e.Value))
 		buf.Write(tmp[:])
 	}
+	var atNull [16]byte
+	buf.Write(atNull[:])
+
 	return nil
 }
 
@@ -229,9 +241,9 @@ type commInode struct {
 	task *kernel.Task
 }
 
-func newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry {
 	inode := &commInode{task: task}
-	inode.DynamicBytesFile.Init(task.Credentials(), ino, &commData{task: task}, perm)
+	inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
 
 	d := &kernfs.Dentry{}
 	d.Init(inode)
@@ -496,7 +508,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
+// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider.
 type ioUsage interface {
 	// IOUsage returns the io usage data.
 	IOUsage() *usage.IO
@@ -525,3 +537,296 @@ func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
 	return nil
 }
+
+// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if o.task.ExitState() == kernel.TaskExitDead {
+		return syserror.ESRCH
+	}
+	fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj())
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if o.task.ExitState() == kernel.TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	if err := o.task.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
+
+// exeSymlink is an symlink for the /proc/[pid]/exe file.
+//
+// +stateify savable
+type exeSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	task *kernel.Task
+}
+
+var _ kernfs.Inode = (*exeSymlink)(nil)
+
+func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+	inode := &exeSymlink{task: task}
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+// Readlink implements kernfs.Inode.
+func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
+	if !kernel.ContextCanTrace(ctx, s.task, false) {
+		return "", syserror.EACCES
+	}
+
+	// Pull out the executable for /proc/[pid]/exe.
+	exec, err := s.executable()
+	if err != nil {
+		return "", err
+	}
+	defer exec.DecRef()
+
+	return exec.PathnameWithDeleted(ctx), nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	if !kernel.ContextCanTrace(ctx, s.task, false) {
+		return vfs.VirtualDentry{}, "", syserror.EACCES
+	}
+
+	exec, err := s.executable()
+	if err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	defer exec.DecRef()
+
+	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
+func (s *exeSymlink) executable() (file fsbridge.File, err error) {
+	if err := checkTaskState(s.task); err != nil {
+		return nil, err
+	}
+
+	s.task.WithMuLocked(func(t *kernel.Task) {
+		mm := t.MemoryManager()
+		if mm == nil {
+			err = syserror.EACCES
+			return
+		}
+
+		// The MemoryManager may be destroyed, in which case
+		// MemoryManager.destroy will simply set the executable to nil
+		// (with locks held).
+		file = mm.Executable()
+		if file == nil {
+			err = syserror.ESRCH
+		}
+	})
+	return
+}
+
+// mountInfoData is used to implement /proc/[pid]/mountinfo.
+//
+// +stateify savable
+type mountInfoData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mountInfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var fsctx *kernel.FSContext
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
+	})
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return nil
+	}
+	rootDir := fsctx.RootDirectoryVFS2()
+	if !rootDir.Ok() {
+		// Root has been destroyed. Don't try to read mounts.
+		return nil
+	}
+	defer rootDir.DecRef()
+	i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
+	return nil
+}
+
+// mountsData is used to implement /proc/[pid]/mounts.
+//
+// +stateify savable
+type mountsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mountsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var fsctx *kernel.FSContext
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
+	})
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return nil
+	}
+	rootDir := fsctx.RootDirectoryVFS2()
+	if !rootDir.Ok() {
+		// Root has been destroyed. Don't try to read mounts.
+		return nil
+	}
+	defer rootDir.DecRef()
+	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
+	return nil
+}
+
+type namespaceSymlink struct {
+	kernfs.StaticSymlink
+
+	task *kernel.Task
+}
+
+func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+	// Namespace symlinks should contain the namespace name and the inode number
+	// for the namespace instance, so for example user:[123456]. We currently fake
+	// the inode number by sticking the symlink inode in its place.
+	target := fmt.Sprintf("%s:[%d]", ns, ino)
+
+	inode := &namespaceSymlink{task: task}
+	// Note: credentials are overridden by taskOwnedInode.
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
+
+	taskInode := &taskOwnedInode{Inode: inode, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(taskInode)
+	return d
+}
+
+// Readlink implements Inode.
+func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+	if err := checkTaskState(s.task); err != nil {
+		return "", err
+	}
+	return s.StaticSymlink.Readlink(ctx)
+}
+
+// Getlink implements Inode.Getlink.
+func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	if err := checkTaskState(s.task); err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+
+	// Create a synthetic inode to represent the namespace.
+	dentry := &kernfs.Dentry{}
+	dentry.Init(&namespaceInode{})
+	vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
+	vd.IncRef()
+	dentry.DecRef()
+	return vd, "", nil
+}
+
+// namespaceInode is a synthetic inode created to represent a namespace in
+// /proc/[pid]/ns/*.
+type namespaceInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+}
+
+var _ kernfs.Inode = (*namespaceInode)(nil)
+
+// Init initializes a namespace inode.
+func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+	i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+}
+
+// Open implements Inode.Open.
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &namespaceFD{inode: i}
+	i.IncRef()
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// namespace FD is a synthetic file that represents a namespace in
+// /proc/[pid]/ns/*.
+type namespaceFD struct {
+	vfs.FileDescriptionDefaultImpl
+
+	vfsfd vfs.FileDescription
+	inode *namespaceInode
+}
+
+var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
+
+// Stat implements FileDescriptionImpl.
+func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.Stat(vfs, opts)
+}
+
+// SetStat implements FileDescriptionImpl.
+func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	creds := auth.CredentialsFromContext(ctx)
+	return fd.inode.SetStat(ctx, vfs, creds, opts)
+}
+
+// Release implements FileDescriptionImpl.
+func (fd *namespaceFD) Release() {
+	fd.inode.DecRef()
+}
+
+// OnClose implements FileDescriptionImpl.
+func (*namespaceFD) OnClose(context.Context) error {
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 608fec017..6bde27376 100644
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -24,7 +24,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -32,14 +31,19 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
+func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
+	k := task.Kernel()
+	pidns := task.PIDNamespace()
+	root := auth.NewRootCredentials(pidns.UserNamespace())
+
 	var contents map[string]*kernfs.Dentry
-	if stack := k.NetworkStack(); stack != nil {
+	if stack := task.NetworkNamespace().Stack(); stack != nil {
 		const (
 			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
 			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
@@ -50,38 +54,40 @@ func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k
 		)
 		psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond))
 
+		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
+		// network namespace.
 		contents = map[string]*kernfs.Dentry{
-			"dev":  newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}),
-			"snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}),
+			"dev":  fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}),
+			"snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}),
 
 			// The following files are simple stubs until they are implemented in
 			// netstack, if the file contains a header the stub is just the header
 			// otherwise it is an empty file.
-			"arp":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)),
-			"netlink":   newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)),
-			"netstat":   newDentry(root, inoGen.NextIno(), 0444, &netStatData{}),
-			"packet":    newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)),
-			"protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)),
+			"arp":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)),
+			"netlink":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)),
+			"netstat":   fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}),
+			"packet":    fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)),
+			"protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)),
 
 			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
 			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
-			"psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)),
-			"ptype":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)),
-			"route":  newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}),
-			"tcp":    newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}),
-			"udp":    newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}),
-			"unix":   newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}),
+			"psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)),
+			"ptype":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)),
+			"route":  fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}),
+			"tcp":    fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}),
+			"udp":    fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}),
+			"unix":   fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}),
 		}
 
 		if stack.SupportsIPv6() {
-			contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack})
-			contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(""))
-			contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k})
-			contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6))
+			contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack})
+			contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(""))
+			contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k})
+			contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6))
 		}
 	}
 
-	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents)
+	return fs.newTaskOwnedDir(task, fs.NextIno(), 0555, contents)
 }
 
 // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
@@ -200,22 +206,21 @@ var _ dynamicInode = (*netUnixData)(nil)
 func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
 	for _, se := range n.kernel.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
 			continue
 		}
-		sfile := s.(*fs.File)
-		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+		if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX {
 			s.DecRef()
 			// Not a unix socket.
 			continue
 		}
-		sops := sfile.FileOperations.(*unix.SocketOperations)
+		sops := s.Impl().(*unix.SocketVFS2)
 
 		addr, err := sops.Endpoint().GetLocalAddress()
 		if err != nil {
-			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
+			log.Warningf("Failed to retrieve socket name from %+v: %v", s, err)
 			addr.Addr = "<unknown>"
 		}
 
@@ -228,6 +233,15 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 			}
 		}
 
+		// Get inode number.
+		var ino uint64
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO})
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve ino for socket file: %v", statErr)
+		} else {
+			ino = stat.Ino
+		}
+
 		// In the socket entry below, the value for the 'Num' field requires
 		// some consideration. Linux prints the address to the struct
 		// unix_sock representing a socket in the kernel, but may redact the
@@ -246,14 +260,14 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// the definition of this struct changes over time.
 		//
 		// For now, we always redact this pointer.
-		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
+		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
 			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
+			s.Refs()-1,                    // RefCount, don't count our own ref.
 			0,                             // Protocol, always 0 for UDS.
 			sockFlags,                     // Flags.
 			sops.Endpoint().Type(),        // Type.
 			sops.State(),                  // State.
-			sfile.InodeID(),               // Inode.
+			ino,                           // Inode.
 		)
 
 		// Path
@@ -335,15 +349,14 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 	t := kernel.TaskFromContext(ctx)
 
 	for _, se := range k.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
 			continue
 		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
+		sops, ok := s.Impl().(socket.SocketVFS2)
 		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
 		}
 		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
 			s.DecRef()
@@ -392,14 +405,15 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 		// Unimplemented.
 		fmt.Fprintf(buf, "%08X ", 0)
 
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})
+
 		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+		if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
+			log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
 			fmt.Fprintf(buf, "%5d ", 0)
 		} else {
 			creds := auth.CredentialsFromContext(ctx)
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+			fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
 		}
 
 		// Field: timeout; number of unanswered 0-window probes.
@@ -407,11 +421,16 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 		fmt.Fprintf(buf, "%8d ", 0)
 
 		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
+			fmt.Fprintf(buf, "%8d ", 0)
+		} else {
+			fmt.Fprintf(buf, "%8d ", stat.Ino)
+		}
 
 		// Field: refcount. Don't count the ref we obtain while deferencing
 		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+		fmt.Fprintf(buf, "%d ", s.Refs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -492,16 +511,17 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	// degrade gracefully and retrieve what we can.
 	t := kernel.TaskFromContext(ctx)
 
+	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops             \n")
+
 	for _, se := range d.kernel.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
 			continue
 		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
+		sops, ok := s.Impl().(socket.SocketVFS2)
 		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
 		}
 		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
 			s.DecRef()
@@ -545,25 +565,31 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// Field: retrnsmt. Always 0 for UDP.
 		fmt.Fprintf(buf, "%08X ", 0)
 
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})
+
 		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+		if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
+			log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
 			fmt.Fprintf(buf, "%5d ", 0)
 		} else {
 			creds := auth.CredentialsFromContext(ctx)
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+			fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
 		}
 
 		// Field: timeout. Always 0 for UDP.
 		fmt.Fprintf(buf, "%8d ", 0)
 
 		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
+			fmt.Fprintf(buf, "%8d ", 0)
+		} else {
+			fmt.Fprintf(buf, "%8d ", stat.Ino)
+		}
 
 		// Field: ref; reference count on the socket inode. Don't count the ref
 		// we obtain while deferencing the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+		fmt.Fprintf(buf, "%d ", s.Refs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -664,9 +690,9 @@ func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		if line.prefix == "Tcp" {
 			tcp := stat.(*inet.StatSNMPTCP)
 			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
-			fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
+			fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
 		} else {
-			fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
+			fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
 		}
 	}
 	return nil
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index ebe21630c..b51d43954 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -33,11 +33,6 @@ const (
 	threadSelfName = "thread-self"
 )
 
-// InoGenerator generates unique inode numbers for a given filesystem.
-type InoGenerator interface {
-	NextIno() uint64
-}
-
 // tasksInode represents the inode for /proc/ directory.
 //
 // +stateify savable
@@ -46,9 +41,10 @@ type tasksInode struct {
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeAttrs
 	kernfs.OrderedChildren
+	kernfs.AlwaysValid
 
-	inoGen InoGenerator
-	pidns  *kernel.PIDNamespace
+	fs    *filesystem
+	pidns *kernel.PIDNamespace
 
 	// '/proc/self' and '/proc/thread-self' have custom directory offsets in
 	// Linux. So handle them outside of OrderedChildren.
@@ -63,29 +59,29 @@ type tasksInode struct {
 
 var _ kernfs.Inode = (*tasksInode)(nil)
 
-func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
+func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
 	root := auth.NewRootCredentials(pidns.UserNamespace())
 	contents := map[string]*kernfs.Dentry{
-		"cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))),
-		//"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}),
-		"loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}),
-		"sys":     newSysDir(root, inoGen, k),
-		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
-		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
-		"net":     newNetDir(root, inoGen, k),
-		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{}),
-		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
-		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
+		"cpuinfo":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
+		"filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}),
+		"loadavg":     fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}),
+		"sys":         fs.newSysDir(root, k),
+		"meminfo":     fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}),
+		"mounts":      kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
+		"net":         kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
+		"stat":        fs.newDentry(root, fs.NextIno(), 0444, &statData{}),
+		"uptime":      fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}),
+		"version":     fs.newDentry(root, fs.NextIno(), 0444, &versionData{}),
 	}
 
 	inode := &tasksInode{
 		pidns:             pidns,
-		inoGen:            inoGen,
-		selfSymlink:       newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
-		threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
+		fs:                fs,
+		selfSymlink:       fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
+		threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
 		cgroupControllers: cgroupControllers,
 	}
-	inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555)
+	inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
 
 	dentry := &kernfs.Dentry{}
 	dentry.Init(inode)
@@ -117,15 +113,10 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 		return nil, syserror.ENOENT
 	}
 
-	taskDentry := newTaskInode(i.inoGen, task, i.pidns, true, i.cgroupControllers)
+	taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
 	return taskDentry.VFSDentry(), nil
 }
 
-// Valid implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) Valid(ctx context.Context) bool {
-	return true
-}
-
 // IterDirents implements kernfs.inodeDynamicLookup.
 func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
 	// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
@@ -148,11 +139,11 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 		dirent := vfs.Dirent{
 			Name:    selfName,
 			Type:    linux.DT_LNK,
-			Ino:     i.inoGen.NextIno(),
+			Ino:     i.fs.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
@@ -160,11 +151,11 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 		dirent := vfs.Dirent{
 			Name:    threadSelfName,
 			Type:    linux.DT_LNK,
-			Ino:     i.inoGen.NextIno(),
+			Ino:     i.fs.NextIno(),
 			NextOff: offset + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
@@ -193,11 +184,11 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 		dirent := vfs.Dirent{
 			Name:    strconv.FormatUint(uint64(tid), 10),
 			Type:    linux.DT_DIR,
-			Ino:     i.inoGen.NextIno(),
+			Ino:     i.fs.NextIno(),
 			NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
 		}
-		if !cb.Handle(dirent) {
-			return offset, nil
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
 		}
 		offset++
 	}
@@ -205,23 +196,44 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 }
 
 // Open implements kernfs.Inode.
-func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
-func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
-	stat := i.InodeAttrs.Stat(vsfs)
+func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(vsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
 
-	// Add dynamic children to link count.
-	for _, tg := range i.pidns.ThreadGroups() {
-		if leader := tg.Leader(); leader != nil {
-			stat.Nlink++
+	if opts.Mask&linux.STATX_NLINK != 0 {
+		// Add dynamic children to link count.
+		for _, tg := range i.pidns.ThreadGroups() {
+			if leader := tg.Leader(); leader != nil {
+				stat.Nlink++
+			}
 		}
 	}
 
-	return stat
+	return stat, nil
+}
+
+// staticFileSetStat implements a special static file that allows inode
+// attributes to be set. This is to support /proc files that are readonly, but
+// allow attributes to be set.
+type staticFileSetStat struct {
+	dynamicBytesFileSetAttr
+	vfs.StaticData
+}
+
+var _ dynamicInode = (*staticFileSetStat)(nil)
+
+func newStaticFileSetStat(data string) *staticFileSetStat {
+	return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}}
 }
 
 func cpuInfoData(k *kernel.Kernel) string {
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 434998910..7d8983aa5 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -40,9 +41,9 @@ type selfSymlink struct {
 
 var _ kernfs.Inode = (*selfSymlink)(nil)
 
-func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
 	inode := &selfSymlink{pidns: pidns}
-	inode.Init(creds, ino, linux.ModeSymlink|perm)
+	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
 
 	d := &kernfs.Dentry{}
 	d.Init(inode)
@@ -62,6 +63,16 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
 	return strconv.FormatUint(uint64(tgid), 10), nil
 }
 
+func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx)
+	return vfs.VirtualDentry{}, target, err
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
 type threadSelfSymlink struct {
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
@@ -72,9 +83,9 @@ type threadSelfSymlink struct {
 
 var _ kernfs.Inode = (*threadSelfSymlink)(nil)
 
-func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
 	inode := &threadSelfSymlink{pidns: pidns}
-	inode.Init(creds, ino, linux.ModeSymlink|perm)
+	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
 
 	d := &kernfs.Dentry{}
 	d.Init(inode)
@@ -95,6 +106,28 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
 }
 
+func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx)
+	return vfs.VirtualDentry{}, target, err
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// dynamicBytesFileSetAttr implements a special file that allows inode
+// attributes to be set. This is to support /proc files that are readonly, but
+// allow attributes to be set.
+type dynamicBytesFileSetAttr struct {
+	kernfs.DynamicBytesFile
+}
+
+// SetStat implements Inode.SetStat.
+func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
+}
+
 // cpuStats contains the breakdown of CPU time for /proc/stat.
 type cpuStats struct {
 	// user is time spent in userspace tasks with non-positive niceness.
@@ -137,22 +170,20 @@ func (c cpuStats) String() string {
 //
 // +stateify savable
 type statData struct {
-	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*statData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	// TODO(b/37226836): We currently export only zero CPU stats. We could
 	// at least provide some aggregate stats.
 	var cpu cpuStats
 	fmt.Fprintf(buf, "cpu  %s\n", cpu)
 
-	for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ {
+	k := kernel.KernelFromContext(ctx)
+	for c, max := uint(0), k.ApplicationCores(); c < max; c++ {
 		fmt.Fprintf(buf, "cpu%d %s\n", c, cpu)
 	}
 
@@ -176,7 +207,7 @@ func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "ctxt 0\n")
 
 	// CLOCK_REALTIME timestamp from boot, in seconds.
-	fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds())
+	fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds())
 
 	// Total number of clones.
 	// TODO(b/37226836): Count this.
@@ -203,13 +234,13 @@ func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 //
 // +stateify savable
 type loadavgData struct {
-	kernfs.DynamicBytesFile
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*loadavgData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	// TODO(b/62345059): Include real data in fields.
 	// Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods.
 	// Column 4-5: currently running processes and the total number of processes.
@@ -222,17 +253,15 @@ func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 //
 // +stateify savable
 type meminfoData struct {
-	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*meminfoData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	mf := d.k.MemoryFile()
+func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	mf := k.MemoryFile()
 	mf.UpdateUsage()
 	snapshot, totalUsage := usage.MemoryAccounting.Copy()
 	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
@@ -243,12 +272,16 @@ func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	inactiveFile := file - activeFile
 
 	fmt.Fprintf(buf, "MemTotal:       %8d kB\n", totalSize/1024)
-	memFree := (totalSize - totalUsage) / 1024
+	memFree := totalSize - totalUsage
+	if memFree > totalSize {
+		// Underflow.
+		memFree = 0
+	}
 	// We use MemFree as MemAvailable because we don't swap.
 	// TODO(rahat): When reclaim is implemented the value of MemAvailable
 	// should change.
-	fmt.Fprintf(buf, "MemFree:        %8d kB\n", memFree)
-	fmt.Fprintf(buf, "MemAvailable:   %8d kB\n", memFree)
+	fmt.Fprintf(buf, "MemFree:        %8d kB\n", memFree/1024)
+	fmt.Fprintf(buf, "MemAvailable:   %8d kB\n", memFree/1024)
 	fmt.Fprintf(buf, "Buffers:               0 kB\n") // memory usage by block devices
 	fmt.Fprintf(buf, "Cached:         %8d kB\n", (file+snapshot.Tmpfs)/1024)
 	// Emulate a system with no swap, which disables inactivation of anon pages.
@@ -275,7 +308,7 @@ func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 //
 // +stateify savable
 type uptimeData struct {
-	kernfs.DynamicBytesFile
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*uptimeData)(nil)
@@ -294,17 +327,15 @@ func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 //
 // +stateify savable
 type versionData struct {
-	kernfs.DynamicBytesFile
-
-	// k is the owning Kernel.
-	k *kernel.Kernel
+	dynamicBytesFileSetAttr
 }
 
 var _ dynamicInode = (*versionData)(nil)
 
 // Generate implements vfs.DynamicBytesSource.Generate.
-func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	init := v.k.GlobalInit()
+func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	init := k.GlobalInit()
 	if init == nil {
 		// Attempted to read before the init Task is created. This can
 		// only occur during startup, which should never need to read
@@ -335,3 +366,19 @@ func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
 	return nil
 }
+
+// filesystemsData backs /proc/filesystems.
+//
+// +stateify savable
+type filesystemsData struct {
+	kernfs.DynamicBytesFile
+}
+
+var _ dynamicInode = (*filesystemsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	k.VFS().GenerateProcFilesystems(buf)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index c7ce74883..6dac2afa4 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -30,89 +30,89 @@ import (
 )
 
 // newSysDir returns the dentry corresponding to /proc/sys directory.
-func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
-	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-		"kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}),
-			"shmall":   newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMALL)),
-			"shmmax":   newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMAX)),
-			"shmmni":   newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)),
+func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
+	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+		"kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
+			"shmall":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
+			"shmmax":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
+			"shmmni":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
 		}),
-		"vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"mmap_min_addr":     newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}),
-			"overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")),
+		"vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"mmap_min_addr":     fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
+			"overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
 		}),
-		"net": newSysNetDir(root, inoGen, k),
+		"net": fs.newSysNetDir(root, k),
 	})
 }
 
 // newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
-func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
+func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
 
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		contents = map[string]*kernfs.Dentry{
-			"ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-				"tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}),
+			"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+				"tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
 
 				// The following files are simple stubs until they are implemented in
 				// netstack, most of these files are configuration related. We use the
 				// value closest to the actual netstack behavior or any empty file, all
 				// of these files will have mode 0444 (read-only for all users).
-				"ip_local_port_range":     newDentry(root, inoGen.NextIno(), 0444, newStaticFile("16000   65535")),
-				"ip_local_reserved_ports": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")),
-				"ipfrag_time":             newDentry(root, inoGen.NextIno(), 0444, newStaticFile("30")),
-				"ip_nonlocal_bind":        newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"ip_no_pmtu_disc":         newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
+				"ip_local_port_range":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000   65535")),
+				"ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
+				"ipfrag_time":             fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")),
+				"ip_nonlocal_bind":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"ip_no_pmtu_disc":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
 
 				// tcp_allowed_congestion_control tell the user what they are able to
 				// do as an unprivledged process so we leave it empty.
-				"tcp_allowed_congestion_control":   newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")),
-				"tcp_available_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")),
-				"tcp_congestion_control":           newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")),
+				"tcp_allowed_congestion_control":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
+				"tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
+				"tcp_congestion_control":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
 
 				// Many of the following stub files are features netstack doesn't
 				// support. The unsupported features return "0" to indicate they are
 				// disabled.
-				"tcp_base_mss":              newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1280")),
-				"tcp_dsack":                 newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_early_retrans":         newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fack":                  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fastopen":              newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fastopen_key":          newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")),
-				"tcp_invalid_ratelimit":     newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_intvl":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_probes":      newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_time":        newDentry(root, inoGen.NextIno(), 0444, newStaticFile("7200")),
-				"tcp_mtu_probing":           newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_no_metrics_save":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
-				"tcp_probe_interval":        newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_probe_threshold":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"tcp_retries1":              newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")),
-				"tcp_retries2":              newDentry(root, inoGen.NextIno(), 0444, newStaticFile("15")),
-				"tcp_rfc1337":               newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
-				"tcp_slow_start_after_idle": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
-				"tcp_synack_retries":        newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")),
-				"tcp_syn_retries":           newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")),
-				"tcp_timestamps":            newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")),
+				"tcp_base_mss":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")),
+				"tcp_dsack":                 fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_early_retrans":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fack":                  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fastopen":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fastopen_key":          fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
+				"tcp_invalid_ratelimit":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_intvl":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_probes":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_time":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")),
+				"tcp_mtu_probing":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_no_metrics_save":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"tcp_probe_interval":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_probe_threshold":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_retries1":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
+				"tcp_retries2":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")),
+				"tcp_rfc1337":               fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"tcp_synack_retries":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
+				"tcp_syn_retries":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
+				"tcp_timestamps":            fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
 			}),
-			"core": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-				"default_qdisc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("pfifo_fast")),
-				"message_burst": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("10")),
-				"message_cost":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")),
-				"optmem_max":    newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")),
-				"rmem_default":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
-				"rmem_max":      newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
-				"somaxconn":     newDentry(root, inoGen.NextIno(), 0444, newStaticFile("128")),
-				"wmem_default":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
-				"wmem_max":      newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")),
+			"core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+				"default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
+				"message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
+				"message_cost":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
+				"optmem_max":    fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"rmem_default":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+				"rmem_max":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+				"somaxconn":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")),
+				"wmem_default":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+				"wmem_max":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
 			}),
 		}
 	}
 
-	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
-		"net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents),
-	})
+	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
 }
 
 // mmapMinAddrData implements vfs.DynamicBytesSource for
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 6fc3524db..19abb5034 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -47,10 +48,11 @@ var (
 var (
 	tasksStaticFiles = map[string]testutil.DirentType{
 		"cpuinfo":     linux.DT_REG,
+		"filesystems": linux.DT_REG,
 		"loadavg":     linux.DT_REG,
 		"meminfo":     linux.DT_REG,
 		"mounts":      linux.DT_LNK,
-		"net":         linux.DT_DIR,
+		"net":         linux.DT_LNK,
 		"self":        linux.DT_LNK,
 		"stat":        linux.DT_REG,
 		"sys":         linux.DT_DIR,
@@ -63,21 +65,29 @@ var (
 		"thread-self": threadSelfLink.NextOff,
 	}
 	taskStaticFiles = map[string]testutil.DirentType{
-		"auxv":    linux.DT_REG,
-		"cgroup":  linux.DT_REG,
-		"cmdline": linux.DT_REG,
-		"comm":    linux.DT_REG,
-		"environ": linux.DT_REG,
-		"gid_map": linux.DT_REG,
-		"io":      linux.DT_REG,
-		"maps":    linux.DT_REG,
-		"ns":      linux.DT_DIR,
-		"smaps":   linux.DT_REG,
-		"stat":    linux.DT_REG,
-		"statm":   linux.DT_REG,
-		"status":  linux.DT_REG,
-		"task":    linux.DT_DIR,
-		"uid_map": linux.DT_REG,
+		"auxv":          linux.DT_REG,
+		"cgroup":        linux.DT_REG,
+		"cmdline":       linux.DT_REG,
+		"comm":          linux.DT_REG,
+		"environ":       linux.DT_REG,
+		"exe":           linux.DT_LNK,
+		"fd":            linux.DT_DIR,
+		"fdinfo":        linux.DT_DIR,
+		"gid_map":       linux.DT_REG,
+		"io":            linux.DT_REG,
+		"maps":          linux.DT_REG,
+		"mountinfo":     linux.DT_REG,
+		"mounts":        linux.DT_REG,
+		"net":           linux.DT_DIR,
+		"ns":            linux.DT_DIR,
+		"oom_score":     linux.DT_REG,
+		"oom_score_adj": linux.DT_REG,
+		"smaps":         linux.DT_REG,
+		"stat":          linux.DT_REG,
+		"statm":         linux.DT_REG,
+		"status":        linux.DT_REG,
+		"task":          linux.DT_DIR,
+		"uid_map":       linux.DT_REG,
 	}
 )
 
@@ -90,30 +100,49 @@ func setup(t *testing.T) *testutil.System {
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
 
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	fsOpts := vfs.GetFilesystemOptions{
-		InternalData: &InternalData{
-			Cgroups: map[string]string{
-				"cpuset": "/foo/cpuset",
-				"memory": "/foo/memory",
+
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+	pop := &vfs.PathOperation{
+		Root:  mntns.Root(),
+		Start: mntns.Root(),
+		Path:  fspath.Parse("/proc"),
+	}
+	if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil {
+		t.Fatalf("MkDir(/proc): %v", err)
+	}
+
+	pop = &vfs.PathOperation{
+		Root:  mntns.Root(),
+		Start: mntns.Root(),
+		Path:  fspath.Parse("/proc"),
+	}
+	mntOpts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: &InternalData{
+				Cgroups: map[string]string{
+					"cpuset": "/foo/cpuset",
+					"memory": "/foo/memory",
+				},
 			},
 		},
 	}
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts)
-	if err != nil {
-		t.Fatalf("NewMountNamespace(): %v", err)
+	if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
+		t.Fatalf("MountAt(/proc): %v", err)
 	}
-	return testutil.NewSystem(ctx, t, vfsObj, mntns)
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
 }
 
 func TestTasksEmpty(t *testing.T) {
 	s := setup(t)
 	defer s.Destroy()
 
-	collector := s.ListDirents(s.PathOpAtRoot("/"))
+	collector := s.ListDirents(s.PathOpAtRoot("/proc"))
 	s.AssertAllDirentTypes(collector, tasksStaticFiles)
 	s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs)
 }
@@ -131,7 +160,7 @@ func TestTasks(t *testing.T) {
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
@@ -139,7 +168,7 @@ func TestTasks(t *testing.T) {
 		expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR
 	}
 
-	collector := s.ListDirents(s.PathOpAtRoot("/"))
+	collector := s.ListDirents(s.PathOpAtRoot("/proc"))
 	s.AssertAllDirentTypes(collector, expectedDirents)
 	s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs)
 
@@ -179,7 +208,7 @@ func TestTasks(t *testing.T) {
 	}
 
 	// Test lookup.
-	for _, path := range []string{"/1", "/2"} {
+	for _, path := range []string{"/proc/1", "/proc/2"} {
 		fd, err := s.VFS.OpenAt(
 			s.Ctx,
 			s.Creds,
@@ -189,6 +218,7 @@ func TestTasks(t *testing.T) {
 		if err != nil {
 			t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err)
 		}
+		defer fd.DecRef()
 		buf := make([]byte, 1)
 		bufIOSeq := usermem.BytesIOSequence(buf)
 		if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
@@ -199,10 +229,10 @@ func TestTasks(t *testing.T) {
 	if _, err := s.VFS.OpenAt(
 		s.Ctx,
 		s.Creds,
-		s.PathOpAtRoot("/9999"),
+		s.PathOpAtRoot("/proc/9999"),
 		&vfs.OpenOptions{},
 	); err != syserror.ENOENT {
-		t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err)
+		t.Fatalf("wrong error from vfsfs.OpenAt(/proc/9999): %v", err)
 	}
 }
 
@@ -213,7 +243,7 @@ func TestTasksOffset(t *testing.T) {
 	k := kernel.KernelFromContext(s.Ctx)
 	for i := 0; i < 3; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+		if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
 	}
@@ -300,12 +330,13 @@ func TestTasksOffset(t *testing.T) {
 			fd, err := s.VFS.OpenAt(
 				s.Ctx,
 				s.Creds,
-				s.PathOpAtRoot("/"),
+				s.PathOpAtRoot("/proc"),
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
 				t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
 			}
+			defer fd.DecRef()
 			if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil {
 				t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
 			}
@@ -337,12 +368,12 @@ func TestTask(t *testing.T) {
 
 	k := kernel.KernelFromContext(s.Ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	_, err := testutil.CreateTask(s.Ctx, "name", tc)
+	_, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
 
-	collector := s.ListDirents(s.PathOpAtRoot("/1"))
+	collector := s.ListDirents(s.PathOpAtRoot("/proc/1"))
 	s.AssertAllDirentTypes(collector, taskStaticFiles)
 }
 
@@ -352,7 +383,7 @@ func TestProcSelf(t *testing.T) {
 
 	k := kernel.KernelFromContext(s.Ctx)
 	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-	task, err := testutil.CreateTask(s.Ctx, "name", tc)
+	task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
 	if err != nil {
 		t.Fatalf("CreateTask(): %v", err)
 	}
@@ -360,14 +391,14 @@ func TestProcSelf(t *testing.T) {
 	collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{
 		Root:               s.Root,
 		Start:              s.Root,
-		Path:               fspath.Parse("/self/"),
+		Path:               fspath.Parse("/proc/self/"),
 		FollowFinalSymlink: true,
 	})
 	s.AssertAllDirentTypes(collector, taskStaticFiles)
 }
 
 func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) {
-	t.Logf("Iterating: /proc%s", fd.MappedName(ctx))
+	t.Logf("Iterating: %s", fd.MappedName(ctx))
 
 	var collector testutil.DirentCollector
 	if err := fd.IterDirents(ctx, &collector); err != nil {
@@ -384,35 +415,36 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F
 		if d.Name == "." || d.Name == ".." {
 			continue
 		}
-		childPath := path.Join(fd.MappedName(ctx), d.Name)
+		absPath := path.Join(fd.MappedName(ctx), d.Name)
 		if d.Type == linux.DT_LNK {
 			link, err := s.VFS.ReadlinkAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)},
+				&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)},
 			)
 			if err != nil {
-				t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err)
+				t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", absPath, err)
 			} else {
-				t.Logf("Skipping symlink: /proc%s => %s", childPath, link)
+				t.Logf("Skipping symlink: %s => %s", absPath, link)
 			}
 			continue
 		}
 
-		t.Logf("Opening: /proc%s", childPath)
+		t.Logf("Opening: %s", absPath)
 		child, err := s.VFS.OpenAt(
 			ctx,
 			auth.CredentialsFromContext(ctx),
-			&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)},
+			&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)},
 			&vfs.OpenOptions{},
 		)
 		if err != nil {
-			t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err)
+			t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err)
 			continue
 		}
+		defer child.DecRef()
 		stat, err := child.Stat(ctx, vfs.StatOptions{})
 		if err != nil {
-			t.Errorf("Stat(%v) failed: %v", childPath, err)
+			t.Errorf("Stat(%v) failed: %v", absPath, err)
 		}
 		if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type {
 			t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type)
@@ -430,13 +462,31 @@ func TestTree(t *testing.T) {
 	defer s.Destroy()
 
 	k := kernel.KernelFromContext(s.Ctx)
+
+	pop := &vfs.PathOperation{
+		Root:  s.Root,
+		Start: s.Root,
+		Path:  fspath.Parse("test-file"),
+	}
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_CREAT,
+		Mode:  0777,
+	}
+	file, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, opts)
+	if err != nil {
+		t.Fatalf("failed to create test file: %v", err)
+	}
+	defer file.DecRef()
+
 	var tasks []*kernel.Task
 	for i := 0; i < 5; i++ {
 		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
-		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
 		if err != nil {
 			t.Fatalf("CreateTask(): %v", err)
 		}
+		// Add file to populate /proc/[pid]/fd and fdinfo directories.
+		task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{})
 		tasks = append(tasks, task)
 	}
 
@@ -444,11 +494,12 @@ func TestTree(t *testing.T) {
 	fd, err := s.VFS.OpenAt(
 		ctx,
 		auth.CredentialsFromContext(s.Ctx),
-		&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/")},
+		&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc")},
 		&vfs.OpenOptions{},
 	)
 	if err != nil {
-		t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+		t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err)
 	}
 	iterateDir(ctx, t, s, fd)
+	fd.DecRef()
 }
diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD
new file mode 100644
index 000000000..067c1657f
--- /dev/null
+++ b/pkg/sentry/fsimpl/signalfd/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "signalfd",
+    srcs = ["signalfd.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/context",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
new file mode 100644
index 000000000..d29ef3f83
--- /dev/null
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -0,0 +1,135 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package signalfd
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// SignalFileDescription implements FileDescriptionImpl for signal fds.
+type SignalFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	// target is the original signal target task.
+	//
+	// The semantics here are a bit broken. Linux will always use current
+	// for all reads, regardless of where the signalfd originated. We can't
+	// do exactly that because we need to plumb the context through
+	// EventRegister in order to support proper blocking behavior. This
+	// will undoubtedly become very complicated quickly.
+	target *kernel.Task
+
+	// mu protects mask.
+	mu sync.Mutex
+
+	// mask is the signal mask. Protected by mu.
+	mask linux.SignalSet
+}
+
+var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil)
+
+// New creates a new signal fd.
+func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) {
+	vd := vfsObj.NewAnonVirtualDentry("[signalfd]")
+	defer vd.DecRef()
+	sfd := &SignalFileDescription{
+		target: target,
+		mask:   mask,
+	}
+	if err := sfd.vfsfd.Init(sfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &sfd.vfsfd, nil
+}
+
+// Mask returns the signal mask.
+func (sfd *SignalFileDescription) Mask() linux.SignalSet {
+	sfd.mu.Lock()
+	defer sfd.mu.Unlock()
+	return sfd.mask
+}
+
+// SetMask sets the signal mask.
+func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
+	sfd.mu.Lock()
+	defer sfd.mu.Unlock()
+	sfd.mask = mask
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	// Attempt to dequeue relevant signals.
+	info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
+	if err != nil {
+		// There must be no signal available.
+		return 0, syserror.ErrWouldBlock
+	}
+
+	// Copy out the signal info using the specified format.
+	var buf [128]byte
+	binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+		Signo:   uint32(info.Signo),
+		Errno:   info.Errno,
+		Code:    info.Code,
+		PID:     uint32(info.Pid()),
+		UID:     uint32(info.Uid()),
+		Status:  info.Status(),
+		Overrun: uint32(info.Overrun()),
+		Addr:    info.Addr(),
+	})
+	n, err := dst.CopyOut(ctx, buf[:])
+	return int64(n), err
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (sfd *SignalFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	sfd.mu.Lock()
+	defer sfd.mu.Unlock()
+	if mask&waiter.EventIn != 0 && sfd.target.PendingSignals()&sfd.mask != 0 {
+		return waiter.EventIn // Pending signals.
+	}
+	return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (sfd *SignalFileDescription) EventRegister(entry *waiter.Entry, _ waiter.EventMask) {
+	sfd.mu.Lock()
+	defer sfd.mu.Unlock()
+	// Register for the signal set; ignore the passed events.
+	sfd.target.SignalRegister(entry, waiter.EventMask(sfd.mask))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
+	// Unregister the original entry.
+	sfd.target.SignalUnregister(entry)
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (sfd *SignalFileDescription) Release() {}
diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD
new file mode 100644
index 000000000..9453277b8
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/BUILD
@@ -0,0 +1,18 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "sockfs",
+    srcs = ["sockfs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
new file mode 100644
index 000000000..ee0828a15
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -0,0 +1,109 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sockfs provides a filesystem implementation for anonymous sockets.
+package sockfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("sockfs.filesystemType.GetFilesystem should never be called")
+}
+
+// Name implements FilesystemType.Name.
+//
+// Note that registering sockfs is unnecessary, except for the fact that it
+// will not show up under /proc/filesystems as a result. This is a very minor
+// discrepancy from Linux.
+func (filesystemType) Name() string {
+	return "sockfs"
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// NewFilesystem sets up and returns a new sockfs filesystem.
+//
+// Note that there should only ever be one instance of sockfs.Filesystem,
+// backing a global socket mount.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, err
+	}
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
+	return fs.Filesystem.VFSFilesystem(), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release()
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode)
+	b.PrependComponent(fmt.Sprintf("socket:[%d]", inode.InodeAttrs.Ino()))
+	return vfs.PrependPathSyntheticError{}
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return nil, syserror.ENXIO
+}
+
+// NewDentry constructs and returns a sockfs dentry.
+//
+// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
+func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry {
+	fs := mnt.Filesystem().Impl().(*filesystem)
+
+	// File mode matches net/socket.c:sock_alloc.
+	filemode := linux.FileMode(linux.S_IFSOCK | 0600)
+	i := &inode{}
+	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
+
+	d := &kernfs.Dentry{}
+	d.Init(i)
+	return d.VFSDentry()
+}
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 66c0d8bc8..a741e2bb6 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "sys.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index d693fceae..0af373604 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -28,18 +28,35 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// Name is the default filesystem name.
+const Name = "sysfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
 type filesystem struct {
 	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
 }
 
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	fs := &filesystem{}
-	fs.Filesystem.Init(vfsObj)
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
 	k := kernel.KernelFromContext(ctx)
 	maxCPUCores := k.ApplicationCores()
 	defaultSysDirMode := linux.FileMode(0755)
@@ -69,6 +86,12 @@ func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
 	return fs.VFSFilesystem(), root.VFSDentry(), nil
 }
 
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release()
+}
+
 // dir implements kernfs.Inode.
 type dir struct {
 	kernfs.InodeAttrs
@@ -82,7 +105,7 @@ type dir struct {
 
 func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
 	d := &dir{}
-	d.InodeAttrs.Init(creds, fs.NextIno(), linux.ModeDirectory|0755)
+	d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
 	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
 	d.dentry.Init(d)
 
@@ -91,15 +114,17 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 	return &d.dentry
 }
 
-// SetStat implements kernfs.Inode.SetStat.
-func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
 // Open implements kernfs.Inode.Open.
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd := &kernfs.GenericDirectoryFD{}
-	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	if err != nil {
+		return nil, err
+	}
 	return fd.VFSFileDescription(), nil
 }
 
@@ -117,7 +142,7 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
 
 func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry {
 	c := &cpuFile{maxCores: maxCores}
-	c.DynamicBytesFile.Init(creds, fs.NextIno(), c, mode)
+	c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
 	d := &kernfs.Dentry{}
 	d.Init(c)
 	return d
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 8b1cf0bd0..4b3602d47 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -34,16 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System {
 	}
 	ctx := k.SupervisorContext()
 	creds := auth.CredentialsFromContext(ctx)
-	v := vfs.New()
-	v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+	k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
 
-	mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{})
+	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create new mount namespace: %v", err)
 	}
-	return testutil.NewSystem(ctx, t, v, mns)
+	return testutil.NewSystem(ctx, t, k.VFS(), mns)
 }
 
 func TestReadCPUFile(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index efd5974c4..0e4053a46 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -16,12 +16,14 @@ go_library(
         "//pkg/cpuid",
         "//pkg/fspath",
         "//pkg/memutil",
-        "//pkg/sentry/fs",
+        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/sched",
         "//pkg/sentry/limits",
         "//pkg/sentry/loader",
+        "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm",
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 89f8c4915..c16a36cdb 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -23,16 +23,20 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/memutil"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 
 	// Platforms are plugable.
 	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
@@ -99,40 +103,67 @@ func Boot() (*kernel.Kernel, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
-	ctx := k.SupervisorContext()
+	kernel.VFS2Enabled = true
 
-	// Create mount namespace without root as it's the minimum required to create
-	// the global thread group.
-	mntns, err := fs.NewMountNamespace(ctx, nil)
-	if err != nil {
-		return nil, err
+	if err := k.VFS().Init(); err != nil {
+		return nil, fmt.Errorf("VFS init: %v", err)
 	}
+	k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
 	ls, err := limits.NewLinuxLimitSet()
 	if err != nil {
 		return nil, err
 	}
-	tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+	tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
 	k.TestOnly_SetGlobalInit(tg)
 
 	return k, nil
 }
 
 // CreateTask creates a new bare bones task for tests.
-func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
+func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) {
 	k := kernel.KernelFromContext(ctx)
+	exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root)
+	if err != nil {
+		return nil, err
+	}
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
+	m.SetExecutable(fsbridge.NewVFSFile(exe))
+
 	config := &kernel.TaskConfig{
 		Kernel:                  k,
 		ThreadGroup:             tc,
-		TaskContext:             &kernel.TaskContext{Name: name},
+		TaskContext:             &kernel.TaskContext{Name: name, MemoryManager: m},
 		Credentials:             auth.CredentialsFromContext(ctx),
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
 		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
 		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
 		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		MountNamespaceVFS2:      mntns,
+		FSContext:               kernel.NewFSContextVFS2(root, cwd, 0022),
+		FDTable:                 k.NewFDTable(),
 	}
 	return k.TaskSet().NewTask(config)
 }
 
+func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) {
+	const name = "executable"
+	pop := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(name),
+	}
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_CREAT,
+		Mode:  0777,
+	}
+	return vfsObj.OpenAt(ctx, creds, pop, opts)
+}
+
 func createMemoryFile() (*pgalloc.MemoryFile, error) {
 	const memfileName = "test-memory"
 	memfd, err := memutil.CreateMemFD(memfileName, 0)
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 69fd84ddd..0556af877 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -41,12 +41,12 @@ type System struct {
 	Creds *auth.Credentials
 	VFS   *vfs.VirtualFilesystem
 	Root  vfs.VirtualDentry
-	mns   *vfs.MountNamespace
+	MntNs *vfs.MountNamespace
 }
 
 // NewSystem constructs a System.
 //
-// Precondition: Caller must hold a reference on mns, whose ownership
+// Precondition: Caller must hold a reference on MntNs, whose ownership
 // is transferred to the new System.
 func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
 	s := &System{
@@ -54,7 +54,7 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns
 		Ctx:   ctx,
 		Creds: auth.CredentialsFromContext(ctx),
 		VFS:   v,
-		mns:   mns,
+		MntNs: mns,
 		Root:  mns.Root(),
 	}
 	return s
@@ -75,7 +75,7 @@ func (s *System) WithSubtest(t *testing.T) *System {
 		Ctx:   s.Ctx,
 		Creds: s.Creds,
 		VFS:   s.VFS,
-		mns:   s.mns,
+		MntNs: s.MntNs,
 		Root:  s.Root,
 	}
 }
@@ -90,7 +90,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
 		Ctx:   ctx,
 		Creds: s.Creds,
 		VFS:   s.VFS,
-		mns:   s.mns,
+		MntNs: s.MntNs,
 		Root:  s.Root,
 	}
 }
@@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
 // Destroy release resources associated with a test system.
 func (s *System) Destroy() {
 	s.Root.DecRef()
-	s.mns.DecRef() // Reference on mns passed to NewSystem.
+	s.MntNs.DecRef() // Reference on MntNs passed to NewSystem.
 }
 
 // ReadToEnd reads the contents of fd until EOF to a string.
@@ -162,6 +162,9 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector {
 // exactly the specified set of expected entries. AssertAllDirentTypes respects
 // collector.skipDots, and implicitly checks for "." and ".." accordingly.
 func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) {
+	if expected == nil {
+		expected = make(map[string]DirentType)
+	}
 	// Also implicitly check for "." and "..", if enabled.
 	if !collector.skipDots {
 		expected["."] = linux.DT_DIR
@@ -226,7 +229,7 @@ func (d *DirentCollector) SkipDotsChecks(value bool) {
 }
 
 // Handle implements vfs.IterDirentsCallback.Handle.
-func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
+func (d *DirentCollector) Handle(dirent vfs.Dirent) error {
 	d.mu.Lock()
 	if d.dirents == nil {
 		d.dirents = make(map[string]*vfs.Dirent)
@@ -234,7 +237,7 @@ func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
 	d.order = append(d.order, &dirent)
 	d.dirents[dirent.Name] = &dirent
 	d.mu.Unlock()
-	return true
+	return nil
 }
 
 // Count returns the number of dirents currently in the collector.
diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD
new file mode 100644
index 000000000..fbb02a271
--- /dev/null
+++ b/pkg/sentry/fsimpl/timerfd/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "timerfd",
+    srcs = ["timerfd.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/context",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
new file mode 100644
index 000000000..60c92d626
--- /dev/null
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -0,0 +1,143 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package timerfd implements timer fds.
+package timerfd
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/context"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// implements ktime.TimerListener.
+type TimerFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	events waiter.Queue
+	timer  *ktime.Timer
+
+	// val is the number of timer expirations since the last successful
+	// call to PRead, or SetTime. val must be accessed using atomic memory
+	// operations.
+	val uint64
+}
+
+var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil)
+var _ ktime.TimerListener = (*TimerFileDescription)(nil)
+
+// New returns a new timer fd.
+func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) {
+	vd := vfsObj.NewAnonVirtualDentry("[timerfd]")
+	defer vd.DecRef()
+	tfd := &TimerFileDescription{}
+	tfd.timer = ktime.NewTimer(clock, tfd)
+	if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &tfd.vfsfd, nil
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	const sizeofUint64 = 8
+	if dst.NumBytes() < sizeofUint64 {
+		return 0, syserror.EINVAL
+	}
+	if val := atomic.SwapUint64(&tfd.val, 0); val != 0 {
+		var buf [sizeofUint64]byte
+		usermem.ByteOrder.PutUint64(buf[:], val)
+		if _, err := dst.CopyOut(ctx, buf[:]); err != nil {
+			// Linux does not undo consuming the number of
+			// expirations even if writing to userspace fails.
+			return 0, err
+		}
+		return sizeofUint64, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+// Clock returns the timer fd's Clock.
+func (tfd *TimerFileDescription) Clock() ktime.Clock {
+	return tfd.timer.Clock()
+}
+
+// GetTime returns the associated Timer's setting and the time at which it was
+// observed.
+func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) {
+	return tfd.timer.Get()
+}
+
+// SetTime atomically changes the associated Timer's setting, resets the number
+// of expirations to 0, and returns the previous setting and the time at which
+// it was observed.
+func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) {
+	return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) })
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	var ready waiter.EventMask
+	if atomic.LoadUint64(&tfd.val) != 0 {
+		ready |= waiter.EventIn
+	}
+	return ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	tfd.events.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) {
+	tfd.events.EventUnregister(e)
+}
+
+// PauseTimer pauses the associated Timer.
+func (tfd *TimerFileDescription) PauseTimer() {
+	tfd.timer.Pause()
+}
+
+// ResumeTimer resumes the associated Timer.
+func (tfd *TimerFileDescription) ResumeTimer() {
+	tfd.timer.Resume()
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (tfd *TimerFileDescription) Release() {
+	tfd.timer.Destroy()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
+	atomic.AddUint64(&tfd.val, exp)
+	tfd.events.Notify(waiter.EventIn)
+	return ktime.Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (tfd *TimerFileDescription) Destroy() {}
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 57abd5583..062321cbc 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "tmpfs",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
 go_library(
     name = "tmpfs",
     srcs = [
@@ -22,8 +33,10 @@ go_library(
         "device_file.go",
         "directory.go",
         "filesystem.go",
+        "fstree.go",
         "named_pipe.go",
         "regular_file.go",
+        "socket_file.go",
         "symlink.go",
         "tmpfs.go",
     ],
@@ -39,16 +52,18 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/lock",
-        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
         "//pkg/sentry/vfs/lock",
+        "//pkg/sentry/vfs/memxattr",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
@@ -81,6 +96,7 @@ go_test(
         "pipe_test.go",
         "regular_file_test.go",
         "stat_test.go",
+        "tmpfs_test.go",
     ],
     library = ":tmpfs",
     deps = [
@@ -90,7 +106,6 @@ go_test(
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/kernel/contexttest",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 9fce5e4b4..2fb5c4d84 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -168,14 +168,17 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
 	}
 }
 
-func BenchmarkVFS2MemfsStat(b *testing.B) {
+func BenchmarkVFS2TmpfsStat(b *testing.B) {
 	for _, depth := range depths {
 		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
 			ctx := contexttest.Context(b)
 			creds := auth.CredentialsFromContext(ctx)
 
 			// Create VFS.
-			vfsObj := vfs.New()
+			vfsObj := vfs.VirtualFilesystem{}
+			if err := vfsObj.Init(); err != nil {
+				b.Fatalf("VFS init: %v", err)
+			}
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
@@ -359,14 +362,17 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 	}
 }
 
-func BenchmarkVFS2MemfsMountStat(b *testing.B) {
+func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 	for _, depth := range depths {
 		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
 			ctx := contexttest.Context(b)
 			creds := auth.CredentialsFromContext(ctx)
 
 			// Create VFS.
-			vfsObj := vfs.New()
+			vfsObj := vfs.VirtualFilesystem{}
+			if err := vfsObj.Init(); err != nil {
+				b.Fatalf("VFS init: %v", err)
+			}
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
@@ -432,13 +438,6 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 				filePathBuilder.WriteByte('/')
 			}
 
-			// Verify that we didn't create any directories under the mount
-			// point (i.e. they were all created on the submount).
-			firstDirName := fmt.Sprintf("%d", depth)
-			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
-				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
-			}
-
 			// Create the file that will be stat'd.
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index 84b181b90..ac54d420d 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -15,6 +15,8 @@
 package tmpfs
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -27,13 +29,21 @@ type deviceFile struct {
 	minor uint32
 }
 
-func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
+func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
 	file := &deviceFile{
 		kind:  kind,
 		major: major,
 		minor: minor,
 	}
-	file.inode.init(file, fs, creds, mode)
+	switch kind {
+	case vfs.BlockDevice:
+		mode |= linux.S_IFBLK
+	case vfs.CharDevice:
+		mode |= linux.S_IFCHR
+	default:
+		panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
+	}
+	file.inode.init(file, fs, kuid, kgid, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index dc0d27cf9..913b8a6c5 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -15,40 +15,78 @@
 package tmpfs
 
 import (
+	"sync/atomic"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 type directory struct {
-	inode inode
+	// Since directories can't be hard-linked, each directory can only be
+	// associated with a single dentry, which we can store in the directory
+	// struct.
+	dentry dentry
+	inode  inode
+
+	// childMap maps the names of the directory's children to their dentries.
+	// childMap is protected by filesystem.mu.
+	childMap map[string]*dentry
 
-	// childList is a list containing (1) child Dentries and (2) fake Dentries
+	// numChildren is len(childMap), but accessed using atomic memory
+	// operations to avoid locking in inode.statTo().
+	numChildren int64
+
+	// childList is a list containing (1) child dentries and (2) fake dentries
 	// (with inode == nil) that represent the iteration position of
 	// directoryFDs. childList is used to support directoryFD.IterDirents()
-	// efficiently. childList is protected by filesystem.mu.
+	// efficiently. childList is protected by iterMu.
+	iterMu    sync.Mutex
 	childList dentryList
 }
 
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory {
 	dir := &directory{}
-	dir.inode.init(dir, fs, creds, mode)
+	dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
-	return &dir.inode
+	dir.dentry.inode = &dir.inode
+	dir.dentry.vfsd.Init(&dir.dentry)
+	return dir
 }
 
-func (i *inode) isDir() bool {
-	_, ok := i.impl.(*directory)
-	return ok
+// Preconditions: filesystem.mu must be locked for writing. dir must not
+// already contain a child with the given name.
+func (dir *directory) insertChildLocked(child *dentry, name string) {
+	child.parent = &dir.dentry
+	child.name = name
+	if dir.childMap == nil {
+		dir.childMap = make(map[string]*dentry)
+	}
+	dir.childMap[name] = child
+	atomic.AddInt64(&dir.numChildren, 1)
+	dir.iterMu.Lock()
+	dir.childList.PushBack(child)
+	dir.iterMu.Unlock()
+}
+
+// Preconditions: filesystem.mu must be locked for writing.
+func (dir *directory) removeChildLocked(child *dentry) {
+	delete(dir.childMap, child.name)
+	atomic.AddInt64(&dir.numChildren, -1)
+	dir.iterMu.Lock()
+	dir.childList.Remove(child)
+	dir.iterMu.Unlock()
+	child.unlinked = true
 }
 
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
 
-	// Protected by filesystem.mu.
+	// Protected by directory.iterMu.
 	iter *dentry
 	off  int64
 }
@@ -56,11 +94,10 @@ type directoryFD struct {
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *directoryFD) Release() {
 	if fd.iter != nil {
-		fs := fd.filesystem()
 		dir := fd.inode().impl.(*directory)
-		fs.mu.Lock()
+		dir.iterMu.Lock()
 		dir.childList.Remove(fd.iter)
-		fs.mu.Unlock()
+		dir.iterMu.Unlock()
 		fd.iter = nil
 	}
 }
@@ -68,36 +105,42 @@ func (fd *directoryFD) Release() {
 // IterDirents implements vfs.FileDescriptionImpl.IterDirents.
 func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
 	fs := fd.filesystem()
-	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+	dir := fd.inode().impl.(*directory)
 
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	// fs.mu is required to read d.parent and dentry.name.
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	dir.iterMu.Lock()
+	defer dir.iterMu.Unlock()
+
+	fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	fd.inode().touchAtime(fd.vfsfd.Mount())
 
 	if fd.off == 0 {
-		if !cb.Handle(vfs.Dirent{
+		if err := cb.Handle(vfs.Dirent{
 			Name:    ".",
 			Type:    linux.DT_DIR,
-			Ino:     vfsd.Impl().(*dentry).inode.ino,
+			Ino:     dir.inode.ino,
 			NextOff: 1,
-		}) {
-			return nil
+		}); err != nil {
+			return err
 		}
 		fd.off++
 	}
+
 	if fd.off == 1 {
-		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
-		if !cb.Handle(vfs.Dirent{
+		parentInode := genericParentOrSelf(&dir.dentry).inode
+		if err := cb.Handle(vfs.Dirent{
 			Name:    "..",
 			Type:    parentInode.direntType(),
 			Ino:     parentInode.ino,
 			NextOff: 2,
-		}) {
-			return nil
+		}); err != nil {
+			return err
 		}
 		fd.off++
 	}
 
-	dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
 	var child *dentry
 	if fd.iter == nil {
 		// Start iteration at the beginning of dir.
@@ -111,14 +154,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	for child != nil {
 		// Skip other directoryFD iterators.
 		if child.inode != nil {
-			if !cb.Handle(vfs.Dirent{
-				Name:    child.vfsd.Name(),
+			if err := cb.Handle(vfs.Dirent{
+				Name:    child.name,
 				Type:    child.inode.direntType(),
 				Ino:     child.inode.ino,
 				NextOff: fd.off + 1,
-			}) {
+			}); err != nil {
 				dir.childList.InsertBefore(child, fd.iter)
-				return nil
+				return err
 			}
 			fd.off++
 		}
@@ -130,9 +173,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fs := fd.filesystem()
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	dir := fd.inode().impl.(*directory)
+	dir.iterMu.Lock()
+	defer dir.iterMu.Unlock()
 
 	switch whence {
 	case linux.SEEK_SET:
@@ -160,8 +203,6 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 		remChildren = offset - 2
 	}
 
-	dir := fd.inode().impl.(*directory)
-
 	// Ensure that fd.iter exists and is not linked into dir.childList.
 	if fd.iter == nil {
 		fd.iter = &dentry{}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 8785452b6..e801680e8 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -21,8 +21,11 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Sync implements vfs.FilesystemImpl.Sync.
@@ -38,33 +41,52 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 //
 // Preconditions: filesystem.mu must be locked. !rp.Done().
 func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
-	if !d.inode.isDir() {
+	dir, ok := d.inode.impl.(*directory)
+	if !ok {
 		return nil, syserror.ENOTDIR
 	}
-	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 afterSymlink:
-	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
-	if err != nil {
-		return nil, err
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
 	}
-	if nextVFSD == nil {
-		// Since the Dentry tree is the sole source of truth for tmpfs, if it's
-		// not in the Dentry tree, it doesn't exist.
+	child, ok := dir.childMap[name]
+	if !ok {
 		return nil, syserror.ENOENT
 	}
-	next := nextVFSD.Impl().(*dentry)
-	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO(gvisor.dev/issues/1197): Symlink traversals updates
-		// access time.
+	if err := rp.CheckMount(&child.vfsd); err != nil {
+		return nil, err
+	}
+	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// Symlink traversal updates access time.
+		atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds())
 		if err := rp.HandleSymlink(symlink.target); err != nil {
 			return nil, err
 		}
 		goto afterSymlink // don't check the current directory again
 	}
 	rp.Advance()
-	return next, nil
+	return child, nil
 }
 
 // walkParentDirLocked resolves all but the last path component of rp to an
@@ -76,7 +98,7 @@ afterSymlink:
 // fs/namei.c:path_parentat().
 //
 // Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
 	for !rp.Final() {
 		next, err := stepLocked(rp, d)
 		if err != nil {
@@ -84,10 +106,11 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
 		}
 		d = next
 	}
-	if !d.inode.isDir() {
+	dir, ok := d.inode.impl.(*directory)
+	if !ok {
 		return nil, syserror.ENOTDIR
 	}
-	return d, nil
+	return dir, nil
 }
 
 // resolveLocked resolves rp to an existing file.
@@ -118,33 +141,32 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
 //
 // Preconditions: !rp.Done(). For the final path component in rp,
 // !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
 	if name == "." || name == ".." {
 		return syserror.EEXIST
 	}
-	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
-	// because if the child exists we want to return EEXIST immediately instead
-	// of attempting symlink/mount traversal.
-	if parent.vfsd.Child(name) != nil {
+	if len(name) > linux.NAME_MAX {
+		return syserror.ENAMETOOLONG
+	}
+	if _, ok := parentDir.childMap[name]; ok {
 		return syserror.EEXIST
 	}
 	if !dir && rp.MustBeDir() {
 		return syserror.ENOENT
 	}
-	// In memfs, the only way to cause a dentry to be disowned is by removing
-	// it from the filesystem, so this check is equivalent to checking if
-	// parent has been removed.
-	if parent.vfsd.IsDisowned() {
+	// tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only
+	// be dead if it was deleted.
+	if parentDir.dentry.vfsd.IsDead() {
 		return syserror.ENOENT
 	}
 	mnt := rp.Mount()
@@ -152,7 +174,28 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 		return err
 	}
 	defer mnt.EndWrite()
-	return create(parent, name)
+	if err := create(parentDir, name); err != nil {
+		return err
+	}
+
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent)
+	parentDir.inode.touchCMtime()
+	return nil
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	return d.inode.checkPermissions(creds, ats)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
@@ -167,7 +210,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 		if !d.inode.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
@@ -179,17 +222,17 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return nil, err
 	}
-	d.IncRef()
-	return &d.vfsd, nil
+	dir.dentry.IncRef()
+	return &dir.dentry.vfsd, nil
 }
 
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
 func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
 		if rp.Mount() != vd.Mount() {
 			return syserror.EXDEV
 		}
@@ -204,49 +247,47 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 			return syserror.EMLINK
 		}
 		d.inode.incLinksLocked()
-		child := fs.newDentry(d.inode)
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+		d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent)
+		parentDir.insertChildLocked(fs.newDentry(d.inode), name)
 		return nil
 	})
 }
 
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
-		if parent.inode.nlink == maxLinks {
+	return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
+		if parentDir.inode.nlink == maxLinks {
 			return syserror.EMLINK
 		}
-		parent.inode.incLinksLocked() // from child's ".."
-		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+		parentDir.inode.incLinksLocked() // from child's ".."
+		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
+		parentDir.insertChildLocked(&childDir.dentry, name)
 		return nil
 	})
 }
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
 		var childInode *inode
 		switch opts.Mode.FileType() {
 		case 0, linux.S_IFREG:
-			childInode = fs.newRegularFile(rp.Credentials(), opts.Mode)
+			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFIFO:
-			childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode)
+			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFBLK:
-			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFCHR:
-			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFSOCK:
-			// Not yet supported.
-			return syserror.EPERM
+			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint)
 		default:
 			return syserror.EINVAL
 		}
 		child := fs.newDentry(childInode)
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+		parentDir.insertChildLocked(child, name)
 		return nil
 	})
 }
@@ -285,12 +326,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		return start.open(ctx, rp, &opts, false /* afterCreate */)
 	}
 afterTrailingSymlink:
-	parent, err := walkParentDirLocked(rp, start)
+	parentDir, err := walkParentDirLocked(rp, start)
 	if err != nil {
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
@@ -301,11 +342,14 @@ afterTrailingSymlink:
 	if name == "." || name == ".." {
 		return nil, syserror.EISDIR
 	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
 	// Determine whether or not we need to create a file.
-	child, err := stepLocked(rp, parent)
-	if err == syserror.ENOENT {
+	child, ok := parentDir.childMap[name]
+	if !ok {
 		// Already checked for searchability above; now check for writability.
-		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+		if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -313,17 +357,29 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return child.open(ctx, rp, &opts, true)
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
+		parentDir.insertChildLocked(child, name)
+		fd, err := child.open(ctx, rp, &opts, true)
+		if err != nil {
+			return nil, err
+		}
+		parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent)
+		parentDir.inode.touchCMtime()
+		return fd, nil
 	}
-	if err != nil {
+	// Is the file mounted over?
+	if err := rp.CheckMount(&child.vfsd); err != nil {
 		return nil, err
 	}
 	// Do we need to resolve a trailing symlink?
-	if !rp.Done() {
-		start = parent
+	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// Symlink traversal updates access time.
+		atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds())
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, err
+		}
+		start = &parentDir.dentry
 		goto afterTrailingSymlink
 	}
 	// Open existing file.
@@ -336,7 +392,7 @@ afterTrailingSymlink:
 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(opts)
 	if !afterCreate {
-		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
 	}
@@ -347,10 +403,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 			return nil, err
 		}
 		if opts.Flags&linux.O_TRUNC != 0 {
-			impl.mu.Lock()
-			impl.data.Truncate(0, impl.memFile)
-			atomic.StoreUint64(&impl.size, 0)
-			impl.mu.Unlock()
+			if _, err := impl.truncate(0); err != nil {
+				return nil, err
+			}
 		}
 		return &fd.vfsfd, nil
 	case *directory:
@@ -367,9 +422,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags)
+		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags)
 	case *deviceFile:
 		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
+	case *socketFile:
+		return nil, syserror.ENXIO
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
 	}
@@ -387,6 +444,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 	if !ok {
 		return "", syserror.EINVAL
 	}
+	symlink.inode.touchAtime(rp.Mount())
 	return symlink.target, nil
 }
 
@@ -400,7 +458,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	// Resolve newParent first to verify that it's on this Mount.
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
@@ -417,25 +475,24 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 	defer mnt.EndWrite()
 
-	oldParent := oldParentVD.Dentry().Impl().(*dentry)
-	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
+	if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
-	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
-	// because if the existing child is a symlink or mount point then we want
-	// to rename over it rather than follow it.
-	renamedVFSD := oldParent.vfsd.Child(oldName)
-	if renamedVFSD == nil {
+	renamed, ok := oldParentDir.childMap[oldName]
+	if !ok {
 		return syserror.ENOENT
 	}
-	renamed := renamedVFSD.Impl().(*dentry)
+	// Note that we don't need to call rp.CheckMount(), since if renamed is a
+	// mount point then we want to rename the mount point, not anything in the
+	// mounted filesystem.
 	if renamed.inode.isDir() {
-		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+		if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
 			return syserror.EINVAL
 		}
-		if oldParent != newParent {
+		if oldParentDir != newParentDir {
 			// Writability is needed to change renamed's "..".
-			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 				return err
 			}
 		}
@@ -445,18 +502,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		}
 	}
 
-	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
-	replacedVFSD := newParent.vfsd.Child(newName)
-	var replaced *dentry
-	if replacedVFSD != nil {
-		replaced = replacedVFSD.Impl().(*dentry)
-		if replaced.inode.isDir() {
+	replaced, ok := newParentDir.childMap[newName]
+	if ok {
+		replacedDir, ok := replaced.inode.impl.(*directory)
+		if ok {
 			if !renamed.inode.isDir() {
 				return syserror.EISDIR
 			}
-			if replaced.vfsd.HasChildren() {
+			if len(replacedDir.childMap) != 0 {
 				return syserror.ENOTEMPTY
 			}
 		} else {
@@ -468,11 +524,13 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			}
 		}
 	} else {
-		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+		if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks {
 			return syserror.EMLINK
 		}
 	}
-	if newParent.vfsd.IsDisowned() {
+	// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
+	// only be dead if it was deleted.
+	if newParentDir.dentry.vfsd.IsDead() {
 		return syserror.ENOENT
 	}
 
@@ -480,31 +538,40 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	// simplicity, under the assumption that applications are not intentionally
 	// doing noop renames expecting them to succeed where non-noop renames
 	// would fail.
-	if renamedVFSD == replacedVFSD {
+	if renamed == replaced {
 		return nil
 	}
 	vfsObj := rp.VirtualFilesystem()
-	oldParentDir := oldParent.inode.impl.(*directory)
-	newParentDir := newParent.inode.impl.(*directory)
-	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	var replacedVFSD *vfs.Dentry
+	if replaced != nil {
+		replacedVFSD = &replaced.vfsd
+	}
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
 		return err
 	}
 	if replaced != nil {
-		newParentDir.childList.Remove(replaced)
+		newParentDir.removeChildLocked(replaced)
 		if replaced.inode.isDir() {
-			newParent.inode.decLinksLocked() // from replaced's ".."
+			newParentDir.inode.decLinksLocked() // from replaced's ".."
 		}
 		replaced.inode.decLinksLocked()
 	}
-	oldParentDir.childList.Remove(renamed)
-	newParentDir.childList.PushBack(renamed)
-	if renamed.inode.isDir() {
-		oldParent.inode.decLinksLocked()
-		newParent.inode.incLinksLocked()
+	oldParentDir.removeChildLocked(renamed)
+	newParentDir.insertChildLocked(renamed, newName)
+	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+	oldParentDir.inode.touchCMtime()
+	if oldParentDir != newParentDir {
+		if renamed.inode.isDir() {
+			oldParentDir.inode.decLinksLocked()
+			newParentDir.inode.incLinksLocked()
+		}
+		newParentDir.inode.touchCMtime()
 	}
-	// TODO(gvisor.dev/issues/1197): Update timestamps and parent directory
-	// sizes.
-	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
+	renamed.inode.touchCtime()
+
+	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
 	return nil
 }
 
@@ -512,11 +579,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
@@ -526,15 +593,15 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if name == ".." {
 		return syserror.ENOTEMPTY
 	}
-	childVFSD := parent.vfsd.Child(name)
-	if childVFSD == nil {
+	child, ok := parentDir.childMap[name]
+	if !ok {
 		return syserror.ENOENT
 	}
-	child := childVFSD.Impl().(*dentry)
-	if !child.inode.isDir() {
+	childDir, ok := child.inode.impl.(*directory)
+	if !ok {
 		return syserror.ENOTDIR
 	}
-	if childVFSD.HasChildren() {
+	if len(childDir.childMap) != 0 {
 		return syserror.ENOTEMPTY
 	}
 	mnt := rp.Mount()
@@ -543,13 +610,19 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	}
 	defer mnt.EndWrite()
 	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 		return err
 	}
-	parent.inode.impl.(*directory).childList.Remove(child)
-	parent.inode.decLinksLocked() // from child's ".."
+	parentDir.removeChildLocked(child)
+	parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent)
+	// Remove links for child, child/., and child/..
 	child.inode.decLinksLocked()
-	vfsObj.CommitDeleteDentry(childVFSD)
+	child.inode.decLinksLocked()
+	parentDir.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(&child.vfsd)
+	parentDir.inode.touchCMtime()
 	return nil
 }
 
@@ -561,7 +634,14 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 	if err != nil {
 		return err
 	}
-	return d.inode.setStat(opts.Stat)
+	if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
@@ -581,20 +661,27 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
+	if _, err := resolveLocked(rp); err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO(gvisor.dev/issues/1197): Actually implement statfs.
-	return linux.Statfs{}, syserror.ENOSYS
+	statfs := linux.Statfs{
+		Type:         linux.TMPFS_MAGIC,
+		BlockSize:    usermem.PageSize,
+		FragmentSize: usermem.PageSize,
+		NameLength:   linux.NAME_MAX,
+		// TODO(b/29637826): Allow configuring a tmpfs size and enforce it.
+		Blocks:     0,
+		BlocksFree: 0,
+	}
+	return statfs, nil
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
-		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
+	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target))
+		parentDir.insertChildLocked(child, name)
 		return nil
 	})
 }
@@ -603,22 +690,21 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return err
 	}
 	name := rp.Component()
 	if name == "." || name == ".." {
 		return syserror.EISDIR
 	}
-	childVFSD := parent.vfsd.Child(name)
-	if childVFSD == nil {
+	child, ok := parentDir.childMap[name]
+	if !ok {
 		return syserror.ENOENT
 	}
-	child := childVFSD.Impl().(*dentry)
 	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
@@ -631,66 +717,119 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	}
 	defer mnt.EndWrite()
 	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef()
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 		return err
 	}
-	parent.inode.impl.(*directory).childList.Remove(child)
+
+	// Generate inotify events. Note that this must take place before the link
+	// count of the child is decremented, or else the watches may be dropped
+	// before these events are added.
+	vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name)
+
+	parentDir.removeChildLocked(child)
 	child.inode.decLinksLocked()
-	vfsObj.CommitDeleteDentry(childVFSD)
+	vfsObj.CommitDeleteDentry(&child.vfsd)
+	parentDir.inode.touchCMtime()
 	return nil
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	switch impl := d.inode.impl.(type) {
+	case *socketFile:
+		return impl.ep, nil
+	default:
+		return nil, syserror.ECONNREFUSED
+	}
+}
+
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return nil, err
 	}
-	// TODO(b/127675828): support extended attributes
-	return nil, syserror.ENOTSUP
+	return d.inode.listxattr(size)
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return "", err
 	}
-	// TODO(b/127675828): support extended attributes
-	return "", syserror.ENOTSUP
+	return d.inode.getxattr(rp.Credentials(), &opts)
 }
 
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
+	if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
+	if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
+	mnt := vd.Mount()
+	d := vd.Dentry().Impl().(*dentry)
+	for {
+		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+			return vfs.PrependPathAtVFSRootError{}
+		}
+		if &d.vfsd == mnt.Root() {
+			return nil
+		}
+		if d.parent == nil {
+			if d.name != "" {
+				// This must be an anonymous memfd file.
+				b.PrependComponent("/" + d.name)
+				return vfs.PrependPathSyntheticError{}
+			}
+			return vfs.PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 0c57fdca3..739350cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -16,10 +16,8 @@ package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -32,28 +30,9 @@ type namedPipe struct {
 // Preconditions:
 //   * fs.mu must be locked.
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, mode)
+func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
+	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
-
-// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
-// entirely via struct embedding.
-type namedPipeFD struct {
-	fileDescription
-
-	*pipe.VFSPipeFD
-}
-
-func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	var err error
-	var fd namedPipeFD
-	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
-	fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{})
-	return &fd.vfsfd, nil
-}
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 5ee7f2a72..1614f2c39 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -151,7 +151,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index dab346a41..4f2ae04d2 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -15,6 +15,7 @@
 package tmpfs
 
 import (
+	"fmt"
 	"io"
 	"math"
 	"sync/atomic"
@@ -22,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -34,71 +36,231 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// regularFile is a regular (=S_IFREG) tmpfs file.
 type regularFile struct {
 	inode inode
 
 	// memFile is a platform.File used to allocate pages to this regularFile.
 	memFile *pgalloc.MemoryFile
 
-	// mu protects the fields below.
-	mu sync.RWMutex
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex `state:"nosave"`
+
+	// mappings tracks mappings of the file into memmap.MappingSpaces.
+	//
+	// Protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// writableMappingPages tracks how many pages of virtual memory are mapped
+	// as potentially writable from this file. If a page has multiple mappings,
+	// each mapping is counted separately.
+	//
+	// This counter is susceptible to overflow as we can potentially count
+	// mappings from many VMAs. We count pages rather than bytes to slightly
+	// mitigate this.
+	//
+	// Protected by mapsMu.
+	writableMappingPages uint64
+
+	// dataMu protects the fields below.
+	dataMu sync.RWMutex
 
 	// data maps offsets into the file to offsets into memFile that store
 	// the file's data.
+	//
+	// Protected by dataMu.
 	data fsutil.FileRangeSet
 
-	// size is the size of data, but accessed using atomic memory
-	// operations to avoid locking in inode.stat().
-	size uint64
-
 	// seals represents file seals on this inode.
+	//
+	// Protected by dataMu.
 	seals uint32
+
+	// size is the size of data.
+	//
+	// Protected by both dataMu and inode.mu; reading it requires holding
+	// either mutex, while writing requires holding both AND using atomics.
+	// Readers that do not require consistency (like Stat) may read the
+	// value atomically without holding either lock.
+	size uint64
 }
 
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &regularFile{
 		memFile: fs.memFile,
+		seals:   linux.F_SEAL_SEAL,
 	}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
 
 // truncate grows or shrinks the file to the given size. It returns true if the
 // file size was updated.
-func (rf *regularFile) truncate(size uint64) (bool, error) {
-	rf.mu.Lock()
-	defer rf.mu.Unlock()
+func (rf *regularFile) truncate(newSize uint64) (bool, error) {
+	rf.inode.mu.Lock()
+	defer rf.inode.mu.Unlock()
+	return rf.truncateLocked(newSize)
+}
 
-	if size == rf.size {
+// Preconditions: rf.inode.mu must be held.
+func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
+	oldSize := rf.size
+	if newSize == oldSize {
 		// Nothing to do.
 		return false, nil
 	}
 
-	if size > rf.size {
-		// Growing the file.
+	// Need to hold inode.mu and dataMu while modifying size.
+	rf.dataMu.Lock()
+	if newSize > oldSize {
+		// Can we grow the file?
 		if rf.seals&linux.F_SEAL_GROW != 0 {
-			// Seal does not allow growth.
+			rf.dataMu.Unlock()
 			return false, syserror.EPERM
 		}
-		rf.size = size
+		// We only need to update the file size.
+		atomic.StoreUint64(&rf.size, newSize)
+		rf.dataMu.Unlock()
 		return true, nil
 	}
 
-	// Shrinking the file
+	// We are shrinking the file. First check if this is allowed.
 	if rf.seals&linux.F_SEAL_SHRINK != 0 {
-		// Seal does not allow shrink.
+		rf.dataMu.Unlock()
 		return false, syserror.EPERM
 	}
 
-	// TODO(gvisor.dev/issues/1197): Invalidate mappings once we have
-	// mappings.
+	// Update the file size.
+	atomic.StoreUint64(&rf.size, newSize)
+	rf.dataMu.Unlock()
+
+	// Invalidate past translations of truncated pages.
+	oldpgend := fs.OffsetPageEnd(int64(oldSize))
+	newpgend := fs.OffsetPageEnd(int64(newSize))
+	if newpgend < oldpgend {
+		rf.mapsMu.Lock()
+		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+			// Compare Linux's mm/shmem.c:shmem_setattr() =>
+			// mm/memory.c:unmap_mapping_range(evencows=1).
+			InvalidatePrivate: true,
+		})
+		rf.mapsMu.Unlock()
+	}
 
-	rf.data.Truncate(size, rf.memFile)
-	rf.size = size
+	// We are now guaranteed that there are no translations of truncated pages,
+	// and can remove them.
+	rf.dataMu.Lock()
+	rf.data.Truncate(newSize, rf.memFile)
+	rf.dataMu.Unlock()
 	return true, nil
 }
 
+// AddMapping implements memmap.Mappable.AddMapping.
+func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+	rf.dataMu.RLock()
+	defer rf.dataMu.RUnlock()
+
+	// Reject writable mapping if F_SEAL_WRITE is set.
+	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
+		return syserror.EPERM
+	}
+
+	rf.mappings.AddMapping(ms, ar, offset, writable)
+	if writable {
+		pagesBefore := rf.writableMappingPages
+
+		// ar is guaranteed to be page aligned per memmap.Mappable.
+		rf.writableMappingPages += uint64(ar.Length() / usermem.PageSize)
+
+		if rf.writableMappingPages < pagesBefore {
+			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
+		}
+	}
+
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+
+	rf.mappings.RemoveMapping(ms, ar, offset, writable)
+
+	if writable {
+		pagesBefore := rf.writableMappingPages
+
+		// ar is guaranteed to be page aligned per memmap.Mappable.
+		rf.writableMappingPages -= uint64(ar.Length() / usermem.PageSize)
+
+		if rf.writableMappingPages > pagesBefore {
+			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
+		}
+	}
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	rf.dataMu.Lock()
+	defer rf.dataMu.Unlock()
+
+	// Constrain translations to f.attr.Size (rounded up) to prevent
+	// translation to pages that may be concurrently truncated.
+	pgend := fs.OffsetPageEnd(int64(rf.size))
+	var beyondEOF bool
+	if required.End > pgend {
+		if required.Start >= pgend {
+			return nil, &memmap.BusError{io.EOF}
+		}
+		beyondEOF = true
+		required.End = pgend
+	}
+	if optional.End > pgend {
+		optional.End = pgend
+	}
+
+	cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+		// Newly-allocated pages are zeroed, so we don't need to do anything.
+		return dsts.NumBytes(), nil
+	})
+
+	var ts []memmap.Translation
+	var translatedEnd uint64
+	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+		segMR := seg.Range().Intersect(optional)
+		ts = append(ts, memmap.Translation{
+			Source: segMR,
+			File:   rf.memFile,
+			Offset: seg.FileRangeOf(segMR).Start,
+			Perms:  usermem.AnyAccess,
+		})
+		translatedEnd = segMR.End
+	}
+
+	// Don't return the error returned by f.data.Fill if it occurred outside of
+	// required.
+	if translatedEnd < required.End && cerr != nil {
+		return ts, &memmap.BusError{cerr}
+	}
+	if beyondEOF {
+		return ts, &memmap.BusError{io.EOF}
+	}
+	return ts, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (*regularFile) InvalidateUnsavable(context.Context) error {
+	return nil
+}
+
 type regularFileFD struct {
 	fileDescription
 
@@ -125,7 +287,8 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := dst.CopyOutFrom(ctx, rw)
 	putRegularFileReadWriter(rw)
-	return int64(n), err
+	fd.inode().touchAtime(fd.vfsfd.Mount())
+	return n, err
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
@@ -147,13 +310,23 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		return 0, nil
 	}
 	f := fd.inode().impl.(*regularFile)
-	end := offset + srclen
-	if end < offset {
+	if end := offset + srclen; end < offset {
 		// Overflow.
-		return 0, syserror.EFBIG
+		return 0, syserror.EINVAL
+	}
+
+	var err error
+	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+	if err != nil {
+		return 0, err
 	}
+	src = src.TakeFirst64(srclen)
+
+	f.inode.mu.Lock()
 	rw := getRegularFileReadWriter(f, offset)
 	n, err := src.CopyInTo(ctx, rw)
+	fd.inode().touchCMtimeLocked()
+	f.inode.mu.Unlock()
 	putRegularFileReadWriter(rw)
 	return n, err
 }
@@ -215,6 +388,12 @@ func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng
 	return nil
 }
 
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	file := fd.inode().impl.(*regularFile)
+	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
+}
+
 // regularFileReadWriter implements safemem.Reader and Safemem.Writer.
 type regularFileReadWriter struct {
 	file *regularFile
@@ -244,14 +423,15 @@ func putRegularFileReadWriter(rw *regularFileReadWriter) {
 
 // ReadToBlocks implements safemem.Reader.ReadToBlocks.
 func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	rw.file.mu.RLock()
+	rw.file.dataMu.RLock()
+	defer rw.file.dataMu.RUnlock()
+	size := rw.file.size
 
 	// Compute the range to read (limited by file size and overflow-checked).
-	if rw.off >= rw.file.size {
-		rw.file.mu.RUnlock()
+	if rw.off >= size {
 		return 0, io.EOF
 	}
-	end := rw.file.size
+	end := size
 	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
 		end = rend
 	}
@@ -265,7 +445,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			// Get internal mappings.
 			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -275,7 +454,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			rw.off += uint64(n)
 			dsts = dsts.DropFirst64(n)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -291,7 +469,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			rw.off += uint64(n)
 			dsts = dsts.DropFirst64(n)
 			if err != nil {
-				rw.file.mu.RUnlock()
 				return done, err
 			}
 
@@ -299,13 +476,16 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er
 			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
 		}
 	}
-	rw.file.mu.RUnlock()
 	return done, nil
 }
 
 // WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: inode.mu must be held.
 func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	rw.file.mu.Lock()
+	// Hold dataMu so we can modify size.
+	rw.file.dataMu.Lock()
+	defer rw.file.dataMu.Unlock()
 
 	// Compute the range to write (overflow-checked).
 	end := rw.off + srcs.NumBytes()
@@ -316,7 +496,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
 	// Check if seals prevent either file growth or all writes.
 	switch {
 	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
-		rw.file.mu.Unlock()
 		return 0, syserror.EPERM
 	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
 		// When growth is sealed, Linux effectively allows writes which would
@@ -338,7 +517,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
 		}
 		if end <= rw.off {
 			// Truncation would result in no data being written.
-			rw.file.mu.Unlock()
 			return 0, syserror.EPERM
 		}
 	}
@@ -395,9 +573,49 @@ exitLoop:
 	// If the write ends beyond the file's previous size, it causes the
 	// file to grow.
 	if rw.off > rw.file.size {
-		atomic.StoreUint64(&rw.file.size, rw.off)
+		rw.file.size = rw.off
 	}
 
-	rw.file.mu.Unlock()
 	return done, retErr
 }
+
+// GetSeals returns the current set of seals on a memfd inode.
+func GetSeals(fd *vfs.FileDescription) (uint32, error) {
+	f, ok := fd.Impl().(*regularFileFD)
+	if !ok {
+		return 0, syserror.EINVAL
+	}
+	rf := f.inode().impl.(*regularFile)
+	rf.dataMu.RLock()
+	defer rf.dataMu.RUnlock()
+	return rf.seals, nil
+}
+
+// AddSeals adds new file seals to a memfd inode.
+func AddSeals(fd *vfs.FileDescription, val uint32) error {
+	f, ok := fd.Impl().(*regularFileFD)
+	if !ok {
+		return syserror.EINVAL
+	}
+	rf := f.inode().impl.(*regularFile)
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+	rf.dataMu.RLock()
+	defer rf.dataMu.RUnlock()
+
+	if rf.seals&linux.F_SEAL_SEAL != 0 {
+		// Seal applied which prevents addition of any new seals.
+		return syserror.EPERM
+	}
+
+	// F_SEAL_WRITE can only be added if there are no active writable maps.
+	if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
+		if rf.writableMappingPages > 0 {
+			return syserror.EBUSY
+		}
+	}
+
+	// Seals can only be added, never removed.
+	rf.seals |= val
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index e9f71e334..64e1c40ad 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -18,148 +18,16 @@ import (
 	"bytes"
 	"fmt"
 	"io"
-	"sync/atomic"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// nextFileID is used to generate unique file names.
-var nextFileID int64
-
-// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error
-// is not nil, then cleanup should be called when the root is no longer needed.
-func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
-	creds := auth.CredentialsFromContext(ctx)
-
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-		AllowUserMount: true,
-	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
-	if err != nil {
-		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
-	}
-	root := mntns.Root()
-	return vfsObj, root, func() {
-		root.DecRef()
-		mntns.DecRef()
-	}, nil
-}
-
-// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
-// the returned err is not nil, then cleanup should be called when the FD is no
-// longer needed.
-func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
-	creds := auth.CredentialsFromContext(ctx)
-	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1))
-
-	// Create the file that will be write/read.
-	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(filename),
-	}, &vfs.OpenOptions{
-		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
-		Mode:  linux.ModeRegular | mode,
-	})
-	if err != nil {
-		cleanup()
-		return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
-	}
-
-	return fd, cleanup, nil
-}
-
-// newDirFD is like newFileFD, but for directories.
-func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
-	creds := auth.CredentialsFromContext(ctx)
-	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1))
-
-	// Create the dir.
-	if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(dirname),
-	}, &vfs.MkdirOptions{
-		Mode: linux.ModeDirectory | mode,
-	}); err != nil {
-		cleanup()
-		return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err)
-	}
-
-	// Open the dir and return it.
-	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(dirname),
-	}, &vfs.OpenOptions{
-		Flags: linux.O_RDONLY | linux.O_DIRECTORY,
-	})
-	if err != nil {
-		cleanup()
-		return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err)
-	}
-
-	return fd, cleanup, nil
-}
-
-// newPipeFD is like newFileFD, but for pipes.
-func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
-	creds := auth.CredentialsFromContext(ctx)
-	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1))
-
-	// Create the pipe.
-	if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(pipename),
-	}, &vfs.MknodOptions{
-		Mode: linux.ModeNamedPipe | mode,
-	}); err != nil {
-		cleanup()
-		return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err)
-	}
-
-	// Open the pipe and return it.
-	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(pipename),
-	}, &vfs.OpenOptions{
-		Flags: linux.O_RDWR,
-	})
-	if err != nil {
-		cleanup()
-		return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err)
-	}
-
-	return fd, cleanup, nil
-}
-
 // Test that we can write some data to a file and read it back.`
 func TestSimpleWriteRead(t *testing.T) {
 	ctx := contexttest.Context(t)
diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 847cb0784..3ed650474 100644
--- a/pkg/sentry/fsimpl/gofer/pagemath.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -12,20 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package gofer
+package tmpfs
 
 import (
-	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 )
 
-// This are equivalent to usermem.Addr.RoundDown/Up, but without the
-// potentially truncating conversion to usermem.Addr. This is necessary because
-// there is no way to define generic "PageRoundDown/Up" functions in Go.
-
-func pageRoundDown(x uint64) uint64 {
-	return x &^ (usermem.PageSize - 1)
+// socketFile is a socket (=S_IFSOCK) tmpfs file.
+type socketFile struct {
+	inode inode
+	ep    transport.BoundEndpoint
 }
 
-func pageRoundUp(x uint64) uint64 {
-	return pageRoundDown(x + usermem.PageSize - 1)
+func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+	file := &socketFile{ep: ep}
+	file.inode.init(file, fs, kuid, kgid, mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
index ebe035dee..f7ee4aab2 100644
--- a/pkg/sentry/fsimpl/tmpfs/stat_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -19,8 +19,8 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
@@ -29,7 +29,6 @@ func TestStatAfterCreate(t *testing.T) {
 	mode := linux.FileMode(0644)
 
 	// Run with different file types.
-	// TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
 	for _, typ := range []string{"file", "dir", "pipe"} {
 		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
 			var (
@@ -71,9 +70,15 @@ func TestStatAfterCreate(t *testing.T) {
 				t.Errorf("got btime %d, want 0", got.Btime.ToNsec())
 			}
 
-			// Size should be 0.
-			if got.Size != 0 {
-				t.Errorf("got size %d, want 0", got.Size)
+			// Size should be 0 (except for directories, which make up a size
+			// of 20 per entry, including the "." and ".." entries present in
+			// otherwise-empty directories).
+			wantSize := uint64(0)
+			if typ == "dir" {
+				wantSize = 40
+			}
+			if got.Size != wantSize {
+				t.Errorf("got size %d, want %d", got.Size, wantSize)
 			}
 
 			// Nlink should be 1 for files, 2 for dirs.
@@ -140,7 +145,7 @@ func TestSetStatAtime(t *testing.T) {
 		Mask:  0,
 		Atime: linux.NsecToStatxTimestamp(100),
 	}}); err != nil {
-		t.Errorf("SetStat atime without mask failed: %v")
+		t.Errorf("SetStat atime without mask failed: %v", err)
 	}
 	// Atime should be unchanged.
 	if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
@@ -155,7 +160,7 @@ func TestSetStatAtime(t *testing.T) {
 		Atime: linux.NsecToStatxTimestamp(100),
 	}
 	if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
-		t.Errorf("SetStat atime with mask failed: %v")
+		t.Errorf("SetStat atime with mask failed: %v", err)
 	}
 	if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
 		t.Errorf("Stat got error: %v", err)
@@ -169,7 +174,6 @@ func TestSetStat(t *testing.T) {
 	mode := linux.FileMode(0644)
 
 	// Run with different file types.
-	// TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
 	for _, typ := range []string{"file", "dir", "pipe"} {
 		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
 			var (
@@ -205,7 +209,7 @@ func TestSetStat(t *testing.T) {
 				Mask:  0,
 				Atime: linux.NsecToStatxTimestamp(100),
 			}}); err != nil {
-				t.Errorf("SetStat atime without mask failed: %v")
+				t.Errorf("SetStat atime without mask failed: %v", err)
 			}
 			// Atime should be unchanged.
 			if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
@@ -220,7 +224,7 @@ func TestSetStat(t *testing.T) {
 				Atime: linux.NsecToStatxTimestamp(100),
 			}
 			if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
-				t.Errorf("SetStat atime with mask failed: %v")
+				t.Errorf("SetStat atime with mask failed: %v", err)
 			}
 			if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
 				t.Errorf("Stat got error: %v", err)
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index 5246aca84..b0de5fabe 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -15,6 +15,7 @@
 package tmpfs
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
@@ -23,11 +24,11 @@ type symlink struct {
 	target string // immutable
 }
 
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode {
 	link := &symlink{
 		target: target,
 	}
-	link.inode.init(link, fs, creds, 0777)
+	link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode)
 	link.inode.nlink = 1 // from parent directory
 	return &link.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 2108d0f4d..7ce1b86c7 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -12,20 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
-// the Dentry tree is the sole source of truth for the state of the filesystem.
+// Package tmpfs provides an in-memory filesystem whose contents are
+// application-mutable, consistent with Linux's tmpfs.
 //
 // Lock order:
 //
 // filesystem.mu
-//   regularFileFD.offMu
-//     regularFile.mu
 //   inode.mu
+//     regularFileFD.offMu
+//       *** "memmap.Mappable locks" below this point
+//       regularFile.mapsMu
+//         *** "memmap.Mappable locks taken by Translate" below this point
+//         regularFile.dataMu
+//     directory.iterMu
 package tmpfs
 
 import (
 	"fmt"
 	"math"
+	"strconv"
+	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -36,10 +42,15 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// Name is the default filesystem name.
+const Name = "tmpfs"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
@@ -53,50 +64,176 @@ type filesystem struct {
 	// clock is a realtime clock used to set timestamps in file operations.
 	clock time.Clock
 
+	// devMinor is the filesystem's minor device number. devMinor is immutable.
+	devMinor uint32
+
 	// mu serializes changes to the Dentry tree.
 	mu sync.RWMutex
 
 	nextInoMinusOne uint64 // accessed using atomic memory operations
 }
 
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// FilesystemOpts is used to pass configuration data to tmpfs.
+type FilesystemOpts struct {
+	// RootFileType is the FileType of the filesystem root. Valid values
+	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
+	RootFileType uint16
+
+	// RootSymlinkTarget is the target of the root symlink. Only valid if
+	// RootFileType == S_IFLNK.
+	RootSymlinkTarget string
+
+	// FilesystemType allows setting a different FilesystemType for this
+	// tmpfs filesystem. This allows tmpfs to "impersonate" other
+	// filesystems, like ramdiskfs and cgroupfs.
+	FilesystemType vfs.FilesystemType
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
 	if memFileProvider == nil {
 		panic("MemoryFileProviderFromContext returned nil")
 	}
+
+	rootFileType := uint16(linux.S_IFDIR)
+	newFSType := vfs.FilesystemType(&fstype)
+	tmpfsOpts, ok := opts.InternalData.(FilesystemOpts)
+	if ok {
+		if tmpfsOpts.RootFileType != 0 {
+			rootFileType = tmpfsOpts.RootFileType
+		}
+		if tmpfsOpts.FilesystemType != nil {
+			newFSType = tmpfsOpts.FilesystemType
+		}
+	}
+
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	rootMode := linux.FileMode(0777)
+	if rootFileType == linux.S_IFDIR {
+		rootMode = 01777
+	}
+	modeStr, ok := mopts["mode"]
+	if ok {
+		delete(mopts, "mode")
+		mode, err := strconv.ParseUint(modeStr, 8, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
+			return nil, nil, syserror.EINVAL
+		}
+		rootMode = linux.FileMode(mode & 07777)
+	}
+	rootKUID := creds.EffectiveKUID
+	uidStr, ok := mopts["uid"]
+	if ok {
+		delete(mopts, "uid")
+		uid, err := strconv.ParseUint(uidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
+		if !kuid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKUID = kuid
+	}
+	rootKGID := creds.EffectiveKGID
+	gidStr, ok := mopts["gid"]
+	if ok {
+		delete(mopts, "gid")
+		gid, err := strconv.ParseUint(gidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
+		if !kgid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKGID = kgid
+	}
+	if len(mopts) != 0 {
+		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
 	clock := time.RealtimeClockFromContext(ctx)
 	fs := filesystem{
-		memFile: memFileProvider.MemoryFile(),
-		clock:   clock,
+		memFile:  memFileProvider.MemoryFile(),
+		clock:    clock,
+		devMinor: devMinor,
+	}
+	fs.vfsfs.Init(vfsObj, newFSType, &fs)
+
+	var root *dentry
+	switch rootFileType {
+	case linux.S_IFREG:
+		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode))
+	case linux.S_IFLNK:
+		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget))
+	case linux.S_IFDIR:
+		root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry
+	default:
+		fs.vfsfs.DecRef()
+		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
 	}
-	fs.vfsfs.Init(vfsObj, &fs)
-	root := fs.newDentry(fs.newDirectory(creds, 01777))
 	return &fs.vfsfs, &root.vfsd, nil
 }
 
+// NewFilesystem returns a new tmpfs filesystem.
+func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) {
+	return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{})
+}
+
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *filesystem) Release() {
+	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
 }
 
 // dentry implements vfs.DentryImpl.
 type dentry struct {
 	vfsd vfs.Dentry
 
+	// parent is this dentry's parent directory. Each referenced dentry holds a
+	// reference on parent.dentry. If this dentry is a filesystem root, parent
+	// is nil. parent is protected by filesystem.mu.
+	parent *dentry
+
+	// name is the name of this dentry in its parent. If this dentry is a
+	// filesystem root, name is the empty string. name is protected by
+	// filesystem.mu.
+	name string
+
+	// unlinked indicates whether this dentry has been unlinked from its parent.
+	// It is only set to true on an unlink operation, and never set from true to
+	// false. unlinked is protected by filesystem.mu.
+	unlinked bool
+
+	// dentryEntry (ugh) links dentries into their parent directory.childList.
+	dentryEntry
+
 	// inode is the inode represented by this dentry. Multiple Dentries may
 	// share a single non-directory inode (with hard links). inode is
 	// immutable.
-	inode *inode
-
+	//
 	// tmpfs doesn't count references on dentries; because the dentry tree is
 	// the sole source of truth, it is by definition always consistent with the
 	// state of the filesystem. However, it does count references on inodes,
 	// because inode resources are released when all references are dropped.
-	// (tmpfs doesn't really have resources to release, but we implement
-	// reference counting because tmpfs regular files will.)
-
-	// dentryEntry (ugh) links dentries into their parent directory.childList.
-	dentryEntry
+	// dentry therefore forwards reference counting directly to inode.
+	inode *inode
 }
 
 func (fs *filesystem) newDentry(inode *inode) *dentry {
@@ -122,25 +259,48 @@ func (d *dentry) DecRef() {
 	d.inode.decRef()
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {
+	if d.inode.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	// The ordering below is important, Linux always notifies the parent first.
+	if d.parent != nil {
+		// Note that d.parent or d.name may be stale if there is a concurrent
+		// rename operation. Inotify does not provide consistency guarantees.
+		d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked)
+	}
+	d.inode.watches.Notify("", events, cookie, et)
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	return &d.inode.watches
+}
+
 // inode represents a filesystem object.
 type inode struct {
-	// clock is a realtime clock used to set timestamps in file operations.
-	clock time.Clock
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
 
 	// refs is a reference count. refs is accessed using atomic memory
 	// operations.
 	//
-	// A reference is held on all inodes that are reachable in the filesystem
-	// tree. For non-directories (which may have multiple hard links), this
-	// means that a reference is dropped when nlink reaches 0. For directories,
-	// nlink never reaches 0 due to the "." entry; instead,
-	// filesystem.RmdirAt() drops the reference.
+	// A reference is held on all inodes as long as they are reachable in the
+	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
+	// nlink reaches 0.
 	refs int64
 
+	// xattrs implements extended attributes.
+	//
+	// TODO(b/148380782): Support xattrs other than user.*
+	xattrs memxattr.SimpleExtendedAttributes
+
 	// Inode metadata. Writing multiple fields atomically requires holding
 	// mu, othewise atomic operations can be used.
 	mu    sync.Mutex
-	mode  uint32 // excluding file type bits, which are based on impl
+	mode  uint32 // file type and mode
 	nlink uint32 // protected by filesystem.mu instead of inode.mu
 	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
 	gid   uint32 // auth.KGID, but ...
@@ -151,31 +311,34 @@ type inode struct {
 	ctime int64 // nanoseconds
 	mtime int64 // nanoseconds
 
-	// Only meaningful for device special files.
-	rdevMajor uint32
-	rdevMinor uint32
-
 	// Advisory file locks, which lock at the inode level.
 	locks lock.FileLocks
 
+	// Inotify watches for this inode.
+	watches vfs.Watches
+
 	impl interface{} // immutable
 }
 
 const maxLinks = math.MaxUint32
 
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
-	i.clock = fs.clock
+func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) {
+	if mode.FileType() == 0 {
+		panic("file type is required in FileMode")
+	}
+	i.fs = fs
 	i.refs = 1
 	i.mode = uint32(mode)
-	i.uid = uint32(creds.EffectiveKUID)
-	i.gid = uint32(creds.EffectiveKGID)
+	i.uid = uint32(kuid)
+	i.gid = uint32(kgid)
 	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
 	// Tmpfs creation sets atime, ctime, and mtime to current time.
-	now := i.clock.Now().Nanoseconds()
+	now := fs.clock.Now().Nanoseconds()
 	i.atime = now
 	i.ctime = now
 	i.mtime = now
 	// i.nlink initialized by caller
+	i.watches = vfs.Watches{}
 	i.impl = impl
 }
 
@@ -188,19 +351,22 @@ func (i *inode) incLinksLocked() {
 		panic("tmpfs.inode.incLinksLocked() called with no existing links")
 	}
 	if i.nlink == maxLinks {
-		panic("memfs.inode.incLinksLocked() called with maximum link count")
+		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
 	}
 	atomic.AddUint32(&i.nlink, 1)
 }
 
-// decLinksLocked decrements i's link count.
+// decLinksLocked decrements i's link count. If the link count reaches 0, we
+// remove a reference on i as well.
 //
 // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
 func (i *inode) decLinksLocked() {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.decLinksLocked() called with no existing links")
 	}
-	atomic.AddUint32(&i.nlink, ^uint32(0))
+	if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
+		i.decRef()
+	}
 }
 
 func (i *inode) incRef() {
@@ -223,20 +389,21 @@ func (i *inode) tryIncRef() bool {
 
 func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
-		// This is unnecessary; it's mostly to simulate what tmpfs would do.
+		i.watches.HandleDeletion()
 		if regFile, ok := i.impl.(*regularFile); ok {
-			regFile.mu.Lock()
+			// Release memory used by regFile to store data. Since regFile is
+			// no longer usable, we don't need to grab any locks or update any
+			// metadata.
 			regFile.data.DropAll(regFile.memFile)
-			atomic.StoreUint64(&regFile.size, 0)
-			regFile.mu.Unlock()
 		}
 	} else if refs < 0 {
 		panic("tmpfs.inode.decRef() called without holding a reference")
 	}
 }
 
-func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
 }
 
 // Go won't inline this function, and returning linux.Statx (which is quite
@@ -247,44 +414,38 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, i
 // a concurrent modification), so we do not require holding inode.mu.
 func (i *inode) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
-		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME |
-		linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME
-	stat.Blksize = 1 // usermem.PageSize in tmpfs
+		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
+		linux.STATX_MTIME
+	stat.Blksize = usermem.PageSize
 	stat.Nlink = atomic.LoadUint32(&i.nlink)
 	stat.UID = atomic.LoadUint32(&i.uid)
 	stat.GID = atomic.LoadUint32(&i.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
 	stat.Ino = i.ino
-	// Linux's tmpfs has no concept of btime, so zero-value is returned.
 	stat.Atime = linux.NsecToStatxTimestamp(i.atime)
 	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
 	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
-	// TODO(gvisor.dev/issues/1197): Device number.
+	stat.DevMajor = linux.UNNAMED_MAJOR
+	stat.DevMinor = i.fs.devMinor
 	switch impl := i.impl.(type) {
 	case *regularFile:
-		stat.Mode |= linux.S_IFREG
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(atomic.LoadUint64(&impl.size))
-		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
-		// a uint64 accessed using atomic memory operations to avoid taking
-		// locks).
+		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
+		// too expensive to compute here. Cache it in regularFile.
 		stat.Blocks = allocatedBlocksForSize(stat.Size)
 	case *directory:
-		stat.Mode |= linux.S_IFDIR
+		// "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
+		stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren)))
+		// stat.Blocks is 0.
 	case *symlink:
-		stat.Mode |= linux.S_IFLNK
-		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
 		stat.Size = uint64(len(impl.target))
-		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *namedPipe:
-		stat.Mode |= linux.S_IFIFO
+		// stat.Blocks is 0.
+	case *namedPipe, *socketFile:
+		// stat.Size and stat.Blocks are 0.
 	case *deviceFile:
-		switch impl.kind {
-		case vfs.BlockDevice:
-			stat.Mode |= linux.S_IFBLK
-		case vfs.CharDevice:
-			stat.Mode |= linux.S_IFCHR
-		}
+		// stat.Size and stat.Blocks are 0.
 		stat.RdevMajor = impl.major
 		stat.RdevMinor = impl.minor
 	default:
@@ -292,18 +453,27 @@ func (i *inode) statTo(stat *linux.Statx) {
 	}
 }
 
-func (i *inode) setStat(stat linux.Statx) error {
+func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error {
 	if stat.Mask == 0 {
 		return nil
 	}
+	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
+		return syserror.EPERM
+	}
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+		return err
+	}
 	i.mu.Lock()
+	defer i.mu.Unlock()
 	var (
 		needsMtimeBump bool
 		needsCtimeBump bool
 	)
 	mask := stat.Mask
 	if mask&linux.STATX_MODE != 0 {
-		atomic.StoreUint32(&i.mode, uint32(stat.Mode))
+		ft := atomic.LoadUint32(&i.mode) & linux.S_IFMT
+		atomic.StoreUint32(&i.mode, ft|uint32(stat.Mode&^linux.S_IFMT))
 		needsCtimeBump = true
 	}
 	if mask&linux.STATX_UID != 0 {
@@ -317,7 +487,7 @@ func (i *inode) setStat(stat linux.Statx) error {
 	if mask&linux.STATX_SIZE != 0 {
 		switch impl := i.impl.(type) {
 		case *regularFile:
-			updated, err := impl.truncate(stat.Size)
+			updated, err := impl.truncateLocked(stat.Size)
 			if err != nil {
 				return err
 			}
@@ -331,29 +501,41 @@ func (i *inode) setStat(stat linux.Statx) error {
 			return syserror.EINVAL
 		}
 	}
+	now := i.fs.clock.Now().Nanoseconds()
 	if mask&linux.STATX_ATIME != 0 {
-		atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+		if stat.Atime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.atime, now)
+		} else {
+			atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+		}
 		needsCtimeBump = true
 	}
 	if mask&linux.STATX_MTIME != 0 {
-		atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+		if stat.Mtime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.mtime, now)
+		} else {
+			atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+		}
 		needsCtimeBump = true
 		// Ignore the mtime bump, since we just set it ourselves.
 		needsMtimeBump = false
 	}
 	if mask&linux.STATX_CTIME != 0 {
-		atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+		if stat.Ctime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.ctime, now)
+		} else {
+			atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+		}
 		// Ignore the ctime bump, since we just set it ourselves.
 		needsCtimeBump = false
 	}
-	now := i.clock.Now().Nanoseconds()
 	if needsMtimeBump {
 		atomic.StoreInt64(&i.mtime, now)
 	}
 	if needsCtimeBump {
 		atomic.StoreInt64(&i.ctime, now)
 	}
-	i.mu.Unlock()
+
 	return nil
 }
 
@@ -412,6 +594,8 @@ func (i *inode) direntType() uint8 {
 		return linux.DT_DIR
 	case *symlink:
 		return linux.DT_LNK
+	case *socketFile:
+		return linux.DT_SOCK
 	case *deviceFile:
 		switch impl.kind {
 		case vfs.BlockDevice:
@@ -426,6 +610,99 @@ func (i *inode) direntType() uint8 {
 	}
 }
 
+func (i *inode) isDir() bool {
+	return linux.FileMode(i.mode).FileType() == linux.S_IFDIR
+}
+
+func (i *inode) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now := i.fs.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.atime, now)
+	i.mu.Unlock()
+	mnt.EndWrite()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
+func (i *inode) touchCtime() {
+	now := i.fs.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.ctime, now)
+	i.mu.Unlock()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
+func (i *inode) touchCMtime() {
+	now := i.fs.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.mtime, now)
+	atomic.StoreInt64(&i.ctime, now)
+	i.mu.Unlock()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
+// inode.mu.
+func (i *inode) touchCMtimeLocked() {
+	now := i.fs.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&i.mtime, now)
+	atomic.StoreInt64(&i.ctime, now)
+}
+
+func (i *inode) listxattr(size uint64) ([]string, error) {
+	return i.xattrs.Listxattr(size)
+}
+
+func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+		return "", err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return "", syserror.ENODATA
+	}
+	return i.xattrs.Getxattr(opts)
+}
+
+func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return i.xattrs.Setxattr(opts)
+}
+
+func (i *inode) removexattr(creds *auth.Credentials, name string) error {
+	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return i.xattrs.Removexattr(name)
+}
+
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (i *inode) userXattrSupported() bool {
+	filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
+	return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+}
+
 // fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
@@ -437,8 +714,12 @@ func (fd *fileDescription) filesystem() *filesystem {
 	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.Dentry().Impl().(*dentry).inode
+	return fd.dentry().inode
 }
 
 // Stat implements vfs.FileDescriptionImpl.Stat.
@@ -450,5 +731,78 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	return fd.inode().setStat(opts.Stat)
+	creds := auth.CredentialsFromContext(ctx)
+	d := fd.dentry()
+	if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
+}
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.inode().listxattr(size)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+	d := fd.dentry()
+	if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+		return err
+	}
+
+	// Generate inotify events.
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+	d := fd.dentry()
+	if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+		return err
+	}
+
+	// Generate inotify events.
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// NewMemfd creates a new tmpfs regular file and file description that can back
+// an anonymous fd created by memfd_create.
+func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) {
+	fs, ok := mount.Filesystem().Impl().(*filesystem)
+	if !ok {
+		panic("NewMemfd() called with non-tmpfs mount")
+	}
+
+	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
+	// S_IRWXUGO.
+	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
+	rf := inode.impl.(*regularFile)
+	if allowSeals {
+		rf.seals = 0
+	}
+
+	d := fs.newDentry(inode)
+	defer d.DecRef()
+	d.name = name
+
+	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
+	// FMODE_READ | FMODE_WRITE.
+	var fd regularFileFD
+	flags := uint32(linux.O_RDWR)
+	if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
new file mode 100644
index 000000000..a240fb276
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -0,0 +1,156 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// nextFileID is used to generate unique file names.
+var nextFileID int64
+
+// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error
+// is not nil, then cleanup should be called when the root is no longer needed.
+func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+	}
+
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+	}
+	root := mntns.Root()
+	return vfsObj, root, func() {
+		root.DecRef()
+		mntns.DecRef()
+	}, nil
+}
+
+// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
+// the returned err is not nil, then cleanup should be called when the FD is no
+// longer needed.
+func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1))
+
+	// Create the file that will be write/read.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  linux.ModeRegular | mode,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
+	}
+
+	return fd, cleanup, nil
+}
+
+// newDirFD is like newFileFD, but for directories.
+func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1))
+
+	// Create the dir.
+	if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dirname),
+	}, &vfs.MkdirOptions{
+		Mode: linux.ModeDirectory | mode,
+	}); err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err)
+	}
+
+	// Open the dir and return it.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dirname),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err)
+	}
+
+	return fd, cleanup, nil
+}
+
+// newPipeFD is like newFileFD, but for pipes.
+func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	name := fmt.Sprintf("tmpfs-test-%d", atomic.AddInt64(&nextFileID, 1))
+
+	if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(name),
+	}, &vfs.MknodOptions{
+		Mode: linux.ModeNamedPipe | mode,
+	}); err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create pipe %q: %v", name, err)
+	}
+
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(name),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to open pipe %q: %v", name, err)
+	}
+
+	return fd, cleanup, nil
+}
diff --git a/pkg/sentry/hostfd/BUILD b/pkg/sentry/hostfd/BUILD
new file mode 100644
index 000000000..364a78306
--- /dev/null
+++ b/pkg/sentry/hostfd/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "hostfd",
+    srcs = [
+        "hostfd.go",
+        "hostfd_unsafe.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/safemem",
+        "//pkg/sync",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/hostfd/hostfd.go b/pkg/sentry/hostfd/hostfd.go
new file mode 100644
index 000000000..70dd9cafb
--- /dev/null
+++ b/pkg/sentry/hostfd/hostfd.go
@@ -0,0 +1,84 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package hostfd provides efficient I/O with host file descriptors.
+package hostfd
+
+import (
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// ReadWriterAt implements safemem.Reader and safemem.Writer by reading from
+// and writing to a host file descriptor respectively. ReadWriterAts should be
+// obtained by calling GetReadWriterAt.
+//
+// Clients should usually prefer to use Preadv2 and Pwritev2 directly.
+type ReadWriterAt struct {
+	fd     int32
+	offset int64
+	flags  uint32
+}
+
+var rwpool = sync.Pool{
+	New: func() interface{} {
+		return &ReadWriterAt{}
+	},
+}
+
+// GetReadWriterAt returns a ReadWriterAt that reads from / writes to the given
+// host file descriptor, starting at the given offset and using the given
+// preadv2(2)/pwritev2(2) flags. If offset is -1, the host file descriptor's
+// offset is used instead. Users are responsible for ensuring that fd remains
+// valid for the lifetime of the returned ReadWriterAt, and must call
+// PutReadWriterAt when it is no longer needed.
+func GetReadWriterAt(fd int32, offset int64, flags uint32) *ReadWriterAt {
+	rw := rwpool.Get().(*ReadWriterAt)
+	*rw = ReadWriterAt{
+		fd:     fd,
+		offset: offset,
+		flags:  flags,
+	}
+	return rw
+}
+
+// PutReadWriterAt releases a ReadWriterAt returned by a previous call to
+// GetReadWriterAt that is no longer in use.
+func PutReadWriterAt(rw *ReadWriterAt) {
+	rwpool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *ReadWriterAt) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+	n, err := Preadv2(rw.fd, dsts, rw.offset, rw.flags)
+	if rw.offset >= 0 {
+		rw.offset += int64(n)
+	}
+	return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *ReadWriterAt) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+	n, err := Pwritev2(rw.fd, srcs, rw.offset, rw.flags)
+	if rw.offset >= 0 {
+		rw.offset += int64(n)
+	}
+	return n, err
+}
diff --git a/pkg/sentry/hostfd/hostfd_unsafe.go b/pkg/sentry/hostfd/hostfd_unsafe.go
new file mode 100644
index 000000000..cd4dc67fb
--- /dev/null
+++ b/pkg/sentry/hostfd/hostfd_unsafe.go
@@ -0,0 +1,85 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostfd
+
+import (
+	"io"
+	"syscall"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/safemem"
+)
+
+// Preadv2 reads up to dsts.NumBytes() bytes from host file descriptor fd into
+// dsts. offset and flags are interpreted as for preadv2(2).
+//
+// Preconditions: !dsts.IsEmpty().
+func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint64, error) {
+	// No buffering is necessary regardless of safecopy; host syscalls will
+	// return EFAULT if appropriate, instead of raising SIGBUS.
+	var (
+		n uintptr
+		e syscall.Errno
+	)
+	if flags == 0 && dsts.NumBlocks() == 1 {
+		// Use read() or pread() to avoid iovec allocation and copying.
+		dst := dsts.Head()
+		if offset == -1 {
+			n, _, e = syscall.Syscall(unix.SYS_READ, uintptr(fd), dst.Addr(), uintptr(dst.Len()))
+		} else {
+			n, _, e = syscall.Syscall6(unix.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */)
+		}
+	} else {
+		iovs := safemem.IovecsFromBlockSeq(dsts)
+		n, _, e = syscall.Syscall6(unix.SYS_PREADV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags))
+	}
+	if e != 0 {
+		return 0, e
+	}
+	if n == 0 {
+		return 0, io.EOF
+	}
+	return uint64(n), nil
+}
+
+// Pwritev2 writes up to srcs.NumBytes() from srcs into host file descriptor
+// fd. offset and flags are interpreted as for pwritev2(2).
+//
+// Preconditions: !srcs.IsEmpty().
+func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint64, error) {
+	// No buffering is necessary regardless of safecopy; host syscalls will
+	// return EFAULT if appropriate, instead of raising SIGBUS.
+	var (
+		n uintptr
+		e syscall.Errno
+	)
+	if flags == 0 && srcs.NumBlocks() == 1 {
+		// Use write() or pwrite() to avoid iovec allocation and copying.
+		src := srcs.Head()
+		if offset == -1 {
+			n, _, e = syscall.Syscall(unix.SYS_WRITE, uintptr(fd), src.Addr(), uintptr(src.Len()))
+		} else {
+			n, _, e = syscall.Syscall6(unix.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */)
+		}
+	} else {
+		iovs := safemem.IovecsFromBlockSeq(srcs)
+		n, _, e = syscall.Syscall6(unix.SYS_PWRITEV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags))
+	}
+	if e != 0 {
+		return 0, e
+	}
+	return uint64(n), nil
+}
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 334432abf..07bf39fed 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -10,6 +10,7 @@ go_library(
     srcs = [
         "context.go",
         "inet.go",
+        "namespace.go",
         "test_stack.go",
     ],
     deps = [
diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go
new file mode 100644
index 000000000..029af3025
--- /dev/null
+++ b/pkg/sentry/inet/namespace.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package inet
+
+// Namespace represents a network namespace. See network_namespaces(7).
+//
+// +stateify savable
+type Namespace struct {
+	// stack is the network stack implementation of this network namespace.
+	stack Stack `state:"nosave"`
+
+	// creator allows kernel to create new network stack for network namespaces.
+	// If nil, no networking will function if network is namespaced.
+	//
+	// At afterLoad(), creator will be used to create network stack. Stateify
+	// needs to wait for this field to be loaded before calling afterLoad().
+	creator NetworkStackCreator `state:"wait"`
+
+	// isRoot indicates whether this is the root network namespace.
+	isRoot bool
+}
+
+// NewRootNamespace creates the root network namespace, with creator
+// allowing new network namespaces to be created. If creator is nil, no
+// networking will function if the network is namespaced.
+func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace {
+	return &Namespace{
+		stack:   stack,
+		creator: creator,
+		isRoot:  true,
+	}
+}
+
+// NewNamespace creates a new network namespace from the root.
+func NewNamespace(root *Namespace) *Namespace {
+	n := &Namespace{
+		creator: root.creator,
+	}
+	n.init()
+	return n
+}
+
+// Stack returns the network stack of n. Stack may return nil if no network
+// stack is configured.
+func (n *Namespace) Stack() Stack {
+	return n.stack
+}
+
+// IsRoot returns whether n is the root network namespace.
+func (n *Namespace) IsRoot() bool {
+	return n.isRoot
+}
+
+// RestoreRootStack restores the root network namespace with stack. This should
+// only be called when restoring kernel.
+func (n *Namespace) RestoreRootStack(stack Stack) {
+	if !n.isRoot {
+		panic("RestoreRootStack can only be called on root network namespace")
+	}
+	if n.stack != nil {
+		panic("RestoreRootStack called after a stack has already been set")
+	}
+	n.stack = stack
+}
+
+func (n *Namespace) init() {
+	// Root network namespace will have stack assigned later.
+	if n.isRoot {
+		return
+	}
+	if n.creator != nil {
+		var err error
+		n.stack, err = n.creator.CreateStack()
+		if err != nil {
+			panic(err)
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (n *Namespace) afterLoad() {
+	n.init()
+}
+
+// NetworkStackCreator allows new instances of a network stack to be created. It
+// is used by the kernel to create new network namespaces when requested.
+type NetworkStackCreator interface {
+	// CreateStack creates a new network stack for a network namespace.
+	CreateStack() (Stack, error)
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 2231d6973..a28eab8b8 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -157,6 +157,7 @@ go_library(
         "//pkg/context",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/metric",
         "//pkg/refs",
@@ -167,6 +168,12 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
+        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/fsimpl/pipefs",
+        "//pkg/sentry/fsimpl/sockfs",
+        "//pkg/sentry/fsimpl/timerfd",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel/auth",
@@ -199,6 +206,7 @@ go_library(
         "//pkg/tcpip/stack",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
     ],
 )
 
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
index e057d2c6d..6862f2ef5 100644
--- a/pkg/sentry/kernel/auth/credentials.go
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
 	}
 	return NoID, syserror.EPERM
 }
+
+// SetUID translates the provided uid to the root user namespace and updates c's
+// uids to it. This performs no permissions or capabilities checks, the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetUID(uid UID) error {
+	kuid := c.UserNamespace.MapToKUID(uid)
+	if !kuid.Ok() {
+		return syserror.EINVAL
+	}
+	c.RealKUID = kuid
+	c.EffectiveKUID = kuid
+	c.SavedKUID = kuid
+	return nil
+}
+
+// SetGID translates the provided gid to the root user namespace and updates c's
+// gids to it. This performs no permissions or capabilities checks, the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetGID(gid GID) error {
+	kgid := c.UserNamespace.MapToKGID(gid)
+	if !kgid.Ok() {
+		return syserror.EINVAL
+	}
+	c.RealKGID = kgid
+	c.EffectiveKGID = kgid
+	c.SavedKGID = kgid
+	return nil
+}
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index dedf0fa15..75eedd5a2 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -24,6 +24,7 @@ go_library(
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
+        "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/refs",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 8bffb78fc..3d78cd48f 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -30,19 +31,6 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// Event describes the event mask that was observed and the user data to be
-// returned when one of the events occurs. It has this format to match the linux
-// format to avoid extra copying/allocation when writing events to userspace.
-type Event struct {
-	// Events is the event mask containing the set of events that have been
-	// observed on an entry.
-	Events uint32
-
-	// Data is an opaque 64-bit value provided by the caller when adding the
-	// entry, and returned to the caller when the entry reports an event.
-	Data [2]int32
-}
-
 // EntryFlags is a bitmask that holds an entry's flags.
 type EntryFlags int
 
@@ -227,9 +215,9 @@ func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask {
 }
 
 // ReadEvents returns up to max available events.
-func (e *EventPoll) ReadEvents(max int) []Event {
+func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
 	var local pollEntryList
-	var ret []Event
+	var ret []linux.EpollEvent
 
 	e.listsMu.Lock()
 
@@ -251,7 +239,7 @@ func (e *EventPoll) ReadEvents(max int) []Event {
 		}
 
 		// Add event to the array that will be returned to caller.
-		ret = append(ret, Event{
+		ret = append(ret, linux.EpollEvent{
 			Events: uint32(ready),
 			Data:   entry.userData,
 		})
@@ -296,8 +284,10 @@ func (*readyCallback) Callback(w *waiter.Entry) {
 		e.waitingList.Remove(entry)
 		e.readyList.PushBack(entry)
 		entry.curList = &e.readyList
+		e.listsMu.Unlock()
 
 		e.Notify(waiter.EventIn)
+		return
 	}
 
 	e.listsMu.Unlock()
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
index a0d35d350..8e9f200d0 100644
--- a/pkg/sentry/kernel/epoll/epoll_state.go
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -38,11 +38,14 @@ func (e *EventPoll) afterLoad() {
 		}
 	}
 
-	for it := e.waitingList.Front(); it != nil; it = it.Next() {
-		if it.id.File.Readiness(it.mask) != 0 {
-			e.waitingList.Remove(it)
-			e.readyList.PushBack(it)
-			it.curList = &e.readyList
+	for it := e.waitingList.Front(); it != nil; {
+		entry := it
+		it = it.Next()
+
+		if entry.id.File.Readiness(entry.mask) != 0 {
+			e.waitingList.Remove(entry)
+			e.readyList.PushBack(entry)
+			entry.curList = &e.readyList
 			e.Notify(waiter.EventIn)
 		}
 	}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 23b88f7a6..dbfcef0fa 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -152,7 +152,13 @@ func (f *FDTable) drop(file *fs.File) {
 // dropVFS2 drops the table reference.
 func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
 	// TODO(gvisor.dev/issue/1480): Release locks.
-	// TODO(gvisor.dev/issue/1479): Send inotify events.
+
+	// Generate inotify events.
+	ev := uint32(linux.IN_CLOSE_NOWRITE)
+	if file.IsWritable() {
+		ev = linux.IN_CLOSE_WRITE
+	}
+	file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent)
 
 	// Drop the table reference.
 	file.DecRef()
@@ -191,10 +197,12 @@ func (f *FDTable) Size() int {
 	return int(size)
 }
 
-// forEach iterates over all non-nil files.
+// forEach iterates over all non-nil files in sorted order.
 //
 // It is the caller's responsibility to acquire an appropriate lock.
 func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
+	// retries tracks the number of failed TryIncRef attempts for the same FD.
+	retries := 0
 	fd := int32(0)
 	for {
 		file, fileVFS2, flags, ok := f.getAll(fd)
@@ -204,17 +212,26 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes
 		switch {
 		case file != nil:
 			if !file.TryIncRef() {
+				retries++
+				if retries > 1000 {
+					panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations))
+				}
 				continue // Race caught.
 			}
 			fn(fd, file, nil, flags)
 			file.DecRef()
 		case fileVFS2 != nil:
 			if !fileVFS2.TryIncRef() {
+				retries++
+				if retries > 1000 {
+					panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl()))
+				}
 				continue // Race caught.
 			}
 			fn(fd, nil, fileVFS2, flags)
 			fileVFS2.DecRef()
 		}
+		retries = 0
 		fd++
 	}
 }
@@ -296,6 +313,105 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	return fds, nil
 }
 
+// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files will share the set
+// flags. Success is guaranteed to be all or none.
+func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
+	if fd < 0 {
+		// Don't accept negative FDs.
+		return nil, syscall.EINVAL
+	}
+
+	// Default limit.
+	end := int32(math.MaxInt32)
+
+	// Ensure we don't get past the provided limit.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		lim := limitSet.Get(limits.NumberOfFiles)
+		if lim.Cur != limits.Infinity {
+			end = int32(lim.Cur)
+		}
+		if fd >= end {
+			return nil, syscall.EMFILE
+		}
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// From f.next to find available fd.
+	if fd < f.next {
+		fd = f.next
+	}
+
+	// Install all entries.
+	for i := fd; i < end && len(fds) < len(files); i++ {
+		if d, _, _ := f.getVFS2(i); d == nil {
+			f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
+			fds = append(fds, i)                 // Record the file descriptor.
+		}
+	}
+
+	// Failure? Unwind existing FDs.
+	if len(fds) < len(files) {
+		for _, i := range fds {
+			f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+		}
+		return nil, syscall.EMFILE
+	}
+
+	if fd == f.next {
+		// Update next search start position.
+		f.next = fds[len(fds)-1] + 1
+	}
+
+	return fds, nil
+}
+
+// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
+// the given file description. If it succeeds, it takes a reference on file.
+func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	if minfd < 0 {
+		// Don't accept negative FDs.
+		return -1, syscall.EINVAL
+	}
+
+	// Default limit.
+	end := int32(math.MaxInt32)
+
+	// Ensure we don't get past the provided limit.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		lim := limitSet.Get(limits.NumberOfFiles)
+		if lim.Cur != limits.Infinity {
+			end = int32(lim.Cur)
+		}
+		if minfd >= end {
+			return -1, syscall.EMFILE
+		}
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// From f.next to find available fd.
+	fd := minfd
+	if fd < f.next {
+		fd = f.next
+	}
+	for fd < end {
+		if d, _, _ := f.getVFS2(fd); d == nil {
+			f.setVFS2(fd, file, flags)
+			if fd == f.next {
+				// Update next search start position.
+				f.next = fd + 1
+			}
+			return fd, nil
+		}
+		fd++
+	}
+	return -1, syscall.EMFILE
+}
+
 // NewFDAt sets the file reference for the given FD. If there is an active
 // reference for that FD, the ref count for that existing reference is
 // decremented.
@@ -316,9 +432,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 		return syscall.EBADF
 	}
 
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
 	// Check the limit for the provided file.
 	if limitSet := limits.FromContext(ctx); limitSet != nil {
 		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
@@ -327,6 +440,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 	}
 
 	// Install the entry.
+	f.mu.Lock()
+	defer f.mu.Unlock()
 	f.setAll(fd, file, fileVFS2, flags)
 	return nil
 }
@@ -404,7 +519,10 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
 	}
 }
 
-// GetFDs returns a list of valid fds.
+// GetFDs returns a sorted list of valid fds.
+//
+// Precondition: The caller must be running on the task goroutine, or Task.mu
+// must be locked.
 func (f *FDTable) GetFDs() []int32 {
 	fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
 	f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
@@ -479,7 +597,9 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
 	case orig2 != nil:
 		orig2.IncRef()
 	}
-	f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+	if orig != nil || orig2 != nil {
+		f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+	}
 	return orig, orig2
 }
 
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 2448c1d99..47f78df9a 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -37,10 +38,16 @@ type FSContext struct {
 	// destroyed.
 	root *fs.Dirent
 
+	// rootVFS2 is the filesystem root.
+	rootVFS2 vfs.VirtualDentry
+
 	// cwd is the current working directory. Will be nil iff the FSContext
 	// has been destroyed.
 	cwd *fs.Dirent
 
+	// cwdVFS2 is the current working directory.
+	cwdVFS2 vfs.VirtualDentry
+
 	// umask is the current file mode creation mask. When a thread using this
 	// context invokes a syscall that creates a file, bits set in umask are
 	// removed from the permissions that the file is created with.
@@ -60,6 +67,19 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
 	return &f
 }
 
+// NewFSContextVFS2 returns a new filesystem context.
+func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
+	root.IncRef()
+	cwd.IncRef()
+	f := FSContext{
+		rootVFS2: root,
+		cwdVFS2:  cwd,
+		umask:    umask,
+	}
+	f.EnableLeakCheck("kernel.FSContext")
+	return &f
+}
+
 // destroy is the destructor for an FSContext.
 //
 // This will call DecRef on both root and cwd Dirents.  If either call to
@@ -75,11 +95,17 @@ func (f *FSContext) destroy() {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.root.DecRef()
-	f.root = nil
-
-	f.cwd.DecRef()
-	f.cwd = nil
+	if VFS2Enabled {
+		f.rootVFS2.DecRef()
+		f.rootVFS2 = vfs.VirtualDentry{}
+		f.cwdVFS2.DecRef()
+		f.cwdVFS2 = vfs.VirtualDentry{}
+	} else {
+		f.root.DecRef()
+		f.root = nil
+		f.cwd.DecRef()
+		f.cwd = nil
+	}
 }
 
 // DecRef implements RefCounter.DecRef with destructor f.destroy.
@@ -93,12 +119,21 @@ func (f *FSContext) DecRef() {
 func (f *FSContext) Fork() *FSContext {
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	f.cwd.IncRef()
-	f.root.IncRef()
+
+	if VFS2Enabled {
+		f.cwdVFS2.IncRef()
+		f.rootVFS2.IncRef()
+	} else {
+		f.cwd.IncRef()
+		f.root.IncRef()
+	}
+
 	return &FSContext{
-		cwd:   f.cwd,
-		root:  f.root,
-		umask: f.umask,
+		cwd:      f.cwd,
+		root:     f.root,
+		cwdVFS2:  f.cwdVFS2,
+		rootVFS2: f.rootVFS2,
+		umask:    f.umask,
 	}
 }
 
@@ -109,12 +144,23 @@ func (f *FSContext) Fork() *FSContext {
 func (f *FSContext) WorkingDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	if f.cwd != nil {
-		f.cwd.IncRef()
-	}
+
+	f.cwd.IncRef()
 	return f.cwd
 }
 
+// WorkingDirectoryVFS2 returns the current working directory.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.cwdVFS2.IncRef()
+	return f.cwdVFS2
+}
+
 // SetWorkingDirectory sets the current working directory.
 // This will take an extra reference on the Dirent.
 //
@@ -137,6 +183,20 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetWorkingDirectoryVFS2 sets the current working directory.
+// This will take an extra reference on the VirtualDentry.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	old := f.cwdVFS2
+	f.cwdVFS2 = d
+	d.IncRef()
+	old.DecRef()
+}
+
 // RootDirectory returns the current filesystem root.
 //
 // This will return nil if called after destroy(), otherwise it will return a
@@ -150,6 +210,18 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
 	return f.root
 }
 
+// RootDirectoryVFS2 returns the current filesystem root.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.rootVFS2.IncRef()
+	return f.rootVFS2
+}
+
 // SetRootDirectory sets the root directory.
 // This will take an extra reference on the Dirent.
 //
@@ -172,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
+//
+// This is not a valid call after free.
+func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
+	if !vd.Ok() {
+		panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
+	}
+
+	f.mu.Lock()
+
+	if !f.rootVFS2.Ok() {
+		f.mu.Unlock()
+		panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd))
+	}
+
+	old := f.rootVFS2
+	vd.IncRef()
+	f.rootVFS2 = vd
+	f.mu.Unlock()
+	old.DecRef()
+}
+
 // Umask returns the current umask.
 func (f *FSContext) Umask() uint {
 	f.mu.Lock()
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 3ee760ba2..5efeb3767 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -43,11 +43,17 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/eventchannel"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+	oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -71,6 +77,10 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
+// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow
+// easy access everywhere. To be removed once VFS2 becomes the default.
+var VFS2Enabled = false
+
 // Kernel represents an emulated Linux kernel. It must be initialized by calling
 // Init() or LoadFrom().
 //
@@ -105,7 +115,7 @@ type Kernel struct {
 	timekeeper                  *Timekeeper
 	tasks                       *TaskSet
 	rootUserNamespace           *auth.UserNamespace
-	networkStack                inet.Stack `state:"nosave"`
+	rootNetworkNamespace        *inet.Namespace
 	applicationCores            uint
 	useHostCores                bool
 	extraAuxv                   []arch.AuxEntry
@@ -238,6 +248,33 @@ type Kernel struct {
 
 	// SpecialOpts contains special kernel options.
 	SpecialOpts
+
+	// VFS keeps the filesystem state used across the kernel.
+	vfs vfs.VirtualFilesystem
+
+	// hostMount is the Mount used for file descriptors that were imported
+	// from the host.
+	hostMount *vfs.Mount
+
+	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
+	// syscalls (as opposed to named pipes created by mknod()).
+	pipeMount *vfs.Mount
+
+	// shmMount is the Mount used for anonymous files created by the
+	// memfd_create() syscalls. It is analagous to Linux's shm_mnt.
+	shmMount *vfs.Mount
+
+	// socketMount is the Mount used for sockets created by the socket() and
+	// socketpair() syscalls. There are several cases where a socket dentry will
+	// not be contained in socketMount:
+	// 1. Socket files created by mknod()
+	// 2. Socket fds imported from the host (Kernel.hostMount is used for these)
+	// 3. Socket files created by binding Unix sockets to a file path
+	socketMount *vfs.Mount
+
+	// If set to true, report address space activation waits as if the task is in
+	// external wait so that the watchdog doesn't report the task stuck.
+	SleepForAddressSpaceActivation bool
 }
 
 // InitKernelArgs holds arguments to Init.
@@ -251,8 +288,9 @@ type InitKernelArgs struct {
 	// RootUserNamespace is the root user namespace.
 	RootUserNamespace *auth.UserNamespace
 
-	// NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
-	NetworkStack inet.Stack
+	// RootNetworkNamespace is the root network namespace. If nil, no networking
+	// will be available.
+	RootNetworkNamespace *inet.Namespace
 
 	// ApplicationCores is the number of logical CPUs visible to sandboxed
 	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
@@ -297,6 +335,9 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	if args.Timekeeper == nil {
 		return fmt.Errorf("Timekeeper is nil")
 	}
+	if args.Timekeeper.clocks == nil {
+		return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()")
+	}
 	if args.RootUserNamespace == nil {
 		return fmt.Errorf("RootUserNamespace is nil")
 	}
@@ -311,7 +352,10 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.rootUTSNamespace = args.RootUTSNamespace
 	k.rootIPCNamespace = args.RootIPCNamespace
 	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
-	k.networkStack = args.NetworkStack
+	k.rootNetworkNamespace = args.RootNetworkNamespace
+	if k.rootNetworkNamespace == nil {
+		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil)
+	}
 	k.applicationCores = args.ApplicationCores
 	if args.UseHostCores {
 		k.useHostCores = true
@@ -331,6 +375,47 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
 	k.futexes = futex.NewManager()
 	k.netlinkPorts = port.New()
+
+	if VFS2Enabled {
+		if err := k.vfs.Init(); err != nil {
+			return fmt.Errorf("failed to initialize VFS: %v", err)
+		}
+
+		pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
+		if err != nil {
+			return fmt.Errorf("failed to create pipefs filesystem: %v", err)
+		}
+		defer pipeFilesystem.DecRef()
+		pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to create pipefs mount: %v", err)
+		}
+		k.pipeMount = pipeMount
+
+		tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
+		if err != nil {
+			return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
+		}
+		defer tmpfsFilesystem.DecRef()
+		defer tmpfsRoot.DecRef()
+		shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to create tmpfs mount: %v", err)
+		}
+		k.shmMount = shmMount
+
+		socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
+		if err != nil {
+			return fmt.Errorf("failed to create sockfs filesystem: %v", err)
+		}
+		defer socketFilesystem.DecRef()
+		socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to create sockfs mount: %v", err)
+		}
+		k.socketMount = socketMount
+	}
+
 	return nil
 }
 
@@ -450,6 +535,11 @@ func (k *Kernel) flushMountSourceRefs() error {
 //
 // Precondition: Must be called with the kernel paused.
 func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+	if VFS2Enabled {
+		return nil
+	}
+
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
@@ -467,7 +557,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
 }
 
 func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
-	// TODO(gvisor.dev/issues/1663): Add save support for VFS2.
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
 	return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
 		if flags := file.Flags(); !flags.Write {
 			return nil
@@ -516,17 +606,32 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
 }
 
 func (ts *TaskSet) unregisterEpollWaiters() {
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+	if VFS2Enabled {
+		return
+	}
+
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
+
+	// Tasks that belong to the same process could potentially point to the
+	// same FDTable. So we retain a map of processed ones to avoid
+	// processing the same FDTable multiple times.
+	processed := make(map[*FDTable]struct{})
 	for t := range ts.Root.tids {
 		// We can skip locking Task.mu here since the kernel is paused.
-		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-				if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
-					e.UnregisterEpollWaiters()
-				}
-			})
+		if t.fdTable == nil {
+			continue
 		}
+		if _, ok := processed[t.fdTable]; ok {
+			continue
+		}
+		t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+			if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
+				e.UnregisterEpollWaiters()
+			}
+		})
+		processed[t.fdTable] = struct{}{}
 	}
 }
 
@@ -534,8 +639,6 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
 	loadStart := time.Now()
 
-	k.networkStack = net
-
 	initAppCores := k.applicationCores
 
 	// Load the pre-saved CPUID FeatureSet.
@@ -566,6 +669,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	log.Infof("Kernel load stats: %s", &stats)
 	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
 
+	// rootNetworkNamespace should be populated after loading the state file.
+	// Restore the root network stack.
+	k.rootNetworkNamespace.RestoreRootStack(net)
+
 	// Load the memory file's state.
 	memoryStart := time.Now()
 	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
@@ -624,7 +731,7 @@ type CreateProcessArgs struct {
 	// File is a passed host FD pointing to a file to load as the init binary.
 	//
 	// This is checked if and only if Filename is "".
-	File *fs.File
+	File fsbridge.File
 
 	// Argvv is a list of arguments.
 	Argv []string
@@ -673,6 +780,13 @@ type CreateProcessArgs struct {
 	// increment it).
 	MountNamespace *fs.MountNamespace
 
+	// MountNamespaceVFS2 optionally contains the mount namespace for this
+	// process. If nil, the init process's mount namespace is used.
+	//
+	// Anyone setting MountNamespaceVFS2 must donate a reference (i.e.
+	// increment it).
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// ContainerID is the container that the process belongs to.
 	ContainerID string
 }
@@ -711,13 +825,26 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.args.Credentials
 	case fs.CtxRoot:
 		if ctx.args.MountNamespace != nil {
-			// MountNamespace.Root() will take a reference on the root
-			// dirent for us.
+			// MountNamespace.Root() will take a reference on the root dirent for us.
 			return ctx.args.MountNamespace.Root()
 		}
 		return nil
+	case vfs.CtxRoot:
+		if ctx.args.MountNamespaceVFS2 == nil {
+			return nil
+		}
+		// MountNamespaceVFS2.Root() takes a reference on the root dirent for us.
+		return ctx.args.MountNamespaceVFS2.Root()
+	case vfs.CtxMountNamespace:
+		if ctx.k.globalInit == nil {
+			return nil
+		}
+		// MountNamespaceVFS2 takes a reference for us.
+		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -757,34 +884,77 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	defer k.extMu.Unlock()
 	log.Infof("EXEC: %v", args.Argv)
 
-	// Grab the mount namespace.
-	mounts := args.MountNamespace
-	if mounts == nil {
-		mounts = k.GlobalInit().Leader().MountNamespace()
-		mounts.IncRef()
-	}
-
-	tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
 	ctx := args.NewContext(k)
 
-	// Get the root directory from the MountNamespace.
-	root := mounts.Root()
-	// The call to newFSContext below will take a reference on root, so we
-	// don't need to hold this one.
-	defer root.DecRef()
-
-	// Grab the working directory.
-	remainingTraversals := uint(args.MaxSymlinkTraversals)
-	wd := root // Default.
-	if args.WorkingDirectory != "" {
-		var err error
-		wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
-		if err != nil {
-			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+	var (
+		opener    fsbridge.Lookup
+		fsContext *FSContext
+		mntns     *fs.MountNamespace
+	)
+
+	if VFS2Enabled {
+		mntnsVFS2 := args.MountNamespaceVFS2
+		if mntnsVFS2 == nil {
+			// MountNamespaceVFS2 adds a reference to the namespace, which is
+			// transferred to the new process.
+			mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2()
+		}
+		// Get the root directory from the MountNamespace.
+		root := args.MountNamespaceVFS2.Root()
+		// The call to newFSContext below will take a reference on root, so we
+		// don't need to hold this one.
+		defer root.DecRef()
+
+		// Grab the working directory.
+		wd := root // Default.
+		if args.WorkingDirectory != "" {
+			pop := vfs.PathOperation{
+				Root:               root,
+				Start:              wd,
+				Path:               fspath.Parse(args.WorkingDirectory),
+				FollowFinalSymlink: true,
+			}
+			var err error
+			wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+			}
+			defer wd.DecRef()
+		}
+		opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
+		fsContext = NewFSContextVFS2(root, wd, args.Umask)
+
+	} else {
+		mntns = args.MountNamespace
+		if mntns == nil {
+			mntns = k.GlobalInit().Leader().MountNamespace()
+			mntns.IncRef()
 		}
-		defer wd.DecRef()
+		// Get the root directory from the MountNamespace.
+		root := mntns.Root()
+		// The call to newFSContext below will take a reference on root, so we
+		// don't need to hold this one.
+		defer root.DecRef()
+
+		// Grab the working directory.
+		remainingTraversals := args.MaxSymlinkTraversals
+		wd := root // Default.
+		if args.WorkingDirectory != "" {
+			var err error
+			wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+			if err != nil {
+				return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+			}
+			defer wd.DecRef()
+		}
+		opener = fsbridge.NewFSLookup(mntns, root, wd)
+		fsContext = newFSContext(root, wd, args.Umask)
 	}
 
+	tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
+
 	// Check which file to start from.
 	switch {
 	case args.Filename != "":
@@ -805,11 +975,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	}
 
 	// Create a fresh task context.
-	remainingTraversals = uint(args.MaxSymlinkTraversals)
+	remainingTraversals := args.MaxSymlinkTraversals
 	loadArgs := loader.LoadArgs{
-		Mounts:              mounts,
-		Root:                root,
-		WorkingDirectory:    wd,
+		Opener:              opener,
 		RemainingTraversals: &remainingTraversals,
 		ResolveFinal:        true,
 		Filename:            args.Filename,
@@ -834,13 +1002,15 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		Kernel:                  k,
 		ThreadGroup:             tg,
 		TaskContext:             tc,
-		FSContext:               newFSContext(root, wd, args.Umask),
+		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
+		MountNamespaceVFS2:      args.MountNamespaceVFS2,
 		ContainerID:             args.ContainerID,
 	}
 	t, err := k.tasks.NewTask(config)
@@ -923,9 +1093,15 @@ func (k *Kernel) pauseTimeLocked() {
 		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
 		// but ktime.Timer.Pause is idempotent so this is harmless.
 		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
-					tfd.PauseTimer()
+			t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+				if VFS2Enabled {
+					if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
+						tfd.PauseTimer()
+					}
+				} else {
+					if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok {
+						tfd.PauseTimer()
+					}
 				}
 			})
 		}
@@ -953,9 +1129,15 @@ func (k *Kernel) resumeTimeLocked() {
 			}
 		}
 		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
-					tfd.ResumeTimer()
+			t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
+				if VFS2Enabled {
+					if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
+						tfd.ResumeTimer()
+					}
+				} else {
+					if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok {
+						tfd.ResumeTimer()
+					}
 				}
 			})
 		}
@@ -1100,6 +1282,14 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
 	k.sendExternalSignal(info, context)
 }
 
+// SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup.
+// This function doesn't skip signals like SendExternalSignal does.
+func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	return tg.SendSignal(info)
+}
+
 // SendContainerSignal sends the given signal to all processes inside the
 // namespace that match the given container ID.
 func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
@@ -1178,10 +1368,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
 	return k.rootAbstractSocketNamespace
 }
 
-// NetworkStack returns the network stack. NetworkStack may return nil if no
-// network stack is available.
-func (k *Kernel) NetworkStack() inet.Stack {
-	return k.networkStack
+// RootNetworkNamespace returns the root network namespace, always non-nil.
+func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
+	return k.rootNetworkNamespace
 }
 
 // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
@@ -1311,9 +1500,10 @@ func (k *Kernel) SupervisorContext() context.Context {
 // +stateify savable
 type SocketEntry struct {
 	socketEntry
-	k    *Kernel
-	Sock *refs.WeakRef
-	ID   uint64 // Socket table entry number.
+	k        *Kernel
+	Sock     *refs.WeakRef
+	SockVFS2 *vfs.FileDescription
+	ID       uint64 // Socket table entry number.
 }
 
 // WeakRefGone implements refs.WeakRefUser.WeakRefGone.
@@ -1336,7 +1526,30 @@ func (k *Kernel) RecordSocket(sock *fs.File) {
 	k.extMu.Unlock()
 }
 
+// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for
+// tracking.
+//
+// Precondition: Caller must hold a reference to sock.
+//
+// Note that the socket table will not hold a reference on the
+// vfs.FileDescription, because we do not support weak refs on VFS2 files.
+func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
+	k.extMu.Lock()
+	id := k.nextSocketEntry
+	k.nextSocketEntry++
+	s := &SocketEntry{
+		k:        k,
+		ID:       id,
+		SockVFS2: sock,
+	}
+	k.sockets.PushBack(s)
+	k.extMu.Unlock()
+}
+
 // ListSockets returns a snapshot of all sockets.
+//
+// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef()
+// to get a reference on a socket in the table.
 func (k *Kernel) ListSockets() []*SocketEntry {
 	k.extMu.Lock()
 	var socks []*SocketEntry
@@ -1378,8 +1591,24 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 			return ctx.k.globalInit.mounts.Root()
 		}
 		return nil
+	case vfs.CtxRoot:
+		if ctx.k.globalInit == nil {
+			return vfs.VirtualDentry{}
+		}
+		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+		defer mntns.DecRef()
+		// Root() takes a reference on the root dirent for us.
+		return mntns.Root()
+	case vfs.CtxMountNamespace:
+		if ctx.k.globalInit == nil {
+			return nil
+		}
+		// MountNamespaceVFS2() takes a reference for us.
+		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -1423,3 +1652,36 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
 		Registers: t.Arch().StateData().Proto(),
 	})
 }
+
+// VFS returns the virtual filesystem for the kernel.
+func (k *Kernel) VFS() *vfs.VirtualFilesystem {
+	return &k.vfs
+}
+
+// SetHostMount sets the hostfs mount.
+func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
+	if k.hostMount != nil {
+		panic("Kernel.hostMount cannot be set more than once")
+	}
+	k.hostMount = mnt
+}
+
+// HostMount returns the hostfs mount.
+func (k *Kernel) HostMount() *vfs.Mount {
+	return k.hostMount
+}
+
+// PipeMount returns the pipefs mount.
+func (k *Kernel) PipeMount() *vfs.Mount {
+	return k.pipeMount
+}
+
+// ShmMount returns the tmpfs mount.
+func (k *Kernel) ShmMount() *vfs.Mount {
+	return k.shmMount
+}
+
+// SocketMount returns the sockfs mount.
+func (k *Kernel) SocketMount() *vfs.Mount {
+	return k.socketMount
+}
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 4c049d5b4..7bfa9075a 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,28 +1,14 @@
 load("//tools:defs.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-go_template_instance(
-    name = "buffer_list",
-    out = "buffer_list.go",
-    package = "pipe",
-    prefix = "buffer",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*buffer",
-        "Linker": "*buffer",
-    },
-)
-
 go_library(
     name = "pipe",
     srcs = [
-        "buffer.go",
-        "buffer_list.go",
         "device.go",
         "node.go",
         "pipe.go",
+        "pipe_unsafe.go",
         "pipe_util.go",
         "reader.go",
         "reader_writer.go",
@@ -33,6 +19,7 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/buffer",
         "//pkg/context",
         "//pkg/safemem",
         "//pkg/sentry/arch",
@@ -51,7 +38,6 @@ go_test(
     name = "pipe_test",
     size = "small",
     srcs = [
-        "buffer_test.go",
         "node_test.go",
         "pipe_test.go",
     ],
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
deleted file mode 100644
index fe3be5dbd..000000000
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pipe
-
-import (
-	"io"
-
-	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// buffer encapsulates a queueable byte buffer.
-//
-// Note that the total size is slightly less than two pages. This
-// is done intentionally to ensure that the buffer object aligns
-// with runtime internals. We have no hard size or alignment
-// requirements. This two page size will effectively minimize
-// internal fragmentation, but still have a large enough chunk
-// to limit excessive segmentation.
-//
-// +stateify savable
-type buffer struct {
-	data  [8144]byte
-	read  int
-	write int
-	bufferEntry
-}
-
-// Reset resets internal data.
-//
-// This must be called before use.
-func (b *buffer) Reset() {
-	b.read = 0
-	b.write = 0
-}
-
-// Empty indicates the buffer is empty.
-//
-// This indicates there is no data left to read.
-func (b *buffer) Empty() bool {
-	return b.read == b.write
-}
-
-// Full indicates the buffer is full.
-//
-// This indicates there is no capacity left to write.
-func (b *buffer) Full() bool {
-	return b.write == len(b.data)
-}
-
-// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
-func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:]))
-	n, err := safemem.CopySeq(dst, srcs)
-	b.write += int(n)
-	return n, err
-}
-
-// WriteFromReader writes to the buffer from an io.Reader.
-func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) {
-	dst := b.data[b.write:]
-	if count < int64(len(dst)) {
-		dst = b.data[b.write:][:count]
-	}
-	n, err := r.Read(dst)
-	b.write += n
-	return int64(n), err
-}
-
-// ReadToBlocks implements safemem.Reader.ReadToBlocks.
-func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write]))
-	n, err := safemem.CopySeq(dsts, src)
-	b.read += int(n)
-	return n, err
-}
-
-// ReadToWriter reads from the buffer into an io.Writer.
-func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) {
-	src := b.data[b.read:b.write]
-	if count < int64(len(src)) {
-		src = b.data[b.read:][:count]
-	}
-	n, err := w.Write(src)
-	if !dup {
-		b.read += n
-	}
-	return int64(n), err
-}
-
-// bufferPool is a pool for buffers.
-var bufferPool = sync.Pool{
-	New: func() interface{} {
-		return new(buffer)
-	},
-}
-
-// newBuffer grabs a new buffer from the pool.
-func newBuffer() *buffer {
-	b := bufferPool.Get().(*buffer)
-	b.Reset()
-	return b
-}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 08410283f..79645d7d2 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -20,6 +20,7 @@ import (
 	"sync/atomic"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -70,10 +71,10 @@ type Pipe struct {
 	// mu protects all pipe internal state below.
 	mu sync.Mutex `state:"nosave"`
 
-	// data is the buffer queue of pipe contents.
+	// view is the underlying set of buffers.
 	//
 	// This is protected by mu.
-	data bufferList
+	view buffer.View
 
 	// max is the maximum size of the pipe in bytes. When this max has been
 	// reached, writers will get EWOULDBLOCK.
@@ -81,11 +82,6 @@ type Pipe struct {
 	// This is protected by mu.
 	max int64
 
-	// size is the current size of the pipe in bytes.
-	//
-	// This is protected by mu.
-	size int64
-
 	// hadWriter indicates if this pipe ever had a writer. Note that this
 	// does not necessarily indicate there is *currently* a writer, just
 	// that there has been a writer at some point since the pipe was
@@ -196,7 +192,7 @@ type readOps struct {
 	limit func(int64)
 
 	// read performs the actual read operation.
-	read func(*buffer) (int64, error)
+	read func(*buffer.View) (int64, error)
 }
 
 // read reads data from the pipe into dst and returns the number of bytes
@@ -211,82 +207,27 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 
 	p.mu.Lock()
 	defer p.mu.Unlock()
-
-	// Is the pipe empty?
-	if p.size == 0 {
-		if !p.HasWriters() {
-			// There are no writers, return EOF.
-			return 0, nil
-		}
-		return 0, syserror.ErrWouldBlock
-	}
-
-	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
-	}
-
-	done := int64(0)
-	for ops.left() > 0 {
-		// Pop the first buffer.
-		first := p.data.Front()
-		if first == nil {
-			break
-		}
-
-		// Copy user data.
-		n, err := ops.read(first)
-		done += int64(n)
-		p.size -= n
-
-		// Empty buffer?
-		if first.Empty() {
-			// Push to the free list.
-			p.data.Remove(first)
-			bufferPool.Put(first)
-		}
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
+	return p.readLocked(ctx, ops)
 }
 
-// dup duplicates all data from this pipe into the given writer.
-//
-// There is no blocking behavior implemented here. The writer may propagate
-// some blocking error. All the writes must be complete writes.
-func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
+func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
 	// Is the pipe empty?
-	if p.size == 0 {
+	if p.view.Size() == 0 {
 		if !p.HasWriters() {
-			// See above.
+			// There are no writers, return EOF.
 			return 0, nil
 		}
 		return 0, syserror.ErrWouldBlock
 	}
 
 	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
+	if ops.left() > p.view.Size() {
+		ops.limit(p.view.Size())
 	}
 
-	done := int64(0)
-	for buf := p.data.Front(); buf != nil; buf = buf.Next() {
-		n, err := ops.read(buf)
-		done += n
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
+	// Copy user data; the read op is responsible for trimming.
+	done, err := ops.read(&p.view)
+	return done, err
 }
 
 type writeOps struct {
@@ -297,7 +238,7 @@ type writeOps struct {
 	limit func(int64)
 
 	// write should write to the provided buffer.
-	write func(*buffer) (int64, error)
+	write func(*buffer.View) (int64, error)
 }
 
 // write writes data from sv into the pipe and returns the number of bytes
@@ -308,7 +249,10 @@ type writeOps struct {
 func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	return p.writeLocked(ctx, ops)
+}
 
+func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
 	// Can't write to a pipe with no readers.
 	if !p.HasReaders() {
 		return 0, syscall.EPIPE
@@ -317,35 +261,28 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
 	wanted := ops.left()
-	if avail := p.max - p.size; wanted > avail {
+	avail := p.max - p.view.Size()
+	if wanted > avail {
 		if wanted <= p.atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
 		ops.limit(avail)
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Need a new buffer?
-		last := p.data.Back()
-		if last == nil || last.Full() {
-			// Add a new buffer to the data list.
-			last = newBuffer()
-			p.data.PushBack(last)
-		}
-
-		// Copy user data.
-		n, err := ops.write(last)
-		done += int64(n)
-		p.size += n
+	// Copy user data.
+	done, err := ops.write(&p.view)
+	if err != nil {
+		return done, err
+	}
 
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
+	if done < avail {
+		// Non-failure, but short write.
+		return done, nil
 	}
-	if wanted > done {
-		// Partial write due to full pipe.
+	if done < wanted {
+		// Partial write due to full pipe. Note that this could also be
+		// the short write case above, we would expect a second call
+		// and the write to return zero bytes in this case.
 		return done, syserror.ErrWouldBlock
 	}
 
@@ -396,7 +333,7 @@ func (p *Pipe) HasWriters() bool {
 // Precondition: mu must be held.
 func (p *Pipe) rReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasReaders() && p.data.Front() != nil {
+	if p.HasReaders() && p.view.Size() != 0 {
 		ready |= waiter.EventIn
 	}
 	if !p.HasWriters() && p.hadWriter {
@@ -422,7 +359,7 @@ func (p *Pipe) rReadiness() waiter.EventMask {
 // Precondition: mu must be held.
 func (p *Pipe) wReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasWriters() && p.size < p.max {
+	if p.HasWriters() && p.view.Size() < p.max {
 		ready |= waiter.EventOut
 	}
 	if !p.HasReaders() {
@@ -451,7 +388,7 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
 func (p *Pipe) queued() int64 {
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	return p.size
+	return p.view.Size()
 }
 
 // FifoSize implements fs.FifoSizer.FifoSize.
@@ -474,7 +411,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) {
 	}
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	if size < p.size {
+	if size < p.view.Size() {
 		return 0, syserror.EBUSY
 	}
 	p.max = size
diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go
index 4d54b8b8f..dd60cba24 100644
--- a/pkg/sentry/kernel/pipe/buffer_test.go
+++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go
@@ -15,18 +15,21 @@
 package pipe
 
 import (
-	"testing"
 	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func TestBufferSize(t *testing.T) {
-	bufferSize := unsafe.Sizeof(buffer{})
-	if bufferSize < usermem.PageSize {
-		t.Errorf("buffer is less than a page")
-	}
-	if bufferSize > (2 * usermem.PageSize) {
-		t.Errorf("buffer is greater than two pages")
+// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be
+// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that
+// concurrent calls cannot deadlock.
+//
+// Preconditions: x != y.
+func lockTwoPipes(x, y *Pipe) {
+	// Lock the two pipes in order of increasing address.
+	if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
+		x.mu.Lock()
+		y.mu.Lock()
+	} else {
+		y.mu.Lock()
+		x.mu.Lock()
 	}
 }
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 80158239e..aacf28da2 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -49,9 +50,10 @@ func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			dst = dst.TakeFirst64(l)
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := dst.CopyOutFrom(ctx, buf)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := dst.CopyOutFrom(ctx, view)
 			dst = dst.DropFirst64(n)
+			view.TrimFront(n)
 			return n, err
 		},
 	})
@@ -70,16 +72,15 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool)
 		limit: func(l int64) {
 			count = l
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := buf.ReadToWriter(w, count, dup)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadToWriter(w, count)
+			if !dup {
+				view.TrimFront(n)
+			}
 			count -= n
 			return n, err
 		},
 	}
-	if dup {
-		// There is no notification for dup operations.
-		return p.dup(ctx, ops)
-	}
 	n, err := p.read(ctx, ops)
 	if n > 0 {
 		p.Notify(waiter.EventOut)
@@ -96,8 +97,8 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			src = src.TakeFirst64(l)
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := src.CopyInTo(ctx, buf)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := src.CopyInTo(ctx, view)
 			src = src.DropFirst64(n)
 			return n, err
 		},
@@ -117,8 +118,8 @@ func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, e
 		limit: func(l int64) {
 			count = l
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := buf.WriteFromReader(r, count)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := view.WriteFromReader(r, count)
 			count -= n
 			return n, err
 		},
@@ -143,7 +144,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume
 		if v > math.MaxInt32 {
 			v = math.MaxInt32 // Silently truncate.
 		}
-		// Copy result to user-space.
+		// Copy result to userspace.
 		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
 			AddressSpaceActive: true,
 		})
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index a5675bd70..2602bed72 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -16,7 +16,9 @@ package pipe
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -49,38 +51,42 @@ type VFSPipe struct {
 }
 
 // NewVFSPipe returns an initialized VFSPipe.
-func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe {
+func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
 	var vp VFSPipe
-	initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes)
+	initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes)
 	return &vp
 }
 
-// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics
-// during open:
+// ReaderWriterPair returns read-only and write-only FDs for vp.
 //
-// "Normally, opening the FIFO blocks until the other end is opened also. A
-// process can open a FIFO in nonblocking mode. In this case, opening for
-// read-only will succeed even if no-one has opened on the write side yet,
-// opening for write-only will fail with ENXIO (no such device or address)
-// unless the other end has already been opened. Under Linux, opening a FIFO
-// for read and write will succeed both in blocking and nonblocking mode. POSIX
-// leaves this behavior undefined. This can be used to open a FIFO for writing
-// while there are no readers available." - fifo(7)
-func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
+// Preconditions: statusFlags should not contain an open access mode.
+func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
+	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+}
+
+// Open opens the pipe represented by vp.
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
 	vp.mu.Lock()
 	defer vp.mu.Unlock()
 
-	readable := vfs.MayReadFileWithOpenFlags(flags)
-	writable := vfs.MayWriteFileWithOpenFlags(flags)
+	readable := vfs.MayReadFileWithOpenFlags(statusFlags)
+	writable := vfs.MayWriteFileWithOpenFlags(statusFlags)
 	if !readable && !writable {
 		return nil, syserror.EINVAL
 	}
 
-	vfd, err := vp.open(vfsd, vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
+	fd := vp.newFD(mnt, vfsd, statusFlags)
 
+	// Named pipes have special blocking semantics during open:
+	//
+	// "Normally, opening the FIFO blocks until the other end is opened also. A
+	// process can open a FIFO in nonblocking mode. In this case, opening for
+	// read-only will succeed even if no-one has opened on the write side yet,
+	// opening for write-only will fail with ENXIO (no such device or address)
+	// unless the other end has already been opened. Under Linux, opening a
+	// FIFO for read and write will succeed both in blocking and nonblocking
+	// mode. POSIX leaves this behavior undefined. This can be used to open a
+	// FIFO for writing while there are no readers available." - fifo(7)
 	switch {
 	case readable && writable:
 		// Pipes opened for read-write always succeed without blocking.
@@ -89,23 +95,26 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf
 
 	case readable:
 		newHandleLocked(&vp.rWakeup)
-		// If this pipe is being opened as nonblocking and there's no
+		// If this pipe is being opened as blocking and there's no
 		// writer, we have to wait for a writer to open the other end.
-		if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+		if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+			fd.DecRef()
 			return nil, syserror.EINTR
 		}
 
 	case writable:
 		newHandleLocked(&vp.wWakeup)
 
-		if !vp.pipe.HasReaders() {
-			// Nonblocking, write-only opens fail with ENXIO when
-			// the read side isn't open yet.
-			if flags&linux.O_NONBLOCK != 0 {
+		if vp.pipe.isNamed && !vp.pipe.HasReaders() {
+			// Non-blocking, write-only opens fail with ENXIO when the read
+			// side isn't open yet.
+			if statusFlags&linux.O_NONBLOCK != 0 {
+				fd.DecRef()
 				return nil, syserror.ENXIO
 			}
 			// Wait for a reader to open the other end.
 			if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
+				fd.DecRef()
 				return nil, syserror.EINTR
 			}
 		}
@@ -114,96 +123,95 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	return vfd, nil
+	return fd, nil
 }
 
 // Preconditions: vp.mu must be held.
-func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
-	var fd VFSPipeFD
-	fd.flags = flags
-	fd.readable = vfs.MayReadFileWithOpenFlags(flags)
-	fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
-	fd.vfsfd = vfsfd
-	fd.pipe = &vp.pipe
+func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription {
+	fd := &VFSPipeFD{
+		pipe: &vp.pipe,
+	}
+	fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		UseDentryMetadata: true,
+	})
 
 	switch {
-	case fd.readable && fd.writable:
+	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
 		vp.pipe.rOpen()
 		vp.pipe.wOpen()
-	case fd.readable:
+	case fd.vfsfd.IsReadable():
 		vp.pipe.rOpen()
-	case fd.writable:
+	case fd.vfsfd.IsWritable():
 		vp.pipe.wOpen()
 	default:
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	return &fd, nil
+	return &fd.vfsfd
 }
 
-// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is
-// expected that filesystesm will use this in a struct implementing
-// vfs.FileDescriptionImpl.
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
+// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
+// other FileDescriptions for splice(2) and tee(2).
 type VFSPipeFD struct {
-	pipe     *Pipe
-	flags    uint32
-	readable bool
-	writable bool
-	vfsfd    *vfs.FileDescription
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	pipe *Pipe
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *VFSPipeFD) Release() {
 	var event waiter.EventMask
-	if fd.readable {
+	if fd.vfsfd.IsReadable() {
 		fd.pipe.rClose()
-		event |= waiter.EventIn
+		event |= waiter.EventOut
 	}
-	if fd.writable {
+	if fd.vfsfd.IsWritable() {
 		fd.pipe.wClose()
-		event |= waiter.EventOut
+		event |= waiter.EventIn | waiter.EventHUp
 	}
 	if event == 0 {
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	if fd.writable {
-		fd.vfsfd.VirtualDentry().Mount().EndWrite()
-	}
-
 	fd.pipe.Notify(event)
 }
 
-// OnClose implements vfs.FileDescriptionImpl.OnClose.
-func (fd *VFSPipeFD) OnClose(_ context.Context) error {
-	return nil
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	switch {
+	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
+		return fd.pipe.rwReadiness()
+	case fd.vfsfd.IsReadable():
+		return fd.pipe.rReadiness()
+	case fd.vfsfd.IsWritable():
+		return fd.pipe.wReadiness()
+	default:
+		panic("pipe FD is neither readable nor writable")
+	}
 }
 
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) {
-	return 0, syserror.ESPIPE
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.pipe.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) {
+	fd.pipe.EventUnregister(e)
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
 func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
-	if !fd.readable {
-		return 0, syserror.EINVAL
-	}
-
 	return fd.pipe.Read(ctx, dst)
 }
 
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) {
-	return 0, syserror.ESPIPE
-}
-
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
-	if !fd.writable {
-		return 0, syserror.EINVAL
-	}
-
 	return fd.pipe.Write(ctx, src)
 }
 
@@ -211,3 +219,230 @@ func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.Wr
 func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	return fd.pipe.Ioctl(ctx, uio, args)
 }
+
+// PipeSize implements fcntl(F_GETPIPE_SZ).
+func (fd *VFSPipeFD) PipeSize() int64 {
+	// Inline Pipe.FifoSize() rather than calling it with nil Context and
+	// fs.File and ignoring the returned error (which is always nil).
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+	return fd.pipe.max
+}
+
+// SetPipeSize implements fcntl(F_SETPIPE_SZ).
+func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
+	return fd.pipe.SetFifoSize(size)
+}
+
+// IOSequence returns a useremm.IOSequence that reads up to count bytes from,
+// or writes up to count bytes to, fd.
+func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
+	return usermem.IOSequence{
+		IO:    fd,
+		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+	}
+}
+
+// CopyIn implements usermem.IO.CopyIn.
+func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
+	origCount := int64(len(dst))
+	n, err := fd.pipe.read(ctx, readOps{
+		left: func() int64 {
+			return int64(len(dst))
+		},
+		limit: func(l int64) {
+			dst = dst[:l]
+		},
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadAt(dst, 0)
+			view.TrimFront(int64(n))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventOut)
+	}
+	if err == nil && n != origCount {
+		return int(n), syserror.ErrWouldBlock
+	}
+	return int(n), err
+}
+
+// CopyOut implements usermem.IO.CopyOut.
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
+	origCount := int64(len(src))
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return int64(len(src))
+		},
+		limit: func(l int64) {
+			src = src[:l]
+		},
+		write: func(view *buffer.View) (int64, error) {
+			view.Append(src)
+			return int64(len(src)), nil
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return int(n), syserror.ErrWouldBlock
+	}
+	return int(n), err
+}
+
+// ZeroOut implements usermem.IO.ZeroOut.
+func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
+	origCount := toZero
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return toZero
+		},
+		limit: func(l int64) {
+			toZero = l
+		},
+		write: func(view *buffer.View) (int64, error) {
+			view.Grow(view.Size()+toZero, true /* zero */)
+			return toZero, nil
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// CopyInTo implements usermem.IO.CopyInTo.
+func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
+	count := ars.NumBytes()
+	if count == 0 {
+		return 0, nil
+	}
+	origCount := count
+	n, err := fd.pipe.read(ctx, readOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadToSafememWriter(dst, uint64(count))
+			view.TrimFront(int64(n))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventOut)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// CopyOutFrom implements usermem.IO.CopyOutFrom.
+func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
+	count := ars.NumBytes()
+	if count == 0 {
+		return 0, nil
+	}
+	origCount := count
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		write: func(view *buffer.View) (int64, error) {
+			n, err := view.WriteFromSafememReader(src, uint64(count))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// SwapUint32 implements usermem.IO.SwapUint32.
+func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
+	// How did a pipe get passed as the virtual address space to futex(2)?
+	panic("VFSPipeFD.SwapUint32 called unexpectedly")
+}
+
+// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
+func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
+	panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly")
+}
+
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+	panic("VFSPipeFD.LoadUint32 called unexpectedly")
+}
+
+// Splice reads up to count bytes from src and writes them to dst. It returns
+// the number of bytes moved.
+//
+// Preconditions: count > 0.
+func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+	return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */)
+}
+
+// Tee reads up to count bytes from src and writes them to dst, without
+// removing the read bytes from src. It returns the number of bytes copied.
+//
+// Preconditions: count > 0.
+func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+	return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */)
+}
+
+// Preconditions: count > 0.
+func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) {
+	if dst.pipe == src.pipe {
+		return 0, syserror.EINVAL
+	}
+
+	lockTwoPipes(dst.pipe, src.pipe)
+	defer dst.pipe.mu.Unlock()
+	defer src.pipe.mu.Unlock()
+
+	n, err := dst.pipe.writeLocked(ctx, writeOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		write: func(dstView *buffer.View) (int64, error) {
+			return src.pipe.readLocked(ctx, readOps{
+				left: func() int64 {
+					return count
+				},
+				limit: func(l int64) {
+					count = l
+				},
+				read: func(srcView *buffer.View) (int64, error) {
+					n, err := srcView.ReadToSafememWriter(dstView, uint64(count))
+					if n > 0 && removeFromSrc {
+						srcView.TrimFront(int64(n))
+					}
+					return int64(n), err
+				},
+			})
+		},
+	})
+	if n > 0 {
+		dst.pipe.Notify(waiter.EventIn)
+		src.pipe.Notify(waiter.EventOut)
+	}
+	return n, err
+}
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 35ad97d5d..e23e796ef 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -184,7 +184,6 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
 	if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 {
 		return false
 	}
-	// TODO: Yama LSM
 	return true
 }
 
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index efebfd872..18416643b 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -303,26 +303,14 @@ func (t *Task) rseqAddrInterrupt() {
 		return
 	}
 
-	buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
-	if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+	var cs linux.RSeqCriticalSection
+	if _, err := cs.CopyIn(t, critAddr); err != nil {
 		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
 		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
 		return
 	}
 
-	// Manually marshal RSeqCriticalSection as this is in the hot path when
-	// rseq is enabled. It must be as fast as possible.
-	//
-	// TODO(b/130243041): Replace with go_marshal.
-	cs := linux.RSeqCriticalSection{
-		Version:          usermem.ByteOrder.Uint32(buf[0:4]),
-		Flags:            usermem.ByteOrder.Uint32(buf[4:8]),
-		Start:            usermem.ByteOrder.Uint64(buf[8:16]),
-		PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
-		Abort:            usermem.ByteOrder.Uint64(buf[24:32]),
-	}
-
 	if cs.Version != 0 {
 		t.Debugf("Unknown version in %+v", cs)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 1000f3287..c00fa1138 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -554,6 +554,7 @@ func (s *sem) wakeWaiters() {
 	for w := s.waiters.Front(); w != nil; {
 		if s.value < w.value {
 			// Still blocked, skip it.
+			w = w.Next()
 			continue
 		}
 		w.ch <- struct{}{}
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 047b5214d..0e19286de 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -246,7 +246,7 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
 
 	var lastErr error
 	for tg := range tasks.Root.tgids {
-		if tg.ProcessGroup() == pg {
+		if tg.processGroup == pg {
 			tg.signalHandlers.mu.Lock()
 			infoCopy := *info
 			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 208569057..f66cfcc7f 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -461,7 +461,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A
 func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	// TODO(b/38173783): RemoveMapping may be called during task exit, when ctx
+	// RemoveMapping may be called during task exit, when ctx
 	// is context.Background. Gracefully handle missing clocks. Failing to
 	// update the detach time in these cases is ok, since no one can observe the
 	// omission.
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 93c4fe969..413111faf 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -29,7 +29,7 @@ import (
 //
 // The types below create fast lookup slices for all syscalls. This maximum
 // serves as a sanity check that we don't allocate huge slices for a very large
-// syscall.
+// syscall. This is checked during registration.
 const maxSyscallNum = 2000
 
 // SyscallSupportLevel is a syscall support levels.
@@ -209,65 +209,71 @@ type Stracer interface {
 	// SyscallEnter is called on syscall entry.
 	//
 	// The returned private data is passed to SyscallExit.
-	//
-	// TODO(gvisor.dev/issue/155): remove kernel imports from the strace
-	// package so that the type can be used directly.
 	SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{}
 
 	// SyscallExit is called on syscall exit.
 	SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error)
 }
 
-// SyscallTable is a lookup table of system calls. Critically, a SyscallTable
-// is *immutable*. In order to make supporting suspend and resume sane, they
-// must be uniquely registered and may not change during operation.
+// SyscallTable is a lookup table of system calls.
 //
-// +stateify savable
+// Note that a SyscallTable is not savable directly. Instead, they are saved as
+// an OS/Arch pair and lookup happens again on restore.
 type SyscallTable struct {
 	// OS is the operating system that this syscall table implements.
-	OS abi.OS `state:"wait"`
+	OS abi.OS
 
 	// Arch is the architecture that this syscall table targets.
-	Arch arch.Arch `state:"wait"`
+	Arch arch.Arch
 
 	// The OS version that this syscall table implements.
-	Version Version `state:"manual"`
+	Version Version
 
 	// AuditNumber is a numeric constant that represents the syscall table. If
 	// non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by
 	// linux/audit.h.
-	AuditNumber uint32 `state:"manual"`
+	AuditNumber uint32
 
 	// Table is the collection of functions.
-	Table map[uintptr]Syscall `state:"manual"`
+	Table map[uintptr]Syscall
 
 	// lookup is a fixed-size array that holds the syscalls (indexed by
 	// their numbers). It is used for fast look ups.
-	lookup []SyscallFn `state:"manual"`
+	lookup []SyscallFn
 
 	// Emulate is a collection of instruction addresses to emulate. The
 	// keys are addresses, and the values are system call numbers.
-	Emulate map[usermem.Addr]uintptr `state:"manual"`
+	Emulate map[usermem.Addr]uintptr
 
 	// The function to call in case of a missing system call.
-	Missing MissingFn `state:"manual"`
+	Missing MissingFn
 
 	// Stracer traces this syscall table.
-	Stracer Stracer `state:"manual"`
+	Stracer Stracer
 
 	// External is used to handle an external callback.
-	External func(*Kernel) `state:"manual"`
+	External func(*Kernel)
 
 	// ExternalFilterBefore is called before External is called before the syscall is executed.
 	// External is not called if it returns false.
-	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool
 
 	// ExternalFilterAfter is called before External is called after the syscall is executed.
 	// External is not called if it returns false.
-	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool
 
 	// FeatureEnable stores the strace and one-shot enable bits.
-	FeatureEnable SyscallFlagsTable `state:"manual"`
+	FeatureEnable SyscallFlagsTable
+}
+
+// MaxSysno returns the largest system call number.
+func (s *SyscallTable) MaxSysno() (max uintptr) {
+	for num := range s.Table {
+		if num > max {
+			max = num
+		}
+	}
+	return max
 }
 
 // allSyscallTables contains all known tables.
@@ -290,6 +296,20 @@ func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
 
 // RegisterSyscallTable registers a new syscall table for use by a Kernel.
 func RegisterSyscallTable(s *SyscallTable) {
+	if max := s.MaxSysno(); max > maxSyscallNum {
+		panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
+	}
+	if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
+		panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
+	}
+	allSyscallTables = append(allSyscallTables, s)
+	s.Init()
+}
+
+// Init initializes the system call table.
+//
+// This should normally be called only during registration.
+func (s *SyscallTable) Init() {
 	if s.Table == nil {
 		// Ensure non-nil lookup table.
 		s.Table = make(map[uintptr]Syscall)
@@ -299,35 +319,16 @@ func RegisterSyscallTable(s *SyscallTable) {
 		s.Emulate = make(map[usermem.Addr]uintptr)
 	}
 
-	var max uintptr
-	for num := range s.Table {
-		if num > max {
-			max = num
-		}
-	}
-
-	if max > maxSyscallNum {
-		panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
-	}
-
-	s.lookup = make([]SyscallFn, max+1)
+	max := s.MaxSysno() // Checked during RegisterSyscallTable.
 
 	// Initialize the fast-lookup table.
+	s.lookup = make([]SyscallFn, max+1)
 	for num, sc := range s.Table {
 		s.lookup[num] = sc.Fn
 	}
 
+	// Initialize all features.
 	s.FeatureEnable.init(s.Table, max)
-
-	if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
-		panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
-	}
-
-	// Save a reference to this table.
-	//
-	// This is required for a Kernel to find the table and for save/restore
-	// operations below.
-	allSyscallTables = append(allSyscallTables, s)
 }
 
 // Lookup returns the syscall implementation, if one exists.
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
index 00358326b..90f890495 100644
--- a/pkg/sentry/kernel/syscalls_state.go
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -14,16 +14,34 @@
 
 package kernel
 
-import "fmt"
+import (
+	"fmt"
 
-// afterLoad is invoked by stateify.
-func (s *SyscallTable) afterLoad() {
-	otherTable, ok := LookupSyscallTable(s.OS, s.Arch)
-	if !ok {
-		// Couldn't find a reference?
-		panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch))
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+// syscallTableInfo is used to reload the SyscallTable.
+//
+// +stateify savable
+type syscallTableInfo struct {
+	OS   abi.OS
+	Arch arch.Arch
+}
+
+// saveSt saves the SyscallTable.
+func (tc *TaskContext) saveSt() syscallTableInfo {
+	return syscallTableInfo{
+		OS:   tc.st.OS,
+		Arch: tc.st.Arch,
 	}
+}
 
-	// Copy the table.
-	*s = *otherTable
+// loadSt loads the SyscallTable.
+func (tc *TaskContext) loadSt(sti syscallTableInfo) {
+	st, ok := LookupSyscallTable(sti.OS, sti.Arch)
+	if !ok {
+		panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch))
+	}
+	tc.st = st // Save the table reference.
 }
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 981e8c7fe..f48247c94 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -424,6 +425,11 @@ type Task struct {
 	// abstractSockets is protected by mu.
 	abstractSockets *AbstractSocketNamespace
 
+	// mountNamespaceVFS2 is the task's mount namespace.
+	//
+	// It is protected by mu. It is owned by the task goroutine.
+	mountNamespaceVFS2 *vfs.MountNamespace
+
 	// parentDeathSignal is sent to this task's thread group when its parent exits.
 	//
 	// parentDeathSignal is protected by mu.
@@ -478,16 +484,13 @@ type Task struct {
 	// bit.
 	//
 	// numaPolicy and numaNodeMask are protected by mu.
-	numaPolicy   int32
+	numaPolicy   linux.NumaPolicy
 	numaNodeMask uint64
 
-	// If netns is true, the task is in a non-root network namespace. Network
-	// namespaces aren't currently implemented in full; being in a network
-	// namespace simply prevents the task from observing any network devices
-	// (including loopback) or using abstract socket addresses (see unix(7)).
+	// netns is the task's network namespace. netns is never nil.
 	//
-	// netns is protected by mu. netns is owned by the task goroutine.
-	netns bool
+	// netns is protected by mu.
+	netns *inet.Namespace
 
 	// If rseqPreempted is true, before the next call to p.Switch(),
 	// interrupt rseq critical regions as defined by rseqAddr and
@@ -638,6 +641,11 @@ func (t *Task) Value(key interface{}) interface{} {
 		return int32(t.ThreadGroup().ID())
 	case fs.CtxRoot:
 		return t.fsContext.RootDirectory()
+	case vfs.CtxRoot:
+		return t.fsContext.RootDirectoryVFS2()
+	case vfs.CtxMountNamespace:
+		t.mountNamespaceVFS2.IncRef()
+		return t.mountNamespaceVFS2
 	case fs.CtxDirentCacheLimiter:
 		return t.k.DirentCacheLimiter
 	case inet.CtxStack:
@@ -701,6 +709,14 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
 // Preconditions: The caller must be running on the task goroutine, or t.mu
 // must be locked.
 func (t *Task) IsChrooted() bool {
+	if VFS2Enabled {
+		realRoot := t.mountNamespaceVFS2.Root()
+		defer realRoot.DecRef()
+		root := t.fsContext.RootDirectoryVFS2()
+		defer root.DecRef()
+		return root != realRoot
+	}
+
 	realRoot := t.tg.mounts.Root()
 	defer realRoot.DecRef()
 	root := t.fsContext.RootDirectory()
@@ -761,6 +777,15 @@ func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error
 	return t.fdTable.NewFDs(t, fd, files, flags)
 }
 
+// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
+	return t.fdTable.NewFDsVFS2(t, fd, files, flags)
+}
+
 // NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
 //
 // This automatically passes the task as the context.
@@ -774,6 +799,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error)
 	return fds[0], nil
 }
 
+// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	return t.fdTable.NewFDVFS2(t, fd, file, flags)
+}
+
 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
 //
 // This automatically passes the task as the context.
@@ -783,6 +817,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
 	return t.fdTable.NewFDAt(t, fd, file, flags)
 }
 
+// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
+	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
+}
+
 // WithMuLocked executes f with t.mu locked.
 func (t *Task) WithMuLocked(f func(*Task)) {
 	t.mu.Lock()
@@ -796,6 +839,15 @@ func (t *Task) MountNamespace() *fs.MountNamespace {
 	return t.tg.mounts
 }
 
+// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
+// returned mount namespace.
+func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mountNamespaceVFS2.IncRef()
+	return t.mountNamespaceVFS2
+}
+
 // AbstractSockets returns t's AbstractSocketNamespace.
 func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 	return t.abstractSockets
@@ -805,3 +857,30 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 func (t *Task) ContainerID() string {
 	return t.containerID
 }
+
+// OOMScoreAdj gets the task's thread group's OOM score adjustment.
+func (t *Task) OOMScoreAdj() int32 {
+	return atomic.LoadInt32(&t.tg.oomScoreAdj)
+}
+
+// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The
+// value should be between -1000 and 1000 inclusive.
+func (t *Task) SetOOMScoreAdj(adj int32) error {
+	if adj > 1000 || adj < -1000 {
+		return syserror.EINVAL
+	}
+	atomic.StoreInt32(&t.tg.oomScoreAdj, adj)
+	return nil
+}
+
+// UID returns t's uid.
+// TODO(gvisor.dev/issue/170): This method is not namespaced yet.
+func (t *Task) UID() uint32 {
+	return uint32(t.Credentials().EffectiveKUID)
+}
+
+// GID returns t's gid.
+// TODO(gvisor.dev/issue/170): This method is not namespaced yet.
+func (t *Task) GID() uint32 {
+	return uint32(t.Credentials().EffectiveKGID)
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 53d4d211b..e1ecca99e 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -15,8 +15,11 @@
 package kernel
 
 import (
+	"sync/atomic"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -54,8 +57,7 @@ type SharingOptions struct {
 	NewUserNamespace bool
 
 	// If NewNetworkNamespace is true, the task should have an independent
-	// network namespace. (Note that network namespaces are not really
-	// implemented; see comment on Task.netns for details.)
+	// network namespace.
 	NewNetworkNamespace bool
 
 	// If NewFiles is true, the task should use an independent file descriptor
@@ -199,6 +201,17 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ipcns = NewIPCNamespace(userns)
 	}
 
+	netns := t.NetworkNamespace()
+	if opts.NewNetworkNamespace {
+		netns = inet.NewNamespace(netns)
+	}
+
+	// TODO(b/63601033): Implement CLONE_NEWNS.
+	mntnsVFS2 := t.mountNamespaceVFS2
+	if mntnsVFS2 != nil {
+		mntnsVFS2.IncRef()
+	}
+
 	tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
 	if err != nil {
 		return 0, nil, err
@@ -241,12 +254,15 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	rseqAddr := usermem.Addr(0)
 	rseqSignature := uint32(0)
 	if opts.NewThreadGroup {
-		tg.mounts.IncRef()
+		if tg.mounts != nil {
+			tg.mounts.IncRef()
+		}
 		sh := t.tg.signalHandlers
 		if opts.NewSignalHandlers {
 			sh = sh.Fork()
 		}
 		tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
+		tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj)
 		rseqAddr = t.rseqAddr
 		rseqSignature = t.rseqSignature
 	}
@@ -260,11 +276,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		FDTable:                 fdTable,
 		Credentials:             creds,
 		Niceness:                t.Niceness(),
-		NetworkNamespaced:       t.netns,
+		NetworkNamespace:        netns,
 		AllowedCPUMask:          t.CPUMask(),
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
 		AbstractSocketNamespace: t.abstractSockets,
+		MountNamespaceVFS2:      mntnsVFS2,
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
@@ -274,9 +291,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else {
 		cfg.InheritParent = t
 	}
-	if opts.NewNetworkNamespace {
-		cfg.NetworkNamespaced = true
-	}
 	nt, err := t.tg.pidns.owner.NewTask(cfg)
 	if err != nil {
 		if opts.NewThreadGroup {
@@ -473,7 +487,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 			t.mu.Unlock()
 			return syserror.EPERM
 		}
-		t.netns = true
+		t.netns = inet.NewNamespace(t.netns)
 	}
 	if opts.NewUTSNamespace {
 		if !haveCapSysAdmin {
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 2d6e7733c..9fa528384 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -49,7 +49,7 @@ type TaskContext struct {
 	fu *futex.Manager
 
 	// st is the task's syscall table.
-	st *SyscallTable
+	st *SyscallTable `state:".(syscallTableInfo)"`
 }
 
 // release releases all resources held by the TaskContext. release is called by
@@ -58,7 +58,6 @@ func (tc *TaskContext) release() {
 	// Nil out pointers so that if the task is saved after release, it doesn't
 	// follow the pointers to possibly now-invalid objects.
 	if tc.MemoryManager != nil {
-		// TODO(b/38173783)
 		tc.MemoryManager.DecUsers(context.Background())
 		tc.MemoryManager = nil
 	}
@@ -136,11 +135,11 @@ func (t *Task) Stack() *arch.Stack {
 func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) {
 	// If File is not nil, we should load that instead of resolving Filename.
 	if args.File != nil {
-		args.Filename = args.File.MappedName(ctx)
+		args.Filename = args.File.PathnameWithDeleted(ctx)
 	}
 
 	// Prepare a new user address space to load into.
-	m := mm.NewMemoryManager(k, k)
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
 	defer m.DecUsers(ctx)
 	args.MemoryManager = m
 
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 8f57a34a6..9b69f3cbe 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -198,6 +198,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	t.tg.pidns.owner.mu.Unlock()
 
+	oldFDTable := t.fdTable
+	t.fdTable = t.fdTable.Fork()
+	oldFDTable.DecRef()
+
 	// Remove FDs with the CloseOnExec flag set.
 	t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
 		return flags.CloseOnExec
@@ -220,7 +224,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.mu.Unlock()
 	t.unstopVforkParent()
 	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
-	t.MemoryManager().Activate()
+	t.MemoryManager().Activate(t)
 
 	t.ptraceExec(oldTID)
 	return (*runSyscallExit)(nil)
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 435761e5a..c4ade6e8e 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -269,6 +269,13 @@ func (*runExitMain) execute(t *Task) taskRunState {
 	t.fsContext.DecRef()
 	t.fdTable.DecRef()
 
+	t.mu.Lock()
+	if t.mountNamespaceVFS2 != nil {
+		t.mountNamespaceVFS2.DecRef()
+		t.mountNamespaceVFS2 = nil
+	}
+	t.mu.Unlock()
+
 	// If this is the last task to exit from the thread group, release the
 	// thread group's resources.
 	if lastExiter {
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index ce3e6ef28..0325967e4 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -455,7 +455,7 @@ func (t *Task) SetKeepCaps(k bool) {
 	t.creds.Store(creds)
 }
 
-// updateCredsForExec updates t.creds to reflect an execve().
+// updateCredsForExecLocked updates t.creds to reflect an execve().
 //
 // NOTE(b/30815691): We currently do not implement privileged executables
 // (set-user/group-ID bits and file capabilities). This allows us to make a lot
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 41259210c..eeccaa197 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -32,21 +32,21 @@ const (
 // Infof logs an formatted info message by calling log.Infof.
 func (t *Task) Infof(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Info) {
-		log.Infof(t.logPrefix.Load().(string)+fmt, v...)
+		log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Warningf logs a warning string by calling log.Warningf.
 func (t *Task) Warningf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Warning) {
-		log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
+		log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Debugf creates a debug string that includes the task ID.
 func (t *Task) Debugf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Debug) {
-		log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
+		log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
@@ -198,18 +198,11 @@ func (t *Task) traceExecEvent(tc *TaskContext) {
 	if !trace.IsEnabled() {
 		return
 	}
-	d := tc.MemoryManager.Executable()
-	if d == nil {
+	file := tc.MemoryManager.Executable()
+	if file == nil {
 		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
 		return
 	}
-	defer d.DecRef()
-	root := t.fsContext.RootDirectory()
-	if root == nil {
-		trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
-		return
-	}
-	defer root.DecRef()
-	n, _ := d.FullName(root)
-	trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
+	defer file.DecRef()
+	trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
 }
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
index 172a31e1d..f7711232c 100644
--- a/pkg/sentry/kernel/task_net.go
+++ b/pkg/sentry/kernel/task_net.go
@@ -22,14 +22,23 @@ import (
 func (t *Task) IsNetworkNamespaced() bool {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	return t.netns
+	return !t.netns.IsRoot()
 }
 
 // NetworkContext returns the network stack used by the task. NetworkContext
 // may return nil if no network stack is available.
+//
+// TODO(gvisor.dev/issue/1833): Migrate callers of this method to
+// NetworkNamespace().
 func (t *Task) NetworkContext() inet.Stack {
-	if t.IsNetworkNamespaced() {
-		return nil
-	}
-	return t.k.networkStack
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns.Stack()
+}
+
+// NetworkNamespace returns the network namespace observed by the task.
+func (t *Task) NetworkNamespace() *inet.Namespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns
 }
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 5568c91bc..d654dd997 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -96,6 +96,7 @@ func (t *Task) run(threadID uintptr) {
 			t.tg.liveGoroutines.Done()
 			t.tg.pidns.owner.liveGoroutines.Done()
 			t.tg.pidns.owner.runningGoroutines.Done()
+			t.p.Release()
 
 			// Keep argument alive because stack trace for dead variables may not be correct.
 			runtime.KeepAlive(threadID)
@@ -126,13 +127,39 @@ func (t *Task) doStop() {
 	}
 }
 
+func (*runApp) handleCPUIDInstruction(t *Task) error {
+	if len(arch.CPUIDInstruction) == 0 {
+		// CPUID emulation isn't supported, but this code can be
+		// executed, because the ptrace platform returns
+		// ErrContextSignalCPUID on page faults too. Look at
+		// pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more
+		// details.
+		return platform.ErrContextSignal
+	}
+	// Is this a CPUID instruction?
+	region := trace.StartRegion(t.traceContext, cpuidRegion)
+	expected := arch.CPUIDInstruction[:]
+	found := make([]byte, len(expected))
+	_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+	if err == nil && bytes.Equal(expected, found) {
+		// Skip the cpuid instruction.
+		t.Arch().CPUIDEmulate(t)
+		t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+		region.End()
+
+		return nil
+	}
+	region.End() // Not an actual CPUID, but required copy-in.
+	return platform.ErrContextSignal
+}
+
 // The runApp state checks for interrupts before executing untrusted
 // application code.
 //
 // +stateify savable
 type runApp struct{}
 
-func (*runApp) execute(t *Task) taskRunState {
+func (app *runApp) execute(t *Task) taskRunState {
 	if t.interrupted() {
 		// Checkpointing instructs tasks to stop by sending an interrupt, so we
 		// must check for stops before entering runInterrupt (instead of
@@ -237,21 +264,10 @@ func (*runApp) execute(t *Task) taskRunState {
 		return (*runApp)(nil)
 
 	case platform.ErrContextSignalCPUID:
-		// Is this a CPUID instruction?
-		region := trace.StartRegion(t.traceContext, cpuidRegion)
-		expected := arch.CPUIDInstruction[:]
-		found := make([]byte, len(expected))
-		_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
-		if err == nil && bytes.Equal(expected, found) {
-			// Skip the cpuid instruction.
-			t.Arch().CPUIDEmulate(t)
-			t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
-			region.End()
-
+		if err := app.handleCPUIDInstruction(t); err == nil {
 			// Resume execution.
 			return (*runApp)(nil)
 		}
-		region.End() // Not an actual CPUID, but required copy-in.
 
 		// The instruction at the given RIP was not a CPUID, and we
 		// fallthrough to the default signal deliver behavior below.
@@ -338,7 +354,7 @@ func (*runApp) execute(t *Task) taskRunState {
 	default:
 		// What happened? Can't continue.
 		t.Warningf("Unexpected SwitchToApp error: %v", err)
-		t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)})
+		t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)})
 		return (*runExit)(nil)
 	}
 }
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 8b148db35..09366b60c 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -653,14 +653,14 @@ func (t *Task) SetNiceness(n int) {
 }
 
 // NumaPolicy returns t's current numa policy.
-func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) {
+func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	return t.numaPolicy, t.numaNodeMask
 }
 
 // SetNumaPolicy sets t's numa policy.
-func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) {
+func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 	t.numaPolicy = policy
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 8802db142..79766cafe 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -174,7 +174,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS
 					fallthrough
 				case (sre == ERESTARTSYS && !act.IsRestart()):
 					t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
-					t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1)))
+					t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1)))
 				default:
 					t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
 					t.Arch().RestartSyscall()
@@ -263,6 +263,19 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct)
 	if t.haveSavedSignalMask {
 		mask = t.savedSignalMask
 	}
+
+	// Set up the restorer.
+	// x86-64 should always uses SA_RESTORER, but this flag is optional on other platforms.
+	// Please see the linux code as reference:
+	// linux/arch/x86/kernel/signal.c:__setup_rt_frame()
+	// If SA_RESTORER is not configured, we can use the sigreturn trampolines
+	// the vdso provides instead.
+	// Please see the linux code as reference:
+	// linux/arch/arm64/kernel/signal.c:setup_return()
+	if act.Flags&linux.SA_RESTORER == 0 {
+		act.Restorer = t.MemoryManager().VDSOSigReturn()
+	}
+
 	if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
 		return err
 	}
@@ -513,8 +526,6 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
 	if t.stop != nil {
 		return false
 	}
-	// - TODO(b/38173783): No special case for when t is also the sending task,
-	// because the identity of the sender is unknown.
 	// - Do not choose tasks that have already been interrupted, as they may be
 	// busy handling another signal.
 	if len(t.interruptChan) != 0 {
@@ -705,7 +716,7 @@ func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (a
 func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
 	n := t.Arch().NewSignalAct()
 	n.SerializeFrom(s)
-	_, err := t.CopyOut(addr, n)
+	_, err := n.CopyOut(t, addr)
 	return err
 }
 
@@ -714,7 +725,7 @@ func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
 func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
 	n := t.Arch().NewSignalAct()
 	var s arch.SignalAct
-	if _, err := t.CopyIn(addr, n); err != nil {
+	if _, err := n.CopyIn(t, addr); err != nil {
 		return s, err
 	}
 	n.DeserializeTo(&s)
@@ -726,7 +737,7 @@ func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
 func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error {
 	n := t.Arch().NewSignalStack()
 	n.SerializeFrom(s)
-	_, err := t.CopyOut(addr, n)
+	_, err := n.CopyOut(t, addr)
 	return err
 }
 
@@ -735,7 +746,7 @@ func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error
 func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) {
 	n := t.Arch().NewSignalStack()
 	var s arch.SignalStack
-	if _, err := t.CopyIn(addr, n); err != nil {
+	if _, err := n.CopyIn(t, addr); err != nil {
 		return s, err
 	}
 	n.DeserializeTo(&s)
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index de838beef..8485fb4b6 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -17,10 +17,12 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -64,9 +66,8 @@ type TaskConfig struct {
 	// Niceness is the niceness of the new task.
 	Niceness int
 
-	// If NetworkNamespaced is true, the new task should observe a non-root
-	// network namespace.
-	NetworkNamespaced bool
+	// NetworkNamespace is the network namespace to be used for the new task.
+	NetworkNamespace *inet.Namespace
 
 	// AllowedCPUMask contains the cpus that this task can run on.
 	AllowedCPUMask sched.CPUSet
@@ -80,6 +81,9 @@ type TaskConfig struct {
 	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
 	AbstractSocketNamespace *AbstractSocketNamespace
 
+	// MountNamespaceVFS2 is the MountNamespace of the new task.
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
 	RSeqAddr usermem.Addr
 
@@ -100,6 +104,9 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
 		cfg.TaskContext.release()
 		cfg.FSContext.DecRef()
 		cfg.FDTable.DecRef()
+		if cfg.MountNamespaceVFS2 != nil {
+			cfg.MountNamespaceVFS2.DecRef()
+		}
 		return nil, err
 	}
 	return t, nil
@@ -116,28 +123,29 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 			parent:   cfg.Parent,
 			children: make(map[*Task]struct{}),
 		},
-		runState:        (*runApp)(nil),
-		interruptChan:   make(chan struct{}, 1),
-		signalMask:      cfg.SignalMask,
-		signalStack:     arch.SignalStack{Flags: arch.SignalStackFlagDisable},
-		tc:              *tc,
-		fsContext:       cfg.FSContext,
-		fdTable:         cfg.FDTable,
-		p:               cfg.Kernel.Platform.NewContext(),
-		k:               cfg.Kernel,
-		ptraceTracees:   make(map[*Task]struct{}),
-		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
-		ioUsage:         &usage.IO{},
-		niceness:        cfg.Niceness,
-		netns:           cfg.NetworkNamespaced,
-		utsns:           cfg.UTSNamespace,
-		ipcns:           cfg.IPCNamespace,
-		abstractSockets: cfg.AbstractSocketNamespace,
-		rseqCPU:         -1,
-		rseqAddr:        cfg.RSeqAddr,
-		rseqSignature:   cfg.RSeqSignature,
-		futexWaiter:     futex.NewWaiter(),
-		containerID:     cfg.ContainerID,
+		runState:           (*runApp)(nil),
+		interruptChan:      make(chan struct{}, 1),
+		signalMask:         cfg.SignalMask,
+		signalStack:        arch.SignalStack{Flags: arch.SignalStackFlagDisable},
+		tc:                 *tc,
+		fsContext:          cfg.FSContext,
+		fdTable:            cfg.FDTable,
+		p:                  cfg.Kernel.Platform.NewContext(),
+		k:                  cfg.Kernel,
+		ptraceTracees:      make(map[*Task]struct{}),
+		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
+		ioUsage:            &usage.IO{},
+		niceness:           cfg.Niceness,
+		netns:              cfg.NetworkNamespace,
+		utsns:              cfg.UTSNamespace,
+		ipcns:              cfg.IPCNamespace,
+		abstractSockets:    cfg.AbstractSocketNamespace,
+		mountNamespaceVFS2: cfg.MountNamespaceVFS2,
+		rseqCPU:            -1,
+		rseqAddr:           cfg.RSeqAddr,
+		rseqSignature:      cfg.RSeqSignature,
+		futexWaiter:        futex.NewWaiter(),
+		containerID:        cfg.ContainerID,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index d555d69a8..a5903b0b5 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -194,6 +194,19 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 //
 // The syscall path is very hot; avoid defer.
 func (t *Task) doSyscall() taskRunState {
+	// Save value of the register which is clobbered in the following
+	// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
+	//
+	// On x86, register rax was shared by syscall number and return
+	// value, and at the entry of the syscall handler, the rax was
+	// saved to regs.orig_rax which was exposed to userspace.
+	// But on arm64, syscall number was passed through X8, and the X0
+	// was shared by the first syscall argument and return value. The
+	// X0 was saved to regs.orig_x0 which was not exposed to userspace.
+	// So we have to do the same operation here to save the X0 value
+	// into the task context.
+	t.Arch().SyscallSaveOrig()
+
 	sysno := t.Arch().SyscallNo()
 	args := t.Arch().SyscallArgs()
 
@@ -269,6 +282,7 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
 		return (*runSyscallExit)(nil)
 	}
 	args := t.Arch().SyscallArgs()
+
 	return t.doSyscallInvoke(sysno, args)
 }
 
@@ -298,7 +312,7 @@ func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRu
 			return ctrl.next
 		}
 	} else if err != nil {
-		t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
 		t.haveSyscallReturn = true
 	} else {
 		t.Arch().SetReturn(rval)
@@ -417,7 +431,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle
 			// A return is not emulated in this case.
 			return (*runApp)(nil)
 		}
-		t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
 	}
 	t.Arch().SetIP(t.Arch().Value(caller))
 	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
@@ -427,7 +441,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, calle
 // ExtractErrno extracts an integer error number from the error.
 // The syscall number is purely for context in the error case. Use -1 if
 // syscall number is unknown.
-func (t *Task) ExtractErrno(err error, sysno int) int {
+func ExtractErrno(err error, sysno int) int {
 	switch err := err.(type) {
 	case nil:
 		return 0
@@ -441,11 +455,11 @@ func (t *Task) ExtractErrno(err error, sysno int) int {
 		// handled (and the SIGBUS is delivered).
 		return int(syscall.EFAULT)
 	case *os.PathError:
-		return t.ExtractErrno(err.Err, sysno)
+		return ExtractErrno(err.Err, sysno)
 	case *os.LinkError:
-		return t.ExtractErrno(err.Err, sysno)
+		return ExtractErrno(err.Err, sysno)
 	case *os.SyscallError:
-		return t.ExtractErrno(err.Err, sysno)
+		return ExtractErrno(err.Err, sysno)
 	default:
 		if errno, ok := syserror.TranslateError(err); ok {
 			return int(errno)
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 2bf3ce8a8..b02044ad2 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -30,7 +30,7 @@ var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown())
 // Activate ensures that the task has an active address space.
 func (t *Task) Activate() {
 	if mm := t.MemoryManager(); mm != nil {
-		if err := mm.Activate(); err != nil {
+		if err := mm.Activate(t); err != nil {
 			panic("unable to activate mm: " + err.Error())
 		}
 	}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 768e958d2..52849f5b3 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -254,9 +254,16 @@ type ThreadGroup struct {
 	//
 	// tty is protected by the signal mutex.
 	tty *TTY
+
+	// oomScoreAdj is the thread group's OOM score adjustment. This is
+	// currently not used but is maintained for consistency.
+	// TODO(gvisor.dev/issue/1967)
+	//
+	// oomScoreAdj is accessed using atomic memory operations.
+	oomScoreAdj int32
 }
 
-// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The
 // thread group leader will send its parent terminationSignal when it exits.
 // The new thread group isn't visible to the system until a task has been
 // created inside of it by a successful call to TaskSet.NewTask.
@@ -317,7 +324,9 @@ func (tg *ThreadGroup) release() {
 	for _, it := range its {
 		it.DestroyTimer()
 	}
-	tg.mounts.DecRef()
+	if tg.mounts != nil {
+		tg.mounts.DecRef()
+	}
 }
 
 // forEachChildThreadGroupLocked indicates over all child ThreadGroups.
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 706de83ef..e959700f2 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -245,7 +245,7 @@ type Clock interface {
 type WallRateClock struct{}
 
 // WallTimeUntil implements Clock.WallTimeUntil.
-func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
+func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration {
 	return t.Sub(now)
 }
 
@@ -254,16 +254,16 @@ func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
 type NoClockEvents struct{}
 
 // Readiness implements waiter.Waitable.Readiness.
-func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return 0
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
-func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
-func (NoClockEvents) EventUnregister(e *waiter.Entry) {
+func (*NoClockEvents) EventUnregister(e *waiter.Entry) {
 }
 
 // ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and
@@ -273,7 +273,7 @@ type ClockEventsQueue struct {
 }
 
 // Readiness implements waiter.Waitable.Readiness.
-func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return 0
 }
 
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index dc99301de..da0ea7bb5 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"fmt"
+	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -48,6 +49,9 @@ type Timekeeper struct {
 	// It is set only once, by SetClocks.
 	monotonicOffset int64 `state:"nosave"`
 
+	// monotonicLowerBound is the lowerBound for monotonic time.
+	monotonicLowerBound int64 `state:"nosave"`
+
 	// restored, if non-nil, indicates that this Timekeeper was restored
 	// from a state file. The clocks are not set until restored is closed.
 	restored chan struct{} `state:"nosave"`
@@ -271,6 +275,21 @@ func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) {
 	now, err := t.clocks.GetTime(c)
 	if err == nil && c == sentrytime.Monotonic {
 		now += t.monotonicOffset
+		for {
+			// It's possible that the clock is shaky. This may be due to
+			// platform issues, e.g. the KVM platform relies on the guest
+			// TSC and host TSC, which may not be perfectly in sync. To
+			// work around this issue, ensure that the monotonic time is
+			// always bounded by the last time read.
+			oldLowerBound := atomic.LoadInt64(&t.monotonicLowerBound)
+			if now < oldLowerBound {
+				now = oldLowerBound
+				break
+			}
+			if atomic.CompareAndSwapInt64(&t.monotonicLowerBound, oldLowerBound, now) {
+				break
+			}
+		}
 	}
 	return now, err
 }
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index 23790378a..c6aa65f28 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -33,6 +33,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/anon",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
@@ -40,6 +41,7 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 122ed05c2..616fafa2c 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -27,7 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -97,11 +97,11 @@ type elfInfo struct {
 // accepts from the ELF, and it doesn't parse unnecessary parts of the file.
 //
 // ctx may be nil if f does not need it.
-func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
+func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
 	// Check ident first; it will tell us the endianness of the rest of the
 	// structs.
 	var ident [elf.EI_NIDENT]byte
-	_, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0)
+	_, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0)
 	if err != nil {
 		log.Infof("Error reading ELF ident: %v", err)
 		// The entire ident array always exists.
@@ -137,7 +137,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 
 	var hdr elf.Header64
 	hdrBuf := make([]byte, header64Size)
-	_, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0)
+	_, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0)
 	if err != nil {
 		log.Infof("Error reading ELF header: %v", err)
 		// The entire header always exists.
@@ -187,7 +187,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 	}
 
 	phdrBuf := make([]byte, totalPhdrSize)
-	_, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
+	_, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
 	if err != nil {
 		log.Infof("Error reading ELF phdrs: %v", err)
 		// If phdrs were specified, they should all exist.
@@ -227,7 +227,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
 
 // mapSegment maps a phdr into the Task. offset is the offset to apply to
 // phdr.Vaddr.
-func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
+func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
 	// We must make a page-aligned mapping.
 	adjust := usermem.Addr(phdr.Vaddr).PageOffset()
 
@@ -395,7 +395,7 @@ type loadedELF struct {
 //
 // Preconditions:
 //  * f is an ELF file
-func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
+func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
 	first := true
 	var start, end usermem.Addr
 	var interpreter string
@@ -431,7 +431,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 			}
 
 			path := make([]byte, phdr.Filesz)
-			_, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off))
+			_, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off))
 			if err != nil {
 				// If an interpreter was specified, it should exist.
 				ctx.Infof("Error reading PT_INTERP path: %v", err)
@@ -564,7 +564,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 // Preconditions:
 //  * f is an ELF file
 //  * f is the first ELF loaded into m
-func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
+func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		ctx.Infof("Failed to parse initial ELF: %v", err)
@@ -602,7 +602,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS
 //
 // Preconditions:
 //  * f is an ELF file
-func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) {
+func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		if err == syserror.ENOEXEC {
@@ -649,16 +649,14 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error
 		// Refresh the traversal limit.
 		*args.RemainingTraversals = linux.MaxSymlinkTraversals
 		args.Filename = bin.interpreter
-		d, i, err := openPath(ctx, args)
+		intFile, err := openPath(ctx, args)
 		if err != nil {
 			ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
 			return loadedELF{}, nil, err
 		}
-		defer i.DecRef()
-		// We don't need the Dirent.
-		d.DecRef()
+		defer intFile.DecRef()
 
-		interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin)
+		interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin)
 		if err != nil {
 			ctx.Infof("Error loading interpreter: %v", err)
 			return loadedELF{}, nil, err
diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go
index 098a45d36..3886b4d33 100644
--- a/pkg/sentry/loader/interpreter.go
+++ b/pkg/sentry/loader/interpreter.go
@@ -19,7 +19,7 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -37,9 +37,9 @@ const (
 )
 
 // parseInterpreterScript returns the interpreter path and argv.
-func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv []string) (newpath string, newargv []string, err error) {
+func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.File, argv []string) (newpath string, newargv []string, err error) {
 	line := make([]byte, interpMaxLineLength)
-	n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0)
+	n, err := f.ReadFull(ctx, usermem.BytesIOSequence(line), 0)
 	// Short read is OK.
 	if err != nil && err != io.ErrUnexpectedEOF {
 		if err == io.EOF {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 9a613d6b7..88449fe95 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"io"
 	"path"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -29,8 +28,10 @@ import (
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -41,16 +42,6 @@ type LoadArgs struct {
 	// MemoryManager is the memory manager to load the executable into.
 	MemoryManager *mm.MemoryManager
 
-	// Mounts is the mount namespace in which to look up Filename.
-	Mounts *fs.MountNamespace
-
-	// Root is the root directory under which to look up Filename.
-	Root *fs.Dirent
-
-	// WorkingDirectory is the working directory under which to look up
-	// Filename.
-	WorkingDirectory *fs.Dirent
-
 	// RemainingTraversals is the maximum number of symlinks to follow to
 	// resolve Filename. This counter is passed by reference to keep it
 	// updated throughout the call stack.
@@ -65,7 +56,12 @@ type LoadArgs struct {
 
 	// File is an open fs.File object of the executable. If File is not
 	// nil, then File will be loaded and Filename will be ignored.
-	File *fs.File
+	//
+	// The caller is responsible for checking that the user can execute this file.
+	File fsbridge.File
+
+	// Opener is used to open the executable file when 'File' is nil.
+	Opener fsbridge.Lookup
 
 	// CloseOnExec indicates that the executable (or one of its parent
 	// directories) was opened with O_CLOEXEC. If the executable is an
@@ -106,103 +102,32 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
 // installed in the Task FDTable. The caller takes ownership of both.
 //
 // args.Filename must be a readable, executable, regular file.
-func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) {
+func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) {
 	if args.Filename == "" {
 		ctx.Infof("cannot open empty name")
-		return nil, nil, syserror.ENOENT
-	}
-
-	var d *fs.Dirent
-	var err error
-	if args.ResolveFinal {
-		d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
-	} else {
-		d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
-	}
-	if err != nil {
-		return nil, nil, err
-	}
-	// Defer a DecRef for the sake of failure cases.
-	defer d.DecRef()
-
-	if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
-		return nil, nil, syserror.ELOOP
-	}
-
-	if err := checkPermission(ctx, d); err != nil {
-		return nil, nil, err
-	}
-
-	// If they claim it's a directory, then make sure.
-	//
-	// N.B. we reject directories below, but we must first reject
-	// non-directories passed as directories.
-	if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) {
-		return nil, nil, syserror.ENOTDIR
-	}
-
-	if err := checkIsRegularFile(ctx, d, args.Filename); err != nil {
-		return nil, nil, err
-	}
-
-	f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
-	if err != nil {
-		return nil, nil, err
-	}
-	// Defer a DecRef for the sake of failure cases.
-	defer f.DecRef()
-
-	if err := checkPread(ctx, f, args.Filename); err != nil {
-		return nil, nil, err
-	}
-
-	d.IncRef()
-	f.IncRef()
-	return d, f, err
-}
-
-// checkFile performs checks on a file to be executed.
-func checkFile(ctx context.Context, f *fs.File, filename string) error {
-	if err := checkPermission(ctx, f.Dirent); err != nil {
-		return err
+		return nil, syserror.ENOENT
 	}
 
-	if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil {
-		return err
-	}
-
-	return checkPread(ctx, f, filename)
-}
-
-// checkPermission checks whether the file is readable and executable.
-func checkPermission(ctx context.Context, d *fs.Dirent) error {
-	perms := fs.PermMask{
-		// TODO(gvisor.dev/issue/160): Linux requires only execute
-		// permission, not read. However, our backing filesystems may
-		// prevent us from reading the file without read permission.
-		//
-		// Additionally, a task with a non-readable executable has
-		// additional constraints on access via ptrace and procfs.
-		Read:    true,
-		Execute: true,
+	// TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+	// not read. However, our backing filesystems may prevent us from reading
+	// the file without read permission. Additionally, a task with a
+	// non-readable executable has additional constraints on access via
+	// ptrace and procfs.
+	opts := vfs.OpenOptions{
+		Flags:    linux.O_RDONLY,
+		FileExec: true,
 	}
-	return d.Inode.CheckPermission(ctx, perms)
+	return args.Opener.OpenPath(ctx, args.Filename, opts, args.RemainingTraversals, args.ResolveFinal)
 }
 
 // checkIsRegularFile prevents us from trying to execute a directory, pipe, etc.
-func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error {
-	attr := d.Inode.StableAttr
-	if !fs.IsRegular(attr) {
-		ctx.Infof("%s is not regular: %v", filename, attr)
-		return syserror.EACCES
+func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string) error {
+	t, err := file.Type(ctx)
+	if err != nil {
+		return err
 	}
-	return nil
-}
-
-// checkPread checks whether we can read the file at arbitrary offsets.
-func checkPread(ctx context.Context, f *fs.File, filename string) error {
-	if !f.Flags().Pread {
-		ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags())
+	if t != linux.ModeRegular {
+		ctx.Infof("%q is not a regular file: %v", filename, t)
 		return syserror.EACCES
 	}
 	return nil
@@ -224,8 +149,10 @@ const (
 	maxLoaderAttempts = 6
 )
 
-// loadExecutable loads an executable that is pointed to by args.File. If nil,
-// the path args.Filename is resolved and loaded. If the executable is an
+// loadExecutable loads an executable that is pointed to by args.File. The
+// caller is responsible for checking that the user can execute this file.
+// If nil, the path args.Filename is resolved and loaded (check that the user
+// can execute this file is done here in this case). If the executable is an
 // interpreter script rather than an ELF, the binary of the corresponding
 // interpreter will be loaded.
 //
@@ -234,37 +161,27 @@ const (
 //  * arch.Context matching the binary arch
 //  * fs.Dirent of the binary file
 //  * Possibly updated args.Argv
-func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, fsbridge.File, []string, error) {
 	for i := 0; i < maxLoaderAttempts; i++ {
-		var (
-			d   *fs.Dirent
-			err error
-		)
 		if args.File == nil {
-			d, args.File, err = openPath(ctx, args)
-			// We will return d in the successful case, but defer a DecRef for the
-			// sake of intermediate loops and failure cases.
-			if d != nil {
-				defer d.DecRef()
-			}
-			if args.File != nil {
-				defer args.File.DecRef()
+			var err error
+			args.File, err = openPath(ctx, args)
+			if err != nil {
+				ctx.Infof("Error opening %s: %v", args.Filename, err)
+				return loadedELF{}, nil, nil, nil, err
 			}
+			// Ensure file is release in case the code loops or errors out.
+			defer args.File.DecRef()
 		} else {
-			d = args.File.Dirent
-			d.IncRef()
-			defer d.DecRef()
-			err = checkFile(ctx, args.File, args.Filename)
-		}
-		if err != nil {
-			ctx.Infof("Error opening %s: %v", args.Filename, err)
-			return loadedELF{}, nil, nil, nil, err
+			if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil {
+				return loadedELF{}, nil, nil, nil, err
+			}
 		}
 
 		// Check the header. Is this an ELF or interpreter script?
 		var hdr [4]uint8
 		// N.B. We assume that reading from a regular file cannot block.
-		_, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0)
+		_, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0)
 		// Allow unexpected EOF, as a valid executable could be only three bytes
 		// (e.g., #!a).
 		if err != nil && err != io.ErrUnexpectedEOF {
@@ -281,9 +198,10 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 				ctx.Infof("Error loading ELF: %v", err)
 				return loadedELF{}, nil, nil, nil, err
 			}
-			// An ELF is always terminal. Hold on to d.
-			d.IncRef()
-			return loaded, ac, d, args.Argv, err
+			// An ELF is always terminal. Hold on to file.
+			args.File.IncRef()
+			return loaded, ac, args.File, args.Argv, err
+
 		case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
 			if args.CloseOnExec {
 				return loadedELF{}, nil, nil, nil, syserror.ENOENT
@@ -295,6 +213,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 			}
 			// Refresh the traversal limit for the interpreter.
 			*args.RemainingTraversals = linux.MaxSymlinkTraversals
+
 		default:
 			ctx.Infof("Unknown magic: %v", hdr)
 			return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
@@ -317,11 +236,11 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 //  * Load is called on the Task goroutine.
 func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
 	// Load the executable itself.
-	loaded, ac, d, newArgv, err := loadExecutable(ctx, args)
+	loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
 	}
-	defer d.DecRef()
+	defer file.DecRef()
 
 	// Load the VDSO.
 	vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
@@ -390,7 +309,16 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
 	m.SetEnvvStart(sl.EnvvStart)
 	m.SetEnvvEnd(sl.EnvvEnd)
 	m.SetAuxv(auxv)
-	m.SetExecutable(d)
+	m.SetExecutable(file)
+
+	symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn")
+	if err != nil {
+		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to find rt_sigreturn in vdso: %v", err), syserr.FromError(err).ToLinux())
+	}
+
+	// Found rt_sigretrun.
+	addr := uint64(vdsoAddr) + symbolValue - vdsoPrelink
+	m.SetVDSOSigReturn(addr)
 
 	ac.SetIP(uintptr(loaded.entry))
 	ac.SetStack(uintptr(stack.Bottom))
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 52f446ed7..00977fc08 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -15,9 +15,11 @@
 package loader
 
 import (
+	"bytes"
 	"debug/elf"
 	"fmt"
 	"io"
+	"strings"
 
 	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/context"
@@ -27,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -37,6 +40,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+const vdsoPrelink = 0xffffffffff700000
+
 type fileContext struct {
 	context.Context
 }
@@ -69,6 +74,8 @@ type byteReader struct {
 var _ fs.FileOperations = (*byteReader)(nil)
 
 // newByteReaderFile creates a fake file to read data from.
+//
+// TODO(gvisor.dev/issue/1623): Convert to VFS2.
 func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
 	// Create a fake inode.
 	inode := fs.NewInode(
@@ -123,7 +130,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq
 // * PT_LOAD segments don't extend beyond the end of the file.
 //
 // ctx may be nil if f does not need it.
-func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) {
+func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) {
 	info, err := parseHeader(ctx, f)
 	if err != nil {
 		log.Infof("Unable to parse VDSO header: %v", err)
@@ -218,10 +225,31 @@ type VDSO struct {
 	phdrs []elf.ProgHeader `state:".([]elfProgHeader)"`
 }
 
+// getSymbolValueFromVDSO returns the specific symbol value in vdso.so.
+func getSymbolValueFromVDSO(symbol string) (uint64, error) {
+	f, err := elf.NewFile(bytes.NewReader(vdsoBin))
+	if err != nil {
+		return 0, err
+	}
+	syms, err := f.Symbols()
+	if err != nil {
+		return 0, err
+	}
+
+	for _, sym := range syms {
+		if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF {
+			if strings.Contains(sym.Name, symbol) {
+				return sym.Value, nil
+			}
+		}
+	}
+	return 0, fmt.Errorf("no %v in vdso.so", symbol)
+}
+
 // PrepareVDSO validates the system VDSO and returns a VDSO, containing the
 // param page for updating by the kernel.
 func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
-	vdsoFile := newByteReaderFile(ctx, vdsoBin)
+	vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin))
 
 	// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
 	// nil context can be passed.
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index e5729ced5..a036ce53c 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -25,6 +25,7 @@ go_template_instance(
     out = "vma_set.go",
     consts = {
         "minDegree": "8",
+        "trackGaps": "1",
     },
     imports = {
         "usermem": "gvisor.dev/gvisor/pkg/usermem",
@@ -105,8 +106,8 @@ go_library(
         "//pkg/safecopy",
         "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/fs",
         "//pkg/sentry/fs/proc/seqfile",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/futex",
         "//pkg/sentry/kernel/shm",
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index e58a63deb..5c667117c 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -18,7 +18,7 @@ import (
 	"fmt"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/atomicbitops"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -39,11 +39,18 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
 //
 // When this MemoryManager is no longer needed by a task, it should call
 // Deactivate to release the reference.
-func (mm *MemoryManager) Activate() error {
+func (mm *MemoryManager) Activate(ctx context.Context) error {
 	// Fast path: the MemoryManager already has an active
 	// platform.AddressSpace, and we just need to indicate that we need it too.
-	if atomicbitops.IncUnlessZeroInt32(&mm.active) {
-		return nil
+	for {
+		active := atomic.LoadInt32(&mm.active)
+		if active == 0 {
+			// Fall back to the slow path.
+			break
+		}
+		if atomic.CompareAndSwapInt32(&mm.active, active, active+1) {
+			return nil
+		}
 	}
 
 	for {
@@ -85,16 +92,20 @@ func (mm *MemoryManager) Activate() error {
 		if as == nil {
 			// AddressSpace is unavailable, we must wait.
 			//
-			// activeMu must not be held while waiting, as the user
-			// of the address space we are waiting on may attempt
-			// to take activeMu.
-			//
-			// Don't call UninterruptibleSleepStart to register the
-			// wait to allow the watchdog stuck task to trigger in
-			// case a process is starved waiting for the address
-			// space.
+			// activeMu must not be held while waiting, as the user of the address
+			// space we are waiting on may attempt to take activeMu.
 			mm.activeMu.Unlock()
+
+			sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation
+			if sleep {
+				// Mark this task sleeping while waiting for the address space to
+				// prevent the watchdog from reporting it as a stuck task.
+				ctx.UninterruptibleSleepStart(false)
+			}
 			<-c
+			if sleep {
+				ctx.UninterruptibleSleepFinish(false)
+			}
 			continue
 		}
 
@@ -118,8 +129,15 @@ func (mm *MemoryManager) Activate() error {
 func (mm *MemoryManager) Deactivate() {
 	// Fast path: this is not the last goroutine to deactivate the
 	// MemoryManager.
-	if atomicbitops.DecUnlessOneInt32(&mm.active) {
-		return
+	for {
+		active := atomic.LoadInt32(&mm.active)
+		if active == 1 {
+			// Fall back to the slow path.
+			break
+		}
+		if atomic.CompareAndSwapInt32(&mm.active, active, active-1) {
+			return
+		}
 	}
 
 	mm.activeMu.Lock()
@@ -183,8 +201,10 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre
 		if pma.needCOW {
 			perms.Write = false
 		}
-		if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
-			return err
+		if perms.Any() { // MapFile precondition
+			if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
+				return err
+			}
 		}
 		pseg = pseg.NextSegment()
 	}
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index cb29d94b0..379148903 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -59,25 +59,27 @@ func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
 	}
 
 	a.contexts[id] = &AIOContext{
-		done:           make(chan struct{}, 1),
+		requestReady:   make(chan struct{}, 1),
 		maxOutstanding: events,
 	}
 	return true
 }
 
-// destroyAIOContext destroys an asynchronous I/O context.
+// destroyAIOContext destroys an asynchronous I/O context. It doesn't wait for
+// for pending requests to complete. Returns the destroyed AIOContext so it can
+// be drained.
 //
-// False is returned if the context does not exist.
-func (a *aioManager) destroyAIOContext(id uint64) bool {
+// Nil is returned if the context does not exist.
+func (a *aioManager) destroyAIOContext(id uint64) *AIOContext {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 	ctx, ok := a.contexts[id]
 	if !ok {
-		return false
+		return nil
 	}
 	delete(a.contexts, id)
 	ctx.destroy()
-	return true
+	return ctx
 }
 
 // lookupAIOContext looks up the given context.
@@ -102,8 +104,8 @@ type ioResult struct {
 //
 // +stateify savable
 type AIOContext struct {
-	// done is the notification channel used for all requests.
-	done chan struct{} `state:"nosave"`
+	// requestReady is the notification channel used for all requests.
+	requestReady chan struct{} `state:"nosave"`
 
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
@@ -129,8 +131,14 @@ func (ctx *AIOContext) destroy() {
 	ctx.mu.Lock()
 	defer ctx.mu.Unlock()
 	ctx.dead = true
-	if ctx.outstanding == 0 {
-		close(ctx.done)
+	ctx.checkForDone()
+}
+
+// Preconditions: ctx.mu must be held by caller.
+func (ctx *AIOContext) checkForDone() {
+	if ctx.dead && ctx.outstanding == 0 {
+		close(ctx.requestReady)
+		ctx.requestReady = nil
 	}
 }
 
@@ -154,11 +162,12 @@ func (ctx *AIOContext) PopRequest() (interface{}, bool) {
 
 	// Is there anything ready?
 	if e := ctx.results.Front(); e != nil {
-		ctx.results.Remove(e)
-		ctx.outstanding--
-		if ctx.outstanding == 0 && ctx.dead {
-			close(ctx.done)
+		if ctx.outstanding == 0 {
+			panic("AIOContext outstanding is going negative")
 		}
+		ctx.outstanding--
+		ctx.results.Remove(e)
+		ctx.checkForDone()
 		return e.data, true
 	}
 	return nil, false
@@ -172,26 +181,58 @@ func (ctx *AIOContext) FinishRequest(data interface{}) {
 
 	// Push to the list and notify opportunistically. The channel notify
 	// here is guaranteed to be safe because outstanding must be non-zero.
-	// The done channel is only closed when outstanding reaches zero.
+	// The requestReady channel is only closed when outstanding reaches zero.
 	ctx.results.PushBack(&ioResult{data: data})
 
 	select {
-	case ctx.done <- struct{}{}:
+	case ctx.requestReady <- struct{}{}:
 	default:
 	}
 }
 
 // WaitChannel returns a channel that is notified when an AIO request is
-// completed.
-//
-// The boolean return value indicates whether or not the context is active.
-func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) {
+// completed. Returns nil if the context is destroyed and there are no more
+// outstanding requests.
+func (ctx *AIOContext) WaitChannel() chan struct{} {
 	ctx.mu.Lock()
 	defer ctx.mu.Unlock()
-	if ctx.outstanding == 0 && ctx.dead {
-		return nil, false
+	return ctx.requestReady
+}
+
+// Dead returns true if the context has been destroyed.
+func (ctx *AIOContext) Dead() bool {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+	return ctx.dead
+}
+
+// CancelPendingRequest forgets about a request that hasn't yet completed.
+func (ctx *AIOContext) CancelPendingRequest() {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+
+	if ctx.outstanding == 0 {
+		panic("AIOContext outstanding is going negative")
 	}
-	return ctx.done, true
+	ctx.outstanding--
+	ctx.checkForDone()
+}
+
+// Drain drops all completed requests. Pending requests remain untouched.
+func (ctx *AIOContext) Drain() {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+
+	if ctx.outstanding == 0 {
+		return
+	}
+	size := uint32(ctx.results.Len())
+	if ctx.outstanding < size {
+		panic("AIOContext outstanding is going negative")
+	}
+	ctx.outstanding -= size
+	ctx.results.Reset()
+	ctx.checkForDone()
 }
 
 // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
@@ -332,9 +373,9 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
 		Length:          aioRingBufferSize,
 		MappingIdentity: m,
 		Mappable:        m,
-		// TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ |
-		// PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this
-		// mapping read-only?
+		// Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
+		// fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
+		// user mode should not write to this page.
 		Perms:    usermem.Read,
 		MaxPerms: usermem.Read,
 	})
@@ -349,11 +390,11 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
 	return id, nil
 }
 
-// DestroyAIOContext destroys an asynchronous I/O context. It returns false if
-// the context does not exist.
-func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool {
+// DestroyAIOContext destroys an asynchronous I/O context. It returns the
+// destroyed context. nil if the context does not exist.
+func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
 	if _, ok := mm.LookupAIOContext(ctx, id); !ok {
-		return false
+		return nil
 	}
 
 	// Only unmaps after it assured that the address is a valid aio context to
diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go
index c37fc9f7b..3dabac1af 100644
--- a/pkg/sentry/mm/aio_context_state.go
+++ b/pkg/sentry/mm/aio_context_state.go
@@ -16,5 +16,5 @@ package mm
 
 // afterLoad is invoked by stateify.
 func (a *AIOContext) afterLoad() {
-	a.done = make(chan struct{}, 1)
+	a.requestReady = make(chan struct{}, 1)
 }
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 47b8fbf43..aac56679b 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -29,16 +28,17 @@ import (
 )
 
 // NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
-func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager {
+func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
 	return &MemoryManager{
-		p:           p,
-		mfp:         mfp,
-		haveASIO:    p.SupportsAddressSpaceIO(),
-		privateRefs: &privateRefs{},
-		users:       1,
-		auxv:        arch.Auxv{},
-		dumpability: UserDumpable,
-		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+		p:                  p,
+		mfp:                mfp,
+		haveASIO:           p.SupportsAddressSpaceIO(),
+		privateRefs:        &privateRefs{},
+		users:              1,
+		auxv:               arch.Auxv{},
+		dumpability:        UserDumpable,
+		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
+		sleepForActivation: sleepForActivation,
 	}
 }
 
@@ -80,9 +80,11 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 		envv:                 mm.envv,
 		auxv:                 append(arch.Auxv(nil), mm.auxv...),
 		// IncRef'd below, once we know that there isn't an error.
-		executable:  mm.executable,
-		dumpability: mm.dumpability,
-		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+		executable:         mm.executable,
+		dumpability:        mm.dumpability,
+		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
+		sleepForActivation: mm.sleepForActivation,
+		vdsoSigReturnAddr:  mm.vdsoSigReturnAddr,
 	}
 
 	// Copy vmas.
@@ -229,7 +231,15 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 // IncUsers increments mm's user count and returns true. If the user count is
 // already 0, IncUsers does nothing and returns false.
 func (mm *MemoryManager) IncUsers() bool {
-	return atomicbitops.IncUnlessZeroInt32(&mm.users)
+	for {
+		users := atomic.LoadInt32(&mm.users)
+		if users == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt32(&mm.users, users, users+1) {
+			return true
+		}
+	}
 }
 
 // DecUsers decrements mm's user count. If the user count reaches 0, all
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index f550acae0..28e5057f7 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -16,7 +16,7 @@ package mm
 
 import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -132,7 +132,7 @@ func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) {
 //
 // An additional reference will be taken in the case of a non-nil executable,
 // which must be released by the caller.
-func (mm *MemoryManager) Executable() *fs.Dirent {
+func (mm *MemoryManager) Executable() fsbridge.File {
 	mm.metadataMu.Lock()
 	defer mm.metadataMu.Unlock()
 
@@ -147,15 +147,15 @@ func (mm *MemoryManager) Executable() *fs.Dirent {
 // SetExecutable sets the executable.
 //
 // This takes a reference on d.
-func (mm *MemoryManager) SetExecutable(d *fs.Dirent) {
+func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
 	mm.metadataMu.Lock()
 
 	// Grab a new reference.
-	d.IncRef()
+	file.IncRef()
 
 	// Set the executable.
 	orig := mm.executable
-	mm.executable = d
+	mm.executable = file
 
 	mm.metadataMu.Unlock()
 
@@ -167,3 +167,17 @@ func (mm *MemoryManager) SetExecutable(d *fs.Dirent) {
 		orig.DecRef()
 	}
 }
+
+// VDSOSigReturn returns the address of vdso_sigreturn.
+func (mm *MemoryManager) VDSOSigReturn() uint64 {
+	mm.metadataMu.Lock()
+	defer mm.metadataMu.Unlock()
+	return mm.vdsoSigReturnAddr
+}
+
+// SetVDSOSigReturn sets the address of vdso_sigreturn.
+func (mm *MemoryManager) SetVDSOSigReturn(addr uint64) {
+	mm.metadataMu.Lock()
+	defer mm.metadataMu.Unlock()
+	mm.vdsoSigReturnAddr = addr
+}
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 09e582dd3..6db7c3d40 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -35,9 +35,10 @@
 package mm
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -215,7 +216,7 @@ type MemoryManager struct {
 	// is not nil, it holds a reference on the Dirent.
 	//
 	// executable is protected by metadataMu.
-	executable *fs.Dirent
+	executable fsbridge.File
 
 	// dumpability describes if and how this MemoryManager may be dumped to
 	// userspace.
@@ -226,6 +227,14 @@ type MemoryManager struct {
 	// aioManager keeps track of AIOContexts used for async IOs. AIOManager
 	// must be cloned when CLONE_VM is used.
 	aioManager aioManager
+
+	// sleepForActivation indicates whether the task should report to be sleeping
+	// before trying to activate the address space. When set to true, delays in
+	// activation are not reported as stuck tasks by the watchdog.
+	sleepForActivation bool
+
+	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
+	vdsoSigReturnAddr uint64
 }
 
 // vma represents a virtual memory area.
@@ -278,7 +287,7 @@ type vma struct {
 	mlockMode memmap.MLockMode
 
 	// numaPolicy is the NUMA policy for this vma set by mbind().
-	numaPolicy int32
+	numaPolicy linux.NumaPolicy
 
 	// numaNodemask is the NUMA nodemask for this vma set by mbind().
 	numaNodemask uint64
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index edacca741..fdc308542 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -31,7 +31,7 @@ import (
 func testMemoryManager(ctx context.Context) *MemoryManager {
 	p := platform.FromContext(ctx)
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
-	mm := NewMemoryManager(p, mfp)
+	mm := NewMemoryManager(p, mfp, false)
 	mm.layout = arch.MmapLayout{
 		MinAddr:      p.MinUserAddress(),
 		MaxAddr:      p.MaxUserAddress(),
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index 1ab92f046..6efe5102b 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -148,7 +148,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI
 
 	// Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() =>
 	// stack_guard_page_start().
-	fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
+	lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
 		vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino)
 
 	// Figure out our filename or hint.
@@ -165,7 +165,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI
 	}
 	if s != "" {
 		// Per linux, we pad until the 74th character.
-		if pad := 73 - b.Len(); pad > 0 {
+		if pad := 73 - lineLen; pad > 0 {
 			b.WriteString(strings.Repeat(" ", pad))
 		}
 		b.WriteString(s)
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index c5dfa5972..3f496aa9f 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -974,7 +974,7 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error
 }
 
 // NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
-func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (linux.NumaPolicy, uint64, error) {
 	mm.mappingMu.RLock()
 	defer mm.mappingMu.RUnlock()
 	vseg := mm.vmas.FindSegment(addr)
@@ -986,7 +986,7 @@ func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
 }
 
 // SetNumaPolicy implements the semantics of Linux's mbind().
-func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error {
+func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error {
 	if !addr.IsPageAligned() {
 		return syserror.EINVAL
 	}
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 9a14e69e6..16d8207e9 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -195,7 +195,7 @@ func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange {
 
 // Preconditions: mm.mappingMu must be locked.
 func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
-	for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextGap() {
+	for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(usermem.Addr(length)) {
 		if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
 			// Can we shift up to match the alignment?
 			if offset := uint64(gr.Start) % alignment; offset != 0 {
@@ -214,7 +214,7 @@ func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bou
 
 // Preconditions: mm.mappingMu must be locked.
 func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
-	for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevGap() {
+	for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(usermem.Addr(length)) {
 		if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
 			// Can we shift down to match the alignment?
 			start := gr.End - usermem.Addr(length)
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 1eeb9f317..a9836ba71 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -33,6 +33,7 @@ go_template_instance(
     out = "usage_set.go",
     consts = {
         "minDegree": "10",
+        "trackGaps": "1",
     },
     imports = {
         "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
@@ -48,6 +49,26 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "reclaim_set",
+    out = "reclaim_set.go",
+    consts = {
+        "minDegree": "10",
+    },
+    imports = {
+        "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+    },
+    package = "pgalloc",
+    prefix = "reclaim",
+    template = "//pkg/segment:generic_set",
+    types = {
+        "Key": "uint64",
+        "Range": "platform.FileRange",
+        "Value": "reclaimSetValue",
+        "Functions": "reclaimSetFunctions",
+    },
+)
+
 go_library(
     name = "pgalloc",
     srcs = [
@@ -56,6 +77,7 @@ go_library(
         "evictable_range_set.go",
         "pgalloc.go",
         "pgalloc_unsafe.go",
+        "reclaim_set.go",
         "save_restore.go",
         "usage_set.go",
     ],
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 577e9306a..c8d9facc2 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -108,12 +108,6 @@ type MemoryFile struct {
 	usageSwapped  uint64
 	usageLast     time.Time
 
-	// minUnallocatedPage is the minimum page that may be unallocated.
-	// i.e., there are no unallocated pages below minUnallocatedPage.
-	//
-	// minUnallocatedPage is protected by mu.
-	minUnallocatedPage uint64
-
 	// fileSize is the size of the backing memory file in bytes. fileSize is
 	// always a power-of-two multiple of chunkSize.
 	//
@@ -146,11 +140,9 @@ type MemoryFile struct {
 	// is protected by mu.
 	reclaimable bool
 
-	// minReclaimablePage is the minimum page that may be reclaimable.
-	// i.e., all reclaimable pages are >= minReclaimablePage.
-	//
-	// minReclaimablePage is protected by mu.
-	minReclaimablePage uint64
+	// relcaim is the collection of regions for reclaim. relcaim is protected
+	// by mu.
+	reclaim reclaimSet
 
 	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
 	// transitions from false to true.
@@ -180,6 +172,11 @@ type MemoryFileOpts struct {
 	// notifications to determine when eviction is necessary. This option has
 	// no effect unless DelayedEviction is DelayedEvictionEnabled.
 	UseHostMemcgPressure bool
+
+	// If ManualZeroing is true, MemoryFile must not assume that new pages
+	// obtained from the host are zero-filled, such that MemoryFile must manually
+	// zero newly-allocated pages.
+	ManualZeroing bool
 }
 
 // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
@@ -268,12 +265,10 @@ type evictableMemoryUserInfo struct {
 }
 
 const (
-	chunkShift = 24
-	chunkSize  = 1 << chunkShift // 16 MB
+	chunkShift = 30
+	chunkSize  = 1 << chunkShift // 1 GB
 	chunkMask  = chunkSize - 1
 
-	initialSize = chunkSize
-
 	// maxPage is the highest 64-bit page.
 	maxPage = math.MaxUint64 &^ (usermem.PageSize - 1)
 )
@@ -297,19 +292,12 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
 	if err := file.Truncate(0); err != nil {
 		return nil, err
 	}
-	if err := file.Truncate(initialSize); err != nil {
-		return nil, err
-	}
 	f := &MemoryFile{
-		opts:     opts,
-		fileSize: initialSize,
-		file:     file,
-		// No pages are reclaimable. DecRef will always be able to
-		// decrease minReclaimablePage from this point.
-		minReclaimablePage: maxPage,
-		evictable:          make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
+		opts:      opts,
+		file:      file,
+		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
 	}
-	f.mappings.Store(make([]uintptr, initialSize/chunkSize))
+	f.mappings.Store(make([]uintptr, 0))
 	f.reclaimCond.L = &f.mu
 
 	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
@@ -399,39 +387,37 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
 		alignment = usermem.HugePageSize
 	}
 
-	start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment)
-	end := start + length
-	// File offsets are int64s. Since length must be strictly positive, end
-	// cannot legitimately be 0.
-	if end < start || int64(end) <= 0 {
+	// Find a range in the underlying file.
+	fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment)
+	if !ok {
 		return platform.FileRange{}, syserror.ENOMEM
 	}
 
-	// Expand the file if needed. Double the file size on each expansion;
-	// uncommitted pages have effectively no cost.
-	fileSize := f.fileSize
-	for int64(end) > fileSize {
-		if fileSize >= 2*fileSize {
-			// fileSize overflow.
-			return platform.FileRange{}, syserror.ENOMEM
-		}
-		fileSize *= 2
-	}
-	if fileSize > f.fileSize {
-		if err := f.file.Truncate(fileSize); err != nil {
+	// Expand the file if needed. Note that findAvailableRange will
+	// appropriately double the fileSize when required.
+	if int64(fr.End) > f.fileSize {
+		if err := f.file.Truncate(int64(fr.End)); err != nil {
 			return platform.FileRange{}, err
 		}
-		f.fileSize = fileSize
+		f.fileSize = int64(fr.End)
 		f.mappingsMu.Lock()
 		oldMappings := f.mappings.Load().([]uintptr)
-		newMappings := make([]uintptr, fileSize>>chunkShift)
+		newMappings := make([]uintptr, f.fileSize>>chunkShift)
 		copy(newMappings, oldMappings)
 		f.mappings.Store(newMappings)
 		f.mappingsMu.Unlock()
 	}
 
 	// Mark selected pages as in use.
-	fr := platform.FileRange{start, end}
+	if f.opts.ManualZeroing {
+		if err := f.forEachMappingSlice(fr, func(bs []byte) {
+			for i := range bs {
+				bs[i] = 0
+			}
+		}); err != nil {
+			return platform.FileRange{}, err
+		}
+	}
 	if !f.usage.Add(fr, usageInfo{
 		kind: kind,
 		refs: 1,
@@ -439,49 +425,71 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
 		panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
 	}
 
-	if minUnallocatedPage < start {
-		f.minUnallocatedPage = minUnallocatedPage
-	} else {
-		// start was the first unallocated page. The next must be
-		// somewhere beyond end.
-		f.minUnallocatedPage = end
-	}
-
 	return fr, nil
 }
 
-// findUnallocatedRange returns the first unallocated page in usage of the
-// specified length and alignment beginning at page start and the first single
-// unallocated page.
-func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) {
-	// Only searched until the first page is found.
-	firstPage := start
-	foundFirstPage := false
-	alignMask := alignment - 1
-	for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() {
-		r := seg.Range()
+// findAvailableRange returns an available range in the usageSet.
+//
+// Note that scanning for available slots takes place from end first backwards,
+// then forwards. This heuristic has important consequence for how sequential
+// mappings can be merged in the host VMAs, given that addresses for both
+// application and sentry mappings are allocated top-down (from higher to
+// lower addresses). The file is also grown expoentially in order to create
+// space for mappings to be allocated downwards.
+//
+// Precondition: alignment must be a power of 2.
+func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (platform.FileRange, bool) {
+	alignmentMask := alignment - 1
+	for gap := usage.UpperBoundGap(uint64(fileSize)); gap.Ok(); gap = gap.PrevLargeEnoughGap(length) {
+		// Start searching only at end of file.
+		end := gap.End()
+		if end > uint64(fileSize) {
+			end = uint64(fileSize)
+		}
 
-		if !foundFirstPage && r.Start > firstPage {
-			foundFirstPage = true
+		// Start at the top and align downwards.
+		start := end - length
+		if start > end {
+			break // Underflow.
 		}
+		start &^= alignmentMask
 
-		if start >= r.End {
-			// start was rounded up to an alignment boundary from the end
-			// of a previous segment and is now beyond r.End.
+		// Is the gap still sufficient?
+		if start < gap.Start() {
 			continue
 		}
-		// This segment represents allocated or reclaimable pages; only the
-		// range from start to the segment's beginning is allocatable, and the
-		// next allocatable range begins after the segment.
-		if r.Start > start && r.Start-start >= length {
-			break
+
+		// Allocate in the given gap.
+		return platform.FileRange{start, start + length}, true
+	}
+
+	// Check that it's possible to fit this allocation at the end of a file of any size.
+	min := usage.LastGap().Start()
+	min = (min + alignmentMask) &^ alignmentMask
+	if min+length < min {
+		// Overflow.
+		return platform.FileRange{}, false
+	}
+
+	// Determine the minimum file size required to fit this allocation at its end.
+	for {
+		if fileSize >= 2*fileSize {
+			// Is this because it's initially empty?
+			if fileSize == 0 {
+				fileSize += chunkSize
+			} else {
+				// fileSize overflow.
+				return platform.FileRange{}, false
+			}
+		} else {
+			// Double the current fileSize.
+			fileSize *= 2
 		}
-		start = (r.End + alignMask) &^ alignMask
-		if !foundFirstPage {
-			firstPage = r.End
+		start := (uint64(fileSize) - length) &^ alignmentMask
+		if start >= min {
+			return platform.FileRange{start, start + length}, true
 		}
 	}
-	return start, firstPage
 }
 
 // AllocateAndFill allocates memory of the given kind and fills it by calling
@@ -602,6 +610,7 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
 		}
 		val.refs--
 		if val.refs == 0 {
+			f.reclaim.Add(seg.Range(), reclaimSetValue{})
 			freed = true
 			// Reclassify memory as System, until it's freed by the reclaim
 			// goroutine.
@@ -614,10 +623,6 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
 	f.usage.MergeAdjacent(fr)
 
 	if freed {
-		if fr.Start < f.minReclaimablePage {
-			// We've freed at least one lower page.
-			f.minReclaimablePage = fr.Start
-		}
 		f.reclaimable = true
 		f.reclaimCond.Signal()
 	}
@@ -1016,6 +1021,7 @@ func (f *MemoryFile) String() string {
 // for allocation.
 func (f *MemoryFile) runReclaim() {
 	for {
+		// N.B. We must call f.markReclaimed on the returned FrameRange.
 		fr, ok := f.findReclaimable()
 		if !ok {
 			break
@@ -1071,6 +1077,10 @@ func (f *MemoryFile) runReclaim() {
 	}
 }
 
+// findReclaimable finds memory that has been marked for reclaim.
+//
+// Note that there returned range will be removed from tracking. It
+// must be reclaimed (removed from f.usage) at this point.
 func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -1089,18 +1099,15 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
 			}
 			f.reclaimCond.Wait()
 		}
-		// Allocate returns the first usable range in offset order and is
-		// currently a linear scan, so reclaiming from the beginning of the
-		// file minimizes the expected latency of Allocate.
-		for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
-			if seg.ValuePtr().refs == 0 {
-				f.minReclaimablePage = seg.End()
-				return seg.Range(), true
-			}
+		// Allocate works from the back of the file inwards, so reclaim
+		// preserves this order to minimize the cost of the search.
+		if seg := f.reclaim.LastSegment(); seg.Ok() {
+			fr := seg.Range()
+			f.reclaim.Remove(seg)
+			return fr, true
 		}
-		// No pages are reclaimable.
+		// Nothing is reclaimable.
 		f.reclaimable = false
-		f.minReclaimablePage = maxPage
 	}
 }
 
@@ -1108,8 +1115,8 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 	seg := f.usage.FindSegment(fr.Start)
-	// All of fr should be mapped to a single uncommitted reclaimable segment
-	// accounted to System.
+	// All of fr should be mapped to a single uncommitted reclaimable
+	// segment accounted to System.
 	if !seg.Ok() {
 		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
 	}
@@ -1123,14 +1130,10 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
 	}); got != want {
 		panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
 	}
-	// Deallocate reclaimed pages. Even though all of seg is reclaimable, the
-	// caller of markReclaimed may not have decommitted it, so we can only mark
-	// fr as reclaimed.
+	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
+	// the caller of markReclaimed may not have decommitted it, so we can
+	// only mark fr as reclaimed.
 	f.usage.Remove(f.usage.Isolate(seg, fr))
-	if fr.Start < f.minUnallocatedPage {
-		// We've deallocated at least one lower page.
-		f.minUnallocatedPage = fr.Start
-	}
 }
 
 // StartEvictions requests that f evict all evictable allocations. It does not
@@ -1241,3 +1244,27 @@ func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetVal
 func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
 	return evictableRangeSetValue{}, evictableRangeSetValue{}
 }
+
+// reclaimSetValue is the value type of reclaimSet.
+type reclaimSetValue struct{}
+
+type reclaimSetFunctions struct{}
+
+func (reclaimSetFunctions) MinKey() uint64 {
+	return 0
+}
+
+func (reclaimSetFunctions) MaxKey() uint64 {
+	return math.MaxUint64
+}
+
+func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
+}
+
+func (reclaimSetFunctions) Merge(_ platform.FileRange, _ reclaimSetValue, _ platform.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
+	return reclaimSetValue{}, true
+}
+
+func (reclaimSetFunctions) Split(_ platform.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
+	return reclaimSetValue{}, reclaimSetValue{}
+}
diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go
index 293f22c6b..b5b68eb52 100644
--- a/pkg/sentry/pgalloc/pgalloc_test.go
+++ b/pkg/sentry/pgalloc/pgalloc_test.go
@@ -23,39 +23,49 @@ import (
 const (
 	page     = usermem.PageSize
 	hugepage = usermem.HugePageSize
+	topPage  = (1 << 63) - page
 )
 
 func TestFindUnallocatedRange(t *testing.T) {
 	for _, test := range []struct {
-		desc           string
-		usage          *usageSegmentDataSlices
-		start          uint64
-		length         uint64
-		alignment      uint64
-		unallocated    uint64
-		minUnallocated uint64
+		desc       string
+		usage      *usageSegmentDataSlices
+		fileSize   int64
+		length     uint64
+		alignment  uint64
+		start      uint64
+		expectFail bool
 	}{
 		{
-			desc:           "Initial allocation succeeds",
-			usage:          &usageSegmentDataSlices{},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    0,
-			minUnallocated: 0,
+			desc:      "Initial allocation succeeds",
+			usage:     &usageSegmentDataSlices{},
+			length:    page,
+			alignment: page,
+			start:     chunkSize - page, // Grows by chunkSize, allocate down.
 		},
 		{
-			desc: "Allocation begins at start of file",
+			desc: "Allocation finds empty space at start of file",
 			usage: &usageSegmentDataSlices{
 				Start:  []uint64{page},
 				End:    []uint64{2 * page},
 				Values: []usageInfo{{refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    0,
-			minUnallocated: 0,
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     0,
+		},
+		{
+			desc: "Allocation finds empty space at end of file",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{0},
+				End:    []uint64{page},
+				Values: []usageInfo{{refs: 1}},
+			},
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     page,
 		},
 		{
 			desc: "In-use frames are not allocatable",
@@ -64,11 +74,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 2 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     3 * page, // Double fileSize, allocate top-down.
 		},
 		{
 			desc: "Reclaimable frames are not allocatable",
@@ -77,11 +86,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 2 * page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    3 * page,
-			minUnallocated: 3 * page,
+			fileSize:  3 * page,
+			length:    page,
+			alignment: page,
+			start:     5 * page, // Double fileSize, grow down.
 		},
 		{
 			desc: "Gaps between in-use frames are allocatable",
@@ -90,11 +98,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    page,
-			minUnallocated: page,
+			fileSize:  3 * page,
+			length:    page,
+			alignment: page,
+			start:     page,
 		},
 		{
 			desc: "Inadequately-sized gaps are rejected",
@@ -103,14 +110,13 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         2 * page,
-			alignment:      page,
-			unallocated:    3 * page,
-			minUnallocated: page,
+			fileSize:  3 * page,
+			length:    2 * page,
+			alignment: page,
+			start:     4 * page, // Double fileSize, grow down.
 		},
 		{
-			desc: "Hugepage alignment is honored",
+			desc: "Alignment is honored at end of file",
 			usage: &usageSegmentDataSlices{
 				Start: []uint64{0, hugepage + page},
 				// Hugepage-sized gap here that shouldn't be allocated from
@@ -118,37 +124,95 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, hugepage + 2*page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         hugepage,
-			alignment:      hugepage,
-			unallocated:    2 * hugepage,
-			minUnallocated: page,
+			fileSize:  hugepage + 2*page,
+			length:    hugepage,
+			alignment: hugepage,
+			start:     3 * hugepage, // Double fileSize until alignment is satisfied, grow down.
+		},
+		{
+			desc: "Alignment is honored before end of file",
+			usage: &usageSegmentDataSlices{
+				Start: []uint64{0, 2*hugepage + page},
+				// Page will need to be shifted down from top.
+				End:    []uint64{page, 2*hugepage + 2*page},
+				Values: []usageInfo{{refs: 1}, {refs: 1}},
+			},
+			fileSize:  2*hugepage + 2*page,
+			length:    hugepage,
+			alignment: hugepage,
+			start:     hugepage,
 		},
 		{
-			desc: "Pages before start ignored",
+			desc: "Allocations are compact if possible",
 			usage: &usageSegmentDataSlices{
 				Start:  []uint64{page, 3 * page},
 				End:    []uint64{2 * page, 4 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          page,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  4 * page,
+			length:    page,
+			alignment: page,
+			start:     2 * page,
+		},
+		{
+			desc: "Top-down allocation within one gap",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, 4 * page, 7 * page},
+				End:    []uint64{2 * page, 5 * page, 8 * page},
+				Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+			},
+			fileSize:  8 * page,
+			length:    page,
+			alignment: page,
+			start:     6 * page,
+		},
+		{
+			desc: "Top-down allocation between multiple gaps",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, 3 * page, 5 * page},
+				End:    []uint64{2 * page, 4 * page, 6 * page},
+				Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+			},
+			fileSize:  6 * page,
+			length:    page,
+			alignment: page,
+			start:     4 * page,
 		},
 		{
-			desc: "start may be in the middle of segment",
+			desc: "Top-down allocation with large top gap",
 			usage: &usageSegmentDataSlices{
-				Start:  []uint64{0, 3 * page},
+				Start:  []uint64{page, 3 * page},
 				End:    []uint64{2 * page, 4 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          page,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  8 * page,
+			length:    page,
+			alignment: page,
+			start:     7 * page,
+		},
+		{
+			desc: "Gaps found with possible overflow",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, topPage - page},
+				End:    []uint64{2 * page, topPage},
+				Values: []usageInfo{{refs: 1}, {refs: 1}},
+			},
+			fileSize:  topPage,
+			length:    page,
+			alignment: page,
+			start:     topPage - 2*page,
+		},
+		{
+			desc: "Overflow detected",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page},
+				End:    []uint64{topPage},
+				Values: []usageInfo{{refs: 1}},
+			},
+			fileSize:   topPage,
+			length:     2 * page,
+			alignment:  page,
+			expectFail: true,
 		},
 	} {
 		t.Run(test.desc, func(t *testing.T) {
@@ -156,12 +220,18 @@ func TestFindUnallocatedRange(t *testing.T) {
 			if err := usage.ImportSortedSlices(test.usage); err != nil {
 				t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err)
 			}
-			unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment)
-			if unallocated != test.unallocated {
-				t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated)
+			fr, ok := findAvailableRange(&usage, test.fileSize, test.length, test.alignment)
+			if !test.expectFail && !ok {
+				t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, false wanted %x, true", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+			}
+			if test.expectFail && ok {
+				t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, true wanted %x, false", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+			}
+			if ok && fr.Start != test.start {
+				t.Errorf("findAvailableRange(%v, %x, %x, %x): got start=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
 			}
-			if minUnallocated != test.minUnallocated {
-				t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated)
+			if ok && fr.End != test.start+test.length {
+				t.Errorf("findAvailableRange(%v, %x, %x, %x): got end=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.End, test.start+test.length)
 			}
 		})
 	}
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 159f7eafd..4792454c4 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -6,8 +6,8 @@ go_library(
     name = "kvm",
     srcs = [
         "address_space.go",
-        "allocator.go",
         "bluepill.go",
+        "bluepill_allocator.go",
         "bluepill_amd64.go",
         "bluepill_amd64.s",
         "bluepill_amd64_unsafe.go",
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index be213bfe8..faf1d5e1c 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -26,16 +26,15 @@ import (
 
 // dirtySet tracks vCPUs for invalidation.
 type dirtySet struct {
-	vCPUs []uint64
+	vCPUMasks []uint64
 }
 
 // forEach iterates over all CPUs in the dirty set.
+//
+//go:nosplit
 func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
-	m.mu.RLock()
-	defer m.mu.RUnlock()
-
-	for index := range ds.vCPUs {
-		mask := atomic.SwapUint64(&ds.vCPUs[index], 0)
+	for index := range ds.vCPUMasks {
+		mask := atomic.SwapUint64(&ds.vCPUMasks[index], 0)
 		if mask != 0 {
 			for bit := 0; bit < 64; bit++ {
 				if mask&(1<<uint64(bit)) == 0 {
@@ -54,7 +53,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
 	index := uint64(c.id) / 64
 	bit := uint64(1) << uint(c.id%64)
 
-	oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+	oldValue := atomic.LoadUint64(&ds.vCPUMasks[index])
 	if oldValue&bit != 0 {
 		return false // Not clean.
 	}
@@ -62,7 +61,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
 	// Set the bit unilaterally, and ensure that a flush takes place. Note
 	// that it's possible for races to occur here, but since the flush is
 	// taking place long after these lines there's no race in practice.
-	atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+	atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
 	return true // Previously clean.
 }
 
@@ -113,7 +112,12 @@ type hostMapEntry struct {
 	length uintptr
 }
 
-func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
+// mapLocked maps the given host entry.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
 	for m.length > 0 {
 		physical, length, ok := translateToPhysical(m.addr)
 		if !ok {
@@ -133,18 +137,10 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
 		// important; if the pagetable mappings were installed before
 		// ensuring the physical pages were available, then some other
 		// thread could theoretically access them.
-		//
-		// Due to the way KVM's shadow paging implementation works,
-		// modifications to the page tables while in host mode may not
-		// be trapped, leading to the shadow pages being out of sync.
-		// Therefore, we need to ensure that we are in guest mode for
-		// page table modifications. See the call to bluepill, below.
-		as.machine.retryInGuest(func() {
-			inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
-				AccessType: at,
-				User:       true,
-			}, physical) || inv
-		})
+		inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
+			AccessType: at,
+			User:       true,
+		}, physical) || inv
 		m.addr += length
 		m.length -= length
 		addr += usermem.Addr(length)
@@ -176,6 +172,10 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 		return err
 	}
 
+	// See block in mapLocked.
+	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+
 	// Map the mappings in the sentry's address space (guest physical memory)
 	// into the application's address space (guest virtual memory).
 	inv := false
@@ -190,7 +190,12 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 				_ = s[i] // Touch to commit.
 			}
 		}
-		prev := as.mapHost(addr, hostMapEntry{
+
+		// See bluepill_allocator.go.
+		bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+		// Perform the mapping.
+		prev := as.mapLocked(addr, hostMapEntry{
 			addr:   b.Addr(),
 			length: uintptr(b.Len()),
 		}, at)
@@ -204,17 +209,27 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 	return nil
 }
 
+// unmapLocked is an escape-checked wrapped around Unmap.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) unmapLocked(addr usermem.Addr, length uint64) bool {
+	return as.pageTables.Unmap(addr, uintptr(length))
+}
+
 // Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
 func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
 	as.mu.Lock()
 	defer as.mu.Unlock()
 
-	// See above re: retryInGuest.
-	var prev bool
-	as.machine.retryInGuest(func() {
-		prev = as.pageTables.Unmap(addr, uintptr(length)) || prev
-	})
-	if prev {
+	// See above & bluepill_allocator.go.
+	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+	bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+	if prev := as.unmapLocked(addr, length); prev {
+		// Invalidate all active vCPUs.
 		as.invalidate()
 
 		// Recycle any freed intermediate pages.
@@ -227,7 +242,7 @@ func (as *addressSpace) Release() {
 	as.Unmap(0, ^uint64(0))
 
 	// Free all pages from the allocator.
-	as.pageTables.Allocator.(allocator).base.Drain()
+	as.pageTables.Allocator.(*allocator).base.Drain()
 
 	// Drop all cached machine references.
 	as.machine.dropPageTables(as.pageTables)
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 35cd55fef..4b23f7803 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -81,12 +81,6 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
 	// Save the death message, which will be thrown.
 	c.dieState.message = msg
 
-	// Reload all registers to have an accurate stack trace when we return
-	// to host mode. This means that the stack should be unwound correctly.
-	if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
-		throw(msg)
-	}
-
 	// Setup the trampoline.
 	dieArchSetup(c, context, &c.dieState.guestRegs)
 }
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/bluepill_allocator.go
index 3f35414bb..9485e1301 100644
--- a/pkg/sentry/platform/kvm/allocator.go
+++ b/pkg/sentry/platform/kvm/bluepill_allocator.go
@@ -21,56 +21,80 @@ import (
 )
 
 type allocator struct {
-	base *pagetables.RuntimeAllocator
+	base pagetables.RuntimeAllocator
+
+	// cpu must be set prior to any pagetable operation.
+	//
+	// Due to the way KVM's shadow paging implementation works,
+	// modifications to the page tables while in host mode may not be
+	// trapped, leading to the shadow pages being out of sync.  Therefore,
+	// we need to ensure that we are in guest mode for page table
+	// modifications. See the call to bluepill, below.
+	cpu *vCPU
 }
 
 // newAllocator is used to define the allocator.
-func newAllocator() allocator {
-	return allocator{
-		base: pagetables.NewRuntimeAllocator(),
-	}
+func newAllocator() *allocator {
+	a := new(allocator)
+	a.base.Init()
+	return a
 }
 
 // NewPTEs implements pagetables.Allocator.NewPTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) NewPTEs() *pagetables.PTEs {
-	return a.base.NewPTEs()
+func (a *allocator) NewPTEs() *pagetables.PTEs {
+	ptes := a.base.NewPTEs() // escapes: bluepill below.
+	if a.cpu != nil {
+		bluepill(a.cpu)
+	}
+	return ptes
 }
 
 // PhysicalFor returns the physical address for a set of PTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
+func (a *allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
 	virtual := a.base.PhysicalFor(ptes)
 	physical, _, ok := translateToPhysical(virtual)
 	if !ok {
-		panic(fmt.Sprintf("PhysicalFor failed for %p", ptes))
+		panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) // escapes: panic.
 	}
 	return physical
 }
 
 // LookupPTEs implements pagetables.Allocator.LookupPTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
+func (a *allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
 	virtualStart, physicalStart, _, ok := calculateBluepillFault(physical, physicalRegions)
 	if !ok {
-		panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical))
+		panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) // escapes: panic.
 	}
 	return a.base.LookupPTEs(virtualStart + (physical - physicalStart))
 }
 
 // FreePTEs implements pagetables.Allocator.FreePTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) FreePTEs(ptes *pagetables.PTEs) {
-	a.base.FreePTEs(ptes)
+func (a *allocator) FreePTEs(ptes *pagetables.PTEs) {
+	a.base.FreePTEs(ptes) // escapes: bluepill below.
+	if a.cpu != nil {
+		bluepill(a.cpu)
+	}
 }
 
 // Recycle implements pagetables.Allocator.Recycle.
 //
 //go:nosplit
-func (a allocator) Recycle() {
+func (a *allocator) Recycle() {
 	a.base.Recycle()
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index 133c2203d..ddc1554d5 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -63,6 +63,8 @@ func bluepillArchEnter(context *arch.SignalContext64) *vCPU {
 
 // KernelSyscall handles kernel syscalls.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelSyscall() {
 	regs := c.Registers()
@@ -72,13 +74,15 @@ func (c *vCPU) KernelSyscall() {
 	// We only trigger a bluepill entry in the bluepill function, and can
 	// therefore be guaranteed that there is no floating point state to be
 	// loaded on resuming from halt. We only worry about saving on exit.
-	ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+	ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
 	ring0.Halt()
-	ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+	ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment.
 }
 
 // KernelException handles kernel exceptions.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelException(vector ring0.Vector) {
 	regs := c.Registers()
@@ -89,9 +93,9 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
 		regs.Rip = 0
 	}
 	// See above.
-	ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+	ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
 	ring0.Halt()
-	ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+	ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment.
 }
 
 // bluepillArchExit is called during bluepillEnter.
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index a63a6a071..99cac665d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -31,6 +31,12 @@ import (
 //
 //go:nosplit
 func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+	// Reload all registers to have an accurate stack trace when we return
+	// to host mode. This means that the stack should be unwound correctly.
+	if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
+		throw(c.dieState.message)
+	}
+
 	// If the vCPU is in user mode, we set the stack to the stored stack
 	// value in the vCPU itself. We don't want to unwind the user stack.
 	if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet {
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index 552341721..83643c602 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -54,26 +54,60 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
 	context.Pstate = regs.Pstate
 	context.Pstate &^= uint64(ring0.UserFlagsClear)
 	context.Pstate |= ring0.UserFlagsSet
+
+	lazyVfp := c.GetLazyVFP()
+	if lazyVfp != 0 {
+		fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+		context.Fpsimd64.Fpsr = fpsimd.Fpsr
+		context.Fpsimd64.Fpcr = fpsimd.Fpcr
+		context.Fpsimd64.Vregs = fpsimd.Vregs
+	}
 }
 
 // KernelSyscall handles kernel syscalls.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelSyscall() {
 	regs := c.Registers()
 	if regs.Regs[8] != ^uint64(0) {
 		regs.Pc -= 4 // Rewind.
 	}
+
+	vfpEnable := ring0.CPACREL1()
+	if vfpEnable != 0 {
+		fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+		fpcr := ring0.GetFPCR()
+		fpsr := ring0.GetFPSR()
+		fpsimd.Fpcr = uint32(fpcr)
+		fpsimd.Fpsr = uint32(fpsr)
+		ring0.SaveVRegs((*byte)(c.floatingPointState))
+	}
+
 	ring0.Halt()
 }
 
 // KernelException handles kernel exceptions.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelException(vector ring0.Vector) {
 	regs := c.Registers()
 	if vector == ring0.Vector(bounce) {
 		regs.Pc = 0
 	}
+
+	vfpEnable := ring0.CPACREL1()
+	if vfpEnable != 0 {
+		fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+		fpcr := ring0.GetFPCR()
+		fpsr := ring0.GetFPSR()
+		fpsimd.Fpcr = uint32(fpcr)
+		fpsimd.Fpsr = uint32(fpsr)
+		ring0.SaveVRegs((*byte)(c.floatingPointState))
+	}
+
 	ring0.Halt()
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
index c61700892..04efa0147 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.s
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -82,6 +82,8 @@ fallback:
 
 // dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation.
 TEXT ·dieTrampoline(SB),NOSPLIT,$0
-	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
-	MOVD	R9, 8(RSP)
-	BL	·dieHandler(SB)
+	// R0: Fake the old PC as caller
+	// R1: First argument (vCPU)
+	MOVD.P R1, 8(RSP) // R1: First argument (vCPU)
+	MOVD.P R0, 8(RSP) // R0: Fake the old PC as caller
+	B ·dieHandler(SB)
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 2f02c03cf..4ca2b7717 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -17,10 +17,47 @@
 package kvm
 
 import (
+	"unsafe"
+
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
+// fpsimdPtr returns a fpsimd64 for the given address.
+//
+//go:nosplit
+func fpsimdPtr(addr *byte) *arch.FpsimdContext {
+	return (*arch.FpsimdContext)(unsafe.Pointer(addr))
+}
+
+// dieArchSetup initialies the state for dieTrampoline.
+//
+// The arm64 dieTrampoline requires the vCPU to be set in R1, and the last PC
+// to be in R0. The trampoline then simulates a call to dieHandler from the
+// provided PC.
+//
 //go:nosplit
 func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
-	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
+	// If the vCPU is in user mode, we set the stack to the stored stack
+	// value in the vCPU itself. We don't want to unwind the user stack.
+	if guestRegs.Regs.Pstate&ring0.PSR_MODE_MASK == ring0.PSR_MODE_EL0t {
+		regs := c.CPU.Registers()
+		context.Regs[0] = regs.Regs[0]
+		context.Sp = regs.Sp
+		context.Regs[29] = regs.Regs[29] // stack base address
+	} else {
+		context.Regs[0] = guestRegs.Regs.Pc
+		context.Sp = guestRegs.Regs.Sp
+		context.Regs[29] = guestRegs.Regs.Regs[29]
+		context.Pstate = guestRegs.Regs.Pstate
+	}
+	context.Regs[1] = uint64(uintptr(unsafe.Pointer(c)))
+	context.Pc = uint64(dieTrampolineAddr)
+}
+
+// bluepillArchFpContext returns the arch-specific fpsimd context.
+//
+//go:nosplit
+func bluepillArchFpContext(context unsafe.Pointer) *arch.FpsimdContext {
+	return &((*arch.SignalContext64)(context).Fpsimd64)
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 9add7c944..c025aa0bb 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -64,6 +64,8 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
 // signal stack. It should only execute raw system calls and functions that are
 // explicitly marked go:nosplit.
 //
+// +checkescape:all
+//
 //go:nosplit
 func bluepillHandler(context unsafe.Pointer) {
 	// Sanitize the registers; interrupts must always be disabled.
@@ -82,7 +84,8 @@ func bluepillHandler(context unsafe.Pointer) {
 	}
 
 	for {
-		switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno {
+		_, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
+		switch errno {
 		case 0: // Expected case.
 		case syscall.EINTR:
 			// First, we process whatever pending signal
@@ -90,7 +93,7 @@ func bluepillHandler(context unsafe.Pointer) {
 			// currently, all signals are masked and the signal
 			// must have been delivered directly to this thread.
 			timeout := syscall.Timespec{}
-			sig, _, errno := syscall.RawSyscall6(
+			sig, _, errno := syscall.RawSyscall6( // escapes: no.
 				syscall.SYS_RT_SIGTIMEDWAIT,
 				uintptr(unsafe.Pointer(&bounceSignalMask)),
 				0,                                 // siginfo.
@@ -125,7 +128,7 @@ func bluepillHandler(context unsafe.Pointer) {
 			// MMIO exit we receive EFAULT from the run ioctl. We
 			// always inject an NMI here since we may be in kernel
 			// mode and have interrupts disabled.
-			if _, _, errno := syscall.RawSyscall(
+			if _, _, errno := syscall.RawSyscall( // escapes: no.
 				syscall.SYS_IOCTL,
 				uintptr(c.fd),
 				_KVM_NMI, 0); errno != 0 {
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index c769ac7b4..6507121ea 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -85,3 +85,6 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a
 func (c *context) Interrupt() {
 	c.interrupt.NotifyInterrupt()
 }
+
+// Release implements platform.Context.Release().
+func (c *context) Release() {}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index 972ba85c3..ae813e24e 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -27,6 +27,38 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+	slot          uint32
+	flags         uint32
+	guestPhysAddr uint64
+	memorySize    uint64
+	userspaceAddr uint64
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that doesn't appear to be supported by my kernel at least).
+//
+// This mirrors kvm_run.
+type runData struct {
+	requestInterruptWindow uint8
+	_                      [7]uint8
+
+	exitReason                 uint32
+	readyForInterruptInjection uint8
+	ifFlag                     uint8
+	_                          [2]uint8
+
+	cr8      uint64
+	apicBase uint64
+
+	// This is the union data for exits. Interpretation depends entirely on
+	// the exitReason above (see vCPU code for more information).
+	data [32]uint64
+}
+
 // KVM represents a lightweight VM context.
 type KVM struct {
 	platform.NoCPUPreemptionDetection
@@ -159,6 +191,11 @@ func (*constructor) OpenDevice() (*os.File, error) {
 	return OpenDevice()
 }
 
+// Flags implements platform.Constructor.Flags().
+func (*constructor) Requirements() platform.Requirements {
+	return platform.Requirements{}
+}
+
 func init() {
 	platform.Register("kvm", &constructor{})
 }
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
index c5a6f9c7d..093497bc4 100644
--- a/pkg/sentry/platform/kvm/kvm_amd64.go
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -21,17 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
-// userMemoryRegion is a region of physical memory.
-//
-// This mirrors kvm_memory_region.
-type userMemoryRegion struct {
-	slot          uint32
-	flags         uint32
-	guestPhysAddr uint64
-	memorySize    uint64
-	userspaceAddr uint64
-}
-
 // userRegs represents KVM user registers.
 //
 // This mirrors kvm_regs.
@@ -169,27 +158,6 @@ type modelControlRegisters struct {
 	entries [16]modelControlRegister
 }
 
-// runData is the run structure. This may be mapped for synchronous register
-// access (although that doesn't appear to be supported by my kernel at least).
-//
-// This mirrors kvm_run.
-type runData struct {
-	requestInterruptWindow uint8
-	_                      [7]uint8
-
-	exitReason                 uint32
-	readyForInterruptInjection uint8
-	ifFlag                     uint8
-	_                          [2]uint8
-
-	cr8      uint64
-	apicBase uint64
-
-	// This is the union data for exits. Interpretation depends entirely on
-	// the exitReason above (see vCPU code for more information).
-	data [32]uint64
-}
-
 // cpuidEntry is a single CPUID entry.
 //
 // This mirrors kvm_cpuid_entry2.
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 2319c86d3..29d457a7e 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -17,20 +17,10 @@
 package kvm
 
 import (
-	"syscall"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
-// userMemoryRegion is a region of physical memory.
-//
-// This mirrors kvm_memory_region.
-type userMemoryRegion struct {
-	slot          uint32
-	flags         uint32
-	guestPhysAddr uint64
-	memorySize    uint64
-	userspaceAddr uint64
-}
-
 type kvmOneReg struct {
 	id   uint64
 	addr uint64
@@ -46,38 +36,17 @@ type userFpsimdState struct {
 }
 
 type userRegs struct {
-	Regs    syscall.PtraceRegs
+	Regs    arch.Registers
 	sp_el1  uint64
 	elr_el1 uint64
 	spsr    [KVM_NR_SPSR]uint64
 	fpRegs  userFpsimdState
 }
 
-// runData is the run structure. This may be mapped for synchronous register
-// access (although that doesn't appear to be supported by my kernel at least).
-//
-// This mirrors kvm_run.
-type runData struct {
-	requestInterruptWindow uint8
-	_                      [7]uint8
-
-	exitReason                 uint32
-	readyForInterruptInjection uint8
-	ifFlag                     uint8
-	_                          [2]uint8
-
-	cr8      uint64
-	apicBase uint64
-
-	// This is the union data for exits. Interpretation depends entirely on
-	// the exitReason above (see vCPU code for more information).
-	data [32]uint64
-}
-
 // updateGlobalOnce does global initialization. It has to be called only once.
 func updateGlobalOnce(fd int) error {
 	physicalInit()
 	err := updateSystemValues(int(fd))
-	updateVectorTable()
+	ring0.Init()
 	return err
 }
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
index 5a74c6e36..531ae8b1e 100644
--- a/pkg/sentry/platform/kvm/kvm_const_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -19,30 +19,31 @@ const (
 	_KVM_GET_ONE_REG = 0x4010aeab
 	_KVM_SET_ONE_REG = 0x4010aeac
 
-	_KVM_ARM_PREFERRED_TARGET = 0x8020aeaf
-	_KVM_ARM_VCPU_INIT        = 0x4020aeae
-	_KVM_ARM64_REGS_PSTATE    = 0x6030000000100042
-	_KVM_ARM64_REGS_SP_EL1    = 0x6030000000100044
-	_KVM_ARM64_REGS_R0        = 0x6030000000100000
-	_KVM_ARM64_REGS_R1        = 0x6030000000100002
-	_KVM_ARM64_REGS_R2        = 0x6030000000100004
-	_KVM_ARM64_REGS_R3        = 0x6030000000100006
-	_KVM_ARM64_REGS_R8        = 0x6030000000100010
-	_KVM_ARM64_REGS_R18       = 0x6030000000100024
-	_KVM_ARM64_REGS_PC        = 0x6030000000100040
-	_KVM_ARM64_REGS_MAIR_EL1  = 0x603000000013c510
-	_KVM_ARM64_REGS_TCR_EL1   = 0x603000000013c102
-	_KVM_ARM64_REGS_TTBR0_EL1 = 0x603000000013c100
-	_KVM_ARM64_REGS_TTBR1_EL1 = 0x603000000013c101
-	_KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080
-	_KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082
-	_KVM_ARM64_REGS_VBAR_EL1  = 0x603000000013c600
+	_KVM_ARM_TARGET_GENERIC_V8 = 5
+	_KVM_ARM_PREFERRED_TARGET  = 0x8020aeaf
+	_KVM_ARM_VCPU_INIT         = 0x4020aeae
+	_KVM_ARM64_REGS_PSTATE     = 0x6030000000100042
+	_KVM_ARM64_REGS_SP_EL1     = 0x6030000000100044
+	_KVM_ARM64_REGS_R0         = 0x6030000000100000
+	_KVM_ARM64_REGS_R1         = 0x6030000000100002
+	_KVM_ARM64_REGS_R2         = 0x6030000000100004
+	_KVM_ARM64_REGS_R3         = 0x6030000000100006
+	_KVM_ARM64_REGS_R8         = 0x6030000000100010
+	_KVM_ARM64_REGS_R18        = 0x6030000000100024
+	_KVM_ARM64_REGS_PC         = 0x6030000000100040
+	_KVM_ARM64_REGS_MAIR_EL1   = 0x603000000013c510
+	_KVM_ARM64_REGS_TCR_EL1    = 0x603000000013c102
+	_KVM_ARM64_REGS_TTBR0_EL1  = 0x603000000013c100
+	_KVM_ARM64_REGS_TTBR1_EL1  = 0x603000000013c101
+	_KVM_ARM64_REGS_SCTLR_EL1  = 0x603000000013c080
+	_KVM_ARM64_REGS_CPACR_EL1  = 0x603000000013c082
+	_KVM_ARM64_REGS_VBAR_EL1   = 0x603000000013c600
 )
 
 // Arm64: Architectural Feature Access Control Register EL1.
 const (
-	_FPEN_NOTRAP = 0x3
-	_FPEN_SHIFT  = 0x20
+	_FPEN_NOTRAP = 3
+	_FPEN_SHIFT  = 20
 )
 
 // Arm64: System Control Register EL1.
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index c42752d50..6c8f4fa28 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -117,10 +117,10 @@ func TestKernelFloatingPoint(t *testing.T) {
 	})
 }
 
-func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *syscall.PtraceRegs, *pagetables.PageTables) bool) {
+func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) {
 	// Initialize registers & page tables.
 	var (
-		regs syscall.PtraceRegs
+		regs arch.Registers
 		pt   *pagetables.PageTables
 	)
 	testutil.SetTestTarget(&regs, target)
@@ -154,7 +154,7 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func
 }
 
 func TestApplicationSyscall(t *testing.T) {
-	applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -168,7 +168,7 @@ func TestApplicationSyscall(t *testing.T) {
 		}
 		return false
 	})
-	applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -184,7 +184,7 @@ func TestApplicationSyscall(t *testing.T) {
 }
 
 func TestApplicationFault(t *testing.T) {
-	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTouchTarget(regs, nil) // Cause fault.
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
@@ -199,7 +199,7 @@ func TestApplicationFault(t *testing.T) {
 		}
 		return false
 	})
-	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTouchTarget(regs, nil) // Cause fault.
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
@@ -216,7 +216,7 @@ func TestApplicationFault(t *testing.T) {
 }
 
 func TestRegistersSyscall(t *testing.T) {
-	applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTestRegs(regs) // Fill values for all registers.
 		for {
 			var si arch.SignalInfo
@@ -239,7 +239,7 @@ func TestRegistersSyscall(t *testing.T) {
 }
 
 func TestRegistersFault(t *testing.T) {
-	applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTestRegs(regs) // Fill values for all registers.
 		for {
 			var si arch.SignalInfo
@@ -263,7 +263,7 @@ func TestRegistersFault(t *testing.T) {
 }
 
 func TestSegments(t *testing.T) {
-	applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTestSegments(regs)
 		for {
 			var si arch.SignalInfo
@@ -287,7 +287,7 @@ func TestSegments(t *testing.T) {
 }
 
 func TestBounce(t *testing.T) {
-	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		go func() {
 			time.Sleep(time.Millisecond)
 			c.BounceToKernel()
@@ -302,7 +302,7 @@ func TestBounce(t *testing.T) {
 		}
 		return false
 	})
-	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		go func() {
 			time.Sleep(time.Millisecond)
 			c.BounceToKernel()
@@ -321,7 +321,7 @@ func TestBounce(t *testing.T) {
 }
 
 func TestBounceStress(t *testing.T) {
-	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		randomSleep := func() {
 			// O(hundreds of microseconds) is appropriate to ensure
 			// different overlaps and different schedules.
@@ -357,7 +357,7 @@ func TestBounceStress(t *testing.T) {
 
 func TestInvalidate(t *testing.T) {
 	var data uintptr // Used below.
-	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		testutil.SetTouchTarget(regs, &data) // Read legitimate value.
 		for {
 			var si arch.SignalInfo
@@ -398,7 +398,7 @@ func IsFault(err error, si *arch.SignalInfo) bool {
 }
 
 func TestEmptyAddressSpace(t *testing.T) {
-	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -412,7 +412,7 @@ func TestEmptyAddressSpace(t *testing.T) {
 		}
 		return false
 	})
-	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -471,7 +471,7 @@ func BenchmarkApplicationSyscall(b *testing.B) {
 		i int // Iteration includes machine.Get() / machine.Put().
 		a int // Count for ErrContextInterrupt.
 	)
-	applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
@@ -493,7 +493,7 @@ func BenchmarkApplicationSyscall(b *testing.B) {
 
 func BenchmarkKernelSyscall(b *testing.B) {
 	// Note that the target passed here is irrelevant, we never execute SwitchToUser.
-	applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		// iteration does not include machine.Get() / machine.Put().
 		for i := 0; i < b.N; i++ {
 			testutil.Getpid()
@@ -508,7 +508,7 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) {
 		i int
 		a int
 	)
-	applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+	applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool {
 		var si arch.SignalInfo
 		if _, err := c.SwitchToUser(ring0.SwitchOpts{
 			Registers:          regs,
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 8076c7529..6c54712d1 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -52,16 +52,19 @@ type machine struct {
 	// available is notified when vCPUs are available.
 	available sync.Cond
 
-	// vCPUs are the machine vCPUs.
+	// vCPUsByTID are the machine vCPUs.
 	//
 	// These are populated dynamically.
-	vCPUs map[uint64]*vCPU
+	vCPUsByTID map[uint64]*vCPU
 
 	// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
-	vCPUsByID map[int]*vCPU
+	vCPUsByID []*vCPU
 
 	// maxVCPUs is the maximum number of vCPUs supported by the machine.
 	maxVCPUs int
+
+	// nextID is the next vCPU ID.
+	nextID uint32
 }
 
 const (
@@ -137,9 +140,8 @@ type dieState struct {
 //
 // Precondition: mu must be held.
 func (m *machine) newVCPU() *vCPU {
-	id := len(m.vCPUs)
-
 	// Create the vCPU.
+	id := int(atomic.AddUint32(&m.nextID, 1) - 1)
 	fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
 	if errno != 0 {
 		panic(fmt.Sprintf("error creating new vCPU: %v", errno))
@@ -176,11 +178,7 @@ func (m *machine) newVCPU() *vCPU {
 // newMachine returns a new VM context.
 func newMachine(vm int) (*machine, error) {
 	// Create the machine.
-	m := &machine{
-		fd:        vm,
-		vCPUs:     make(map[uint64]*vCPU),
-		vCPUsByID: make(map[int]*vCPU),
-	}
+	m := &machine{fd: vm}
 	m.available.L = &m.mu
 	m.kernel.Init(ring0.KernelOpts{
 		PageTables: pagetables.New(newAllocator()),
@@ -194,6 +192,10 @@ func newMachine(vm int) (*machine, error) {
 	}
 	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
 
+	// Create the vCPUs map/slices.
+	m.vCPUsByTID = make(map[uint64]*vCPU)
+	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
+
 	// Apply the physical mappings. Note that these mappings may point to
 	// guest physical addresses that are not actually available. These
 	// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -274,6 +276,8 @@ func newMachine(vm int) (*machine, error) {
 // not available. This attempts to be efficient for calls in the hot path.
 //
 // This panics on error.
+//
+//go:nosplit
 func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) {
 	for end := physical + length; physical < end; {
 		_, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions)
@@ -304,7 +308,11 @@ func (m *machine) Destroy() {
 	runtime.SetFinalizer(m, nil)
 
 	// Destroy vCPUs.
-	for _, c := range m.vCPUs {
+	for _, c := range m.vCPUsByID {
+		if c == nil {
+			continue
+		}
+
 		// Ensure the vCPU is not still running in guest mode. This is
 		// possible iff teardown has been done by other threads, and
 		// somehow a single thread has not executed any system calls.
@@ -329,13 +337,15 @@ func (m *machine) Destroy() {
 }
 
 // Get gets an available vCPU.
+//
+// This will return with the OS thread locked.
 func (m *machine) Get() *vCPU {
+	m.mu.RLock()
 	runtime.LockOSThread()
 	tid := procid.Current()
-	m.mu.RLock()
 
 	// Check for an exact match.
-	if c := m.vCPUs[tid]; c != nil {
+	if c := m.vCPUsByTID[tid]; c != nil {
 		c.lock()
 		m.mu.RUnlock()
 		return c
@@ -343,15 +353,29 @@ func (m *machine) Get() *vCPU {
 
 	// The happy path failed. We now proceed to acquire an exclusive lock
 	// (because the vCPU map may change), and scan all available vCPUs.
+	// In this case, we first unlock the OS thread. Otherwise, if mu is
+	// not available, the current system thread will be parked and a new
+	// system thread spawned. We avoid this situation by simply refreshing
+	// tid after relocking the system thread.
 	m.mu.RUnlock()
+	runtime.UnlockOSThread()
 	m.mu.Lock()
+	runtime.LockOSThread()
+	tid = procid.Current()
+
+	// Recheck for an exact match.
+	if c := m.vCPUsByTID[tid]; c != nil {
+		c.lock()
+		m.mu.Unlock()
+		return c
+	}
 
 	for {
 		// Scan for an available vCPU.
-		for origTID, c := range m.vCPUs {
+		for origTID, c := range m.vCPUsByTID {
 			if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
-				delete(m.vCPUs, origTID)
-				m.vCPUs[tid] = c
+				delete(m.vCPUsByTID, origTID)
+				m.vCPUsByTID[tid] = c
 				m.mu.Unlock()
 				c.loadSegments(tid)
 				return c
@@ -359,17 +383,17 @@ func (m *machine) Get() *vCPU {
 		}
 
 		// Create a new vCPU (maybe).
-		if len(m.vCPUs) < m.maxVCPUs {
+		if int(m.nextID) < m.maxVCPUs {
 			c := m.newVCPU()
 			c.lock()
-			m.vCPUs[tid] = c
+			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
 			c.loadSegments(tid)
 			return c
 		}
 
 		// Scan for something not in user mode.
-		for origTID, c := range m.vCPUs {
+		for origTID, c := range m.vCPUsByTID {
 			if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
 				continue
 			}
@@ -387,8 +411,8 @@ func (m *machine) Get() *vCPU {
 			}
 
 			// Steal the vCPU.
-			delete(m.vCPUs, origTID)
-			m.vCPUs[tid] = c
+			delete(m.vCPUsByTID, origTID)
+			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
 			c.loadSegments(tid)
 			return c
@@ -415,7 +439,7 @@ func (m *machine) Put(c *vCPU) {
 // newDirtySet returns a new dirty set.
 func (m *machine) newDirtySet() *dirtySet {
 	return &dirtySet{
-		vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
+		vCPUMasks: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
 	}
 }
 
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 923ce3909..acc823ba6 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -51,9 +51,10 @@ func (m *machine) initArchState() error {
 		recover()
 		debug.SetPanicOnFault(old)
 	}()
-	m.retryInGuest(func() {
-		ring0.SetCPUIDFaulting(true)
-	})
+	c := m.Get()
+	defer m.Put(c)
+	bluepill(c)
+	ring0.SetCPUIDFaulting(true)
 
 	return nil
 }
@@ -89,8 +90,8 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 	defer m.mu.Unlock()
 
 	// Clear from all PCIDs.
-	for _, c := range m.vCPUs {
-		if c.PCIDs != nil {
+	for _, c := range m.vCPUsByID {
+		if c != nil && c.PCIDs != nil {
 			c.PCIDs.Drop(pt)
 		}
 	}
@@ -335,29 +336,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
 	}
 }
 
-// retryInGuest runs the given function in guest mode.
-//
-// If the function does not complete in guest mode (due to execution of a
-// system call due to a GC stall, for example), then it will be retried. The
-// given function must be idempotent as a result of the retry mechanism.
-func (m *machine) retryInGuest(fn func()) {
-	c := m.Get()
-	defer m.Put(c)
-	for {
-		c.ClearErrorCode() // See below.
-		bluepill(c)        // Force guest mode.
-		fn()               // Execute the given function.
-		_, user := c.ErrorCode()
-		if user {
-			// If user is set, then we haven't bailed back to host
-			// mode via a kernel exception or system call. We
-			// consider the full function to have executed in guest
-			// mode and we can return.
-			break
-		}
-	}
-}
-
 // On x86 platform, the flags for "setMemoryRegion" can always be set as 0.
 // There is no need to return read-only physicalRegions.
 func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 7156c245f..290f035dd 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -154,7 +154,7 @@ func (c *vCPU) setUserRegisters(uregs *userRegs) error {
 //
 //go:nosplit
 func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
-	if _, _, errno := syscall.RawSyscall(
+	if _, _, errno := syscall.RawSyscall( // escapes: no.
 		syscall.SYS_IOCTL,
 		uintptr(c.fd),
 		_KVM_GET_REGS,
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 09552837a..750751aa3 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -28,6 +28,10 @@ type vCPUArchState struct {
 	//
 	// This starts above fixedKernelPCID.
 	PCIDs *pagetables.PCIDs
+
+	// floatingPointState is the floating point state buffer used in guest
+	// to host transitions. See usage in bluepill_arm64.go.
+	floatingPointState *arch.FloatingPointData
 }
 
 const (
@@ -56,6 +60,12 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
 		if !vr.accessType.Write && vr.accessType.Read {
 			rdonlyRegions = append(rdonlyRegions, vr.region)
 		}
+
+		// TODO(gvisor.dev/issue/2686): PROT_NONE should be specially treated.
+		// Workaround: treated as rdonly temporarily.
+		if !vr.accessType.Write && !vr.accessType.Read && !vr.accessType.Execute {
+			rdonlyRegions = append(rdonlyRegions, vr.region)
+		}
 	})
 
 	for _, r := range rdonlyRegions {
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 1c8384e6b..3c02cef7c 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,30 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// setMemoryRegion initializes a region.
-//
-// This may be called from bluepillHandler, and therefore returns an errno
-// directly (instead of wrapping in an error) to avoid allocations.
-//
-//go:nosplit
-func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
-	userRegion := userMemoryRegion{
-		slot:          uint32(slot),
-		flags:         0,
-		guestPhysAddr: uint64(physical),
-		memorySize:    uint64(length),
-		userspaceAddr: uint64(virtual),
-	}
-
-	// Set the region.
-	_, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(m.fd),
-		_KVM_SET_USER_MEMORY_REGION,
-		uintptr(unsafe.Pointer(&userRegion)))
-	return errno
-}
-
 type kvmVcpuInit struct {
 	target   uint32
 	features [7]uint32
@@ -72,69 +48,6 @@ func (m *machine) initArchState() error {
 	return nil
 }
 
-func getPageWithReflect(p uintptr) []byte {
-	return (*(*[0xFFFFFF]byte)(unsafe.Pointer(p & ^uintptr(syscall.Getpagesize()-1))))[:syscall.Getpagesize()]
-}
-
-// Work around: move ring0.Vectors() into a specific address with 11-bits alignment.
-//
-// According to the design documentation of Arm64,
-// the start address of exception vector table should be 11-bits aligned.
-// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S
-// But, we can't align a function's start address to a specific address by using golang.
-// We have raised this question in golang community:
-// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
-// This function will be removed when golang supports this feature.
-//
-// There are 2 jobs were implemented in this function:
-// 1, move the start address of exception vector table into the specific address.
-// 2, modify the offset of each instruction.
-func updateVectorTable() {
-	fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
-	offset := fromLocation & (1<<11 - 1)
-	if offset != 0 {
-		offset = 1<<11 - offset
-	}
-
-	toLocation := fromLocation + offset
-	page := getPageWithReflect(toLocation)
-	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
-		panic(err)
-	}
-
-	page = getPageWithReflect(toLocation + 4096)
-	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
-		panic(err)
-	}
-
-	// Move exception-vector-table into the specific address.
-	var entry *uint32
-	var entryFrom *uint32
-	for i := 1; i <= 0x800; i++ {
-		entry = (*uint32)(unsafe.Pointer(toLocation + 0x800 - uintptr(i)))
-		entryFrom = (*uint32)(unsafe.Pointer(fromLocation + 0x800 - uintptr(i)))
-		*entry = *entryFrom
-	}
-
-	// The offset from the address of each unconditionally branch is changed.
-	// We should modify the offset of each instruction.
-	nums := []uint32{0x0, 0x80, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, 0x480, 0x500, 0x580, 0x600, 0x680, 0x700, 0x780}
-	for _, num := range nums {
-		entry = (*uint32)(unsafe.Pointer(toLocation + uintptr(num)))
-		*entry = *entry - (uint32)(offset/4)
-	}
-
-	page = getPageWithReflect(toLocation)
-	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
-		panic(err)
-	}
-
-	page = getPageWithReflect(toLocation + 4096)
-	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
-		panic(err)
-	}
-}
-
 // initArchState initializes architecture-specific state.
 func (c *vCPU) initArchState() error {
 	var (
@@ -147,6 +60,7 @@ func (c *vCPU) initArchState() error {
 	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
 	regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())
 
+	vcpuInit.target = _KVM_ARM_TARGET_GENERIC_V8
 	vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
 	if _, _, errno := syscall.RawSyscall(
 		syscall.SYS_IOCTL,
@@ -158,7 +72,8 @@ func (c *vCPU) initArchState() error {
 
 	// cpacr_el1
 	reg.id = _KVM_ARM64_REGS_CPACR_EL1
-	data = (_FPEN_NOTRAP << _FPEN_SHIFT)
+	// It is off by default, and it is turned on only when in use.
+	data = 0 // Disable fpsimd.
 	if err := c.setOneRegister(&reg); err != nil {
 		return err
 	}
@@ -250,6 +165,7 @@ func (c *vCPU) initArchState() error {
 		return err
 	}
 
+	c.floatingPointState = arch.NewFloatingPointData()
 	return nil
 }
 
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index f04be2ab5..9f86f6a7a 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -115,7 +115,7 @@ func (a *atomicAddressSpace) get() *addressSpace {
 //
 //go:nosplit
 func (c *vCPU) notify() {
-	_, _, errno := syscall.RawSyscall6(
+	_, _, errno := syscall.RawSyscall6( // escapes: no.
 		syscall.SYS_FUTEX,
 		uintptr(unsafe.Pointer(&c.state)),
 		linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD
index f7605df8a..f7feb8683 100644
--- a/pkg/sentry/platform/kvm/testutil/BUILD
+++ b/pkg/sentry/platform/kvm/testutil/BUILD
@@ -13,4 +13,5 @@ go_library(
         "testutil_arm64.s",
     ],
     visibility = ["//pkg/sentry/platform/kvm:__pkg__"],
+    deps = ["//pkg/sentry/arch"],
 )
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
index 4c108abbf..8048eedec 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
@@ -18,19 +18,20 @@ package testutil
 
 import (
 	"reflect"
-	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // TwiddleSegments reads segments into known registers.
 func TwiddleSegments()
 
 // SetTestTarget sets the rip appropriately.
-func SetTestTarget(regs *syscall.PtraceRegs, fn func()) {
+func SetTestTarget(regs *arch.Registers, fn func()) {
 	regs.Rip = uint64(reflect.ValueOf(fn).Pointer())
 }
 
 // SetTouchTarget sets rax appropriately.
-func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
+func SetTouchTarget(regs *arch.Registers, target *uintptr) {
 	if target != nil {
 		regs.Rax = uint64(reflect.ValueOf(target).Pointer())
 	} else {
@@ -39,12 +40,12 @@ func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
 }
 
 // RewindSyscall rewinds a syscall RIP.
-func RewindSyscall(regs *syscall.PtraceRegs) {
+func RewindSyscall(regs *arch.Registers) {
 	regs.Rip -= 2
 }
 
 // SetTestRegs initializes registers to known values.
-func SetTestRegs(regs *syscall.PtraceRegs) {
+func SetTestRegs(regs *arch.Registers) {
 	regs.R15 = 0x15
 	regs.R14 = 0x14
 	regs.R13 = 0x13
@@ -64,7 +65,7 @@ func SetTestRegs(regs *syscall.PtraceRegs) {
 }
 
 // CheckTestRegs checks that registers were twiddled per TwiddleRegs.
-func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) {
+func CheckTestRegs(regs *arch.Registers, full bool) (err error) {
 	if need := ^uint64(0x15); regs.R15 != need {
 		err = addRegisterMismatch(err, "R15", regs.R15, need)
 	}
@@ -121,13 +122,13 @@ var fsData uint64 = 0x55
 var gsData uint64 = 0x85
 
 // SetTestSegments initializes segments to known values.
-func SetTestSegments(regs *syscall.PtraceRegs) {
+func SetTestSegments(regs *arch.Registers) {
 	regs.Fs_base = uint64(reflect.ValueOf(&fsData).Pointer())
 	regs.Gs_base = uint64(reflect.ValueOf(&gsData).Pointer())
 }
 
 // CheckTestSegments checks that registers were twiddled per TwiddleSegments.
-func CheckTestSegments(regs *syscall.PtraceRegs) (err error) {
+func CheckTestSegments(regs *arch.Registers) (err error) {
 	if regs.Rax != fsData {
 		err = addRegisterMismatch(err, "Rax", regs.Rax, fsData)
 	}
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
index 40b2e4acc..ca902c8c1 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go
@@ -19,16 +19,17 @@ package testutil
 import (
 	"fmt"
 	"reflect"
-	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // SetTestTarget sets the rip appropriately.
-func SetTestTarget(regs *syscall.PtraceRegs, fn func()) {
+func SetTestTarget(regs *arch.Registers, fn func()) {
 	regs.Pc = uint64(reflect.ValueOf(fn).Pointer())
 }
 
 // SetTouchTarget sets rax appropriately.
-func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
+func SetTouchTarget(regs *arch.Registers, target *uintptr) {
 	if target != nil {
 		regs.Regs[8] = uint64(reflect.ValueOf(target).Pointer())
 	} else {
@@ -37,19 +38,19 @@ func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
 }
 
 // RewindSyscall rewinds a syscall RIP.
-func RewindSyscall(regs *syscall.PtraceRegs) {
+func RewindSyscall(regs *arch.Registers) {
 	regs.Pc -= 4
 }
 
 // SetTestRegs initializes registers to known values.
-func SetTestRegs(regs *syscall.PtraceRegs) {
+func SetTestRegs(regs *arch.Registers) {
 	for i := 0; i <= 30; i++ {
 		regs.Regs[i] = uint64(i) + 1
 	}
 }
 
 // CheckTestRegs checks that registers were twiddled per TwiddleRegs.
-func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) {
+func CheckTestRegs(regs *arch.Registers, full bool) (err error) {
 	for i := 0; i <= 30; i++ {
 		if need := ^uint64(i + 1); regs.Regs[i] != need {
 			err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need)
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
index 2ca696382..171513f3f 100644
--- a/pkg/sentry/platform/platform.go
+++ b/pkg/sentry/platform/platform.go
@@ -148,6 +148,9 @@ type Context interface {
 	// Interrupt interrupts a concurrent call to Switch(), causing it to return
 	// ErrContextInterrupt.
 	Interrupt()
+
+	// Release() releases any resources associated with this context.
+	Release()
 }
 
 var (
@@ -353,10 +356,28 @@ func (fr FileRange) String() string {
 	return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
 }
 
+// Requirements is used to specify platform specific requirements.
+type Requirements struct {
+	// RequiresCurrentPIDNS indicates that the sandbox has to be started in the
+	// current pid namespace.
+	RequiresCurrentPIDNS bool
+	// RequiresCapSysPtrace indicates that the sandbox has to be started with
+	// the CAP_SYS_PTRACE capability.
+	RequiresCapSysPtrace bool
+}
+
 // Constructor represents a platform type.
 type Constructor interface {
+	// New returns a new platform instance.
+	//
+	// Arguments:
+	//
+	// * deviceFile - the device file (e.g. /dev/kvm for the KVM platform).
 	New(deviceFile *os.File) (Platform, error)
 	OpenDevice() (*os.File, error)
+
+	// Requirements returns platform specific requirements.
+	Requirements() Requirements
 }
 
 // platforms contains all available platform types.
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 95abd321e..30402c2df 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -9,6 +9,7 @@ go_library(
         "ptrace.go",
         "ptrace_amd64.go",
         "ptrace_arm64.go",
+        "ptrace_arm64_unsafe.go",
         "ptrace_unsafe.go",
         "stub_amd64.s",
         "stub_arm64.s",
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index 03adb624b..08d055e05 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -177,6 +177,9 @@ func (c *context) Interrupt() {
 	c.interrupt.NotifyInterrupt()
 }
 
+// Release implements platform.Context.Release().
+func (c *context) Release() {}
+
 // PTrace represents a collection of ptrace subprocesses.
 type PTrace struct {
 	platform.MMapMinAddr
@@ -248,6 +251,16 @@ func (*constructor) OpenDevice() (*os.File, error) {
 	return nil, nil
 }
 
+// Flags implements platform.Constructor.Flags().
+func (*constructor) Requirements() platform.Requirements {
+	// TODO(b/75837838): Also set a new PID namespace so that we limit
+	// access to other host processes.
+	return platform.Requirements{
+		RequiresCapSysPtrace: true,
+		RequiresCurrentPIDNS: true,
+	}
+}
+
 func init() {
 	platform.Register("ptrace", &constructor{})
 }
diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64.go b/pkg/sentry/platform/ptrace/ptrace_amd64.go
index db0212538..3b9a870a5 100644
--- a/pkg/sentry/platform/ptrace/ptrace_amd64.go
+++ b/pkg/sentry/platform/ptrace/ptrace_amd64.go
@@ -15,9 +15,8 @@
 package ptrace
 
 import (
-	"syscall"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
@@ -28,6 +27,20 @@ func fpRegSet(useXsave bool) uintptr {
 	return linux.NT_PRFPREG
 }
 
-func stackPointer(r *syscall.PtraceRegs) uintptr {
+func stackPointer(r *arch.Registers) uintptr {
 	return uintptr(r.Rsp)
 }
+
+// x86 use the fs_base register to store the TLS pointer which can be
+// get/set in "func (t *thread) get/setRegs(regs *arch.Registers)".
+// So both of the get/setTLS() operations are noop here.
+
+// getTLS gets the thread local storage register.
+func (t *thread) getTLS(tls *uint64) error {
+	return nil
+}
+
+// setTLS sets the thread local storage register.
+func (t *thread) setTLS(tls *uint64) error {
+	return nil
+}
diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64.go b/pkg/sentry/platform/ptrace/ptrace_arm64.go
index 4db28c534..5c869926a 100644
--- a/pkg/sentry/platform/ptrace/ptrace_arm64.go
+++ b/pkg/sentry/platform/ptrace/ptrace_arm64.go
@@ -15,9 +15,8 @@
 package ptrace
 
 import (
-	"syscall"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
@@ -25,6 +24,6 @@ func fpRegSet(_ bool) uintptr {
 	return linux.NT_PRFPREG
 }
 
-func stackPointer(r *syscall.PtraceRegs) uintptr {
+func stackPointer(r *arch.Registers) uintptr {
 	return uintptr(r.Sp)
 }
diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go
new file mode 100644
index 000000000..32b8a6be9
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ptrace
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+// getTLS gets the thread local storage register.
+func (t *thread) getTLS(tls *uint64) error {
+	iovec := syscall.Iovec{
+		Base: (*byte)(unsafe.Pointer(tls)),
+		Len:  uint64(unsafe.Sizeof(*tls)),
+	}
+	_, _, errno := syscall.RawSyscall6(
+		syscall.SYS_PTRACE,
+		syscall.PTRACE_GETREGSET,
+		uintptr(t.tid),
+		linux.NT_ARM_TLS,
+		uintptr(unsafe.Pointer(&iovec)),
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// setTLS sets the thread local storage register.
+func (t *thread) setTLS(tls *uint64) error {
+	iovec := syscall.Iovec{
+		Base: (*byte)(unsafe.Pointer(tls)),
+		Len:  uint64(unsafe.Sizeof(*tls)),
+	}
+	_, _, errno := syscall.RawSyscall6(
+		syscall.SYS_PTRACE,
+		syscall.PTRACE_SETREGSET,
+		uintptr(t.tid),
+		linux.NT_ARM_TLS,
+		uintptr(unsafe.Pointer(&iovec)),
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index 6c0ed7b3e..8b72d24e8 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -24,7 +24,7 @@ import (
 )
 
 // getRegs gets the general purpose register set.
-func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
+func (t *thread) getRegs(regs *arch.Registers) error {
 	iovec := syscall.Iovec{
 		Base: (*byte)(unsafe.Pointer(regs)),
 		Len:  uint64(unsafe.Sizeof(*regs)),
@@ -43,7 +43,7 @@ func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
 }
 
 // setRegs sets the general purpose register set.
-func (t *thread) setRegs(regs *syscall.PtraceRegs) error {
+func (t *thread) setRegs(regs *arch.Registers) error {
 	iovec := syscall.Iovec{
 		Base: (*byte)(unsafe.Pointer(regs)),
 		Len:  uint64(unsafe.Sizeof(*regs)),
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 31b7cec53..2389423b0 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -63,7 +63,7 @@ type thread struct {
 	// initRegs are the initial registers for the first thread.
 	//
 	// These are used for the register set for system calls.
-	initRegs syscall.PtraceRegs
+	initRegs arch.Registers
 }
 
 // threadPool is a collection of threads.
@@ -317,7 +317,7 @@ const (
 )
 
 func (t *thread) dumpAndPanic(message string) {
-	var regs syscall.PtraceRegs
+	var regs arch.Registers
 	message += "\n"
 	if err := t.getRegs(&regs); err == nil {
 		message += dumpRegs(&regs)
@@ -332,7 +332,7 @@ func (t *thread) unexpectedStubExit() {
 	msg, err := t.getEventMessage()
 	status := syscall.WaitStatus(msg)
 	if status.Signaled() && status.Signal() == syscall.SIGKILL {
-		// SIGKILL can be only sent by an user or OOM-killer. In both
+		// SIGKILL can be only sent by a user or OOM-killer. In both
 		// these cases, we don't need to panic. There is no reasons to
 		// think that something wrong in gVisor.
 		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
@@ -423,7 +423,7 @@ func (t *thread) init() {
 // This is _not_ for use by application system calls, rather it is for use when
 // a system call must be injected into the remote context (e.g. mmap, munmap).
 // Note that clones are handled separately.
-func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
+func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
 	// Set registers.
 	if err := t.setRegs(regs); err != nil {
 		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
@@ -461,7 +461,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 // syscallIgnoreInterrupt ignores interrupts on the system call thread and
 // restarts the syscall if the kernel indicates that should happen.
 func (t *thread) syscallIgnoreInterrupt(
-	initRegs *syscall.PtraceRegs,
+	initRegs *arch.Registers,
 	sysno uintptr,
 	args ...arch.SyscallArgument) (uintptr, error) {
 	for {
@@ -506,6 +506,9 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	regs := &ac.StateData().Regs
 	t.resetSysemuRegs(regs)
 
+	// Extract TLS register
+	tls := uint64(ac.TLS())
+
 	// Check for interrupts, and ensure that future interrupts will signal t.
 	if !c.interrupt.Enable(t) {
 		// Pending interrupt; simulate.
@@ -526,6 +529,9 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	if err := t.setFPRegs(fpState, uint64(fpLen), useXsave); err != nil {
 		panic(fmt.Sprintf("ptrace set fpregs (%+v) failed: %v", fpState, err))
 	}
+	if err := t.setTLS(&tls); err != nil {
+		panic(fmt.Sprintf("ptrace set tls (%+v) failed: %v", tls, err))
+	}
 
 	for {
 		// Start running until the next system call.
@@ -555,6 +561,12 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 		if err := t.getFPRegs(fpState, uint64(fpLen), useXsave); err != nil {
 			panic(fmt.Sprintf("ptrace get fpregs failed: %v", err))
 		}
+		if err := t.getTLS(&tls); err != nil {
+			panic(fmt.Sprintf("ptrace get tls failed: %v", err))
+		}
+		if !ac.SetTLS(uintptr(tls)) {
+			panic(fmt.Sprintf("tls value %v is invalid", tls))
+		}
 
 		// Is it a system call?
 		if sig == (syscallEvent | syscall.SIGTRAP) {
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index e99798c56..84b699f0d 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -21,6 +21,7 @@ import (
 	"strings"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -40,7 +41,7 @@ const (
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
-func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+func (t *thread) resetSysemuRegs(regs *arch.Registers) {
 	regs.Cs = t.initRegs.Cs
 	regs.Ss = t.initRegs.Ss
 	regs.Ds = t.initRegs.Ds
@@ -52,7 +53,7 @@ func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
 // createSyscallRegs sets up syscall registers.
 //
 // This should be called to generate registers for a system call.
-func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
+func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers {
 	// Copy initial registers.
 	regs := *initRegs
 
@@ -81,18 +82,18 @@ func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch
 }
 
 // isSingleStepping determines if the registers indicate single-stepping.
-func isSingleStepping(regs *syscall.PtraceRegs) bool {
+func isSingleStepping(regs *arch.Registers) bool {
 	return (regs.Eflags & arch.X86TrapFlag) != 0
 }
 
 // updateSyscallRegs updates registers after finishing sysemu.
-func updateSyscallRegs(regs *syscall.PtraceRegs) {
+func updateSyscallRegs(regs *arch.Registers) {
 	// Ptrace puts -ENOSYS in rax on syscall-enter-stop.
 	regs.Rax = regs.Orig_rax
 }
 
 // syscallReturnValue extracts a sensible return from registers.
-func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
+func syscallReturnValue(regs *arch.Registers) (uintptr, error) {
 	rval := int64(regs.Rax)
 	if rval < 0 {
 		return 0, syscall.Errno(-rval)
@@ -100,7 +101,7 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
 	return uintptr(rval), nil
 }
 
-func dumpRegs(regs *syscall.PtraceRegs) string {
+func dumpRegs(regs *arch.Registers) string {
 	var m strings.Builder
 
 	fmt.Fprintf(&m, "Registers:\n")
@@ -142,7 +143,7 @@ func (t *thread) adjustInitRegsRip() {
 }
 
 // Pass the expected PPID to the child via R15 when creating stub process.
-func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
+func initChildProcessPPID(initregs *arch.Registers, ppid int32) {
 	initregs.R15 = uint64(ppid)
 	// Rbx has to be set to 1 when creating stub process.
 	initregs.Rbx = 1
@@ -155,7 +156,7 @@ func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 //
 // Note that this should only be called after verifying that the signalInfo has
 // been generated by the kernel.
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) {
 	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
 		signalInfo.Signo = int32(linux.SIGSEGV)
 
@@ -183,13 +184,76 @@ func enableCpuidFault() {
 
 // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
 // Ref attachedThread() for more detail.
-func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
-	return append(rules, seccomp.RuleSet{
-		Rules: seccomp.SyscallRules{
-			syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-				{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet {
+	rules = append(rules,
+		// Rules for trapping vsyscall access.
+		seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_GETTIMEOFDAY: {},
+				syscall.SYS_TIME:         {},
+				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
 			},
-		},
-		Action: linux.SECCOMP_RET_ALLOW,
-	})
+			Action:   linux.SECCOMP_RET_TRAP,
+			Vsyscall: true,
+		})
+	if defaultAction != linux.SECCOMP_RET_ALLOW {
+		rules = append(rules,
+			seccomp.RuleSet{
+				Rules: seccomp.SyscallRules{
+					syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+						{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+					},
+				},
+				Action: linux.SECCOMP_RET_ALLOW,
+			})
+	}
+	return rules
+}
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is dynamic
+// because kernels have be backported behavior.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+	// Create a completely new, destroyable process.
+	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
+	if err != nil {
+		panic(fmt.Sprintf("seccomp probe failed: %v", err))
+	}
+	defer t.destroy()
+
+	// Set registers to the yield system call. This call is not allowed
+	// by the filters specified in the attachThread function.
+	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+	if err := t.setRegs(&regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Attempt an emulation.
+		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
+			// Did the seccomp errno hook already run? This would
+			// indicate that seccomp is first in line and we're
+			// less than 4.8.
+			if err := t.getRegs(&regs); err != nil {
+				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+			}
+			if _, err := syscallReturnValue(&regs); err == nil {
+				// The seccomp errno mode ran first, and reset
+				// the error in the registers.
+				return false
+			}
+			// The seccomp hook did not run yet, and therefore it
+			// is safe to use RET_KILL mode for dispatched calls.
+			return true
+		}
+	}
 }
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index 7b975137f..bd618fae8 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -41,13 +41,13 @@ const (
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
-func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+func (t *thread) resetSysemuRegs(regs *arch.Registers) {
 }
 
 // createSyscallRegs sets up syscall registers.
 //
 // This should be called to generate registers for a system call.
-func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
+func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers {
 	// Copy initial registers (Pc, Sp, etc.).
 	regs := *initRegs
 
@@ -78,7 +78,7 @@ func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch
 }
 
 // isSingleStepping determines if the registers indicate single-stepping.
-func isSingleStepping(regs *syscall.PtraceRegs) bool {
+func isSingleStepping(regs *arch.Registers) bool {
 	// Refer to the ARM SDM D2.12.3: software step state machine
 	// return (regs.Pstate.SS == 1) && (MDSCR_EL1.SS == 1).
 	//
@@ -89,13 +89,13 @@ func isSingleStepping(regs *syscall.PtraceRegs) bool {
 }
 
 // updateSyscallRegs updates registers after finishing sysemu.
-func updateSyscallRegs(regs *syscall.PtraceRegs) {
+func updateSyscallRegs(regs *arch.Registers) {
 	// No special work is necessary.
 	return
 }
 
 // syscallReturnValue extracts a sensible return from registers.
-func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
+func syscallReturnValue(regs *arch.Registers) (uintptr, error) {
 	rval := int64(regs.Regs[0])
 	if rval < 0 {
 		return 0, syscall.Errno(-rval)
@@ -103,7 +103,7 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
 	return uintptr(rval), nil
 }
 
-func dumpRegs(regs *syscall.PtraceRegs) string {
+func dumpRegs(regs *arch.Registers) string {
 	var m strings.Builder
 
 	fmt.Fprintf(&m, "Registers:\n")
@@ -125,7 +125,7 @@ func (t *thread) adjustInitRegsRip() {
 }
 
 // Pass the expected PPID to the child via X7 when creating stub process
-func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
+func initChildProcessPPID(initregs *arch.Registers, ppid int32) {
 	initregs.Regs[7] = uint64(ppid)
 	// R9 has to be set to 1 when creating stub process.
 	initregs.Regs[9] = 1
@@ -138,7 +138,7 @@ func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 //
 // Note that this should only be called after verifying that the signalInfo has
 // been generated by the kernel.
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) {
 	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
 		signalInfo.Signo = int32(linux.SIGSEGV)
 
@@ -160,6 +160,15 @@ func enableCpuidFault() {
 
 // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
 // Ref attachedThread() for more detail.
-func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet {
 	return rules
 }
+
+// probeSeccomp returns true if seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8.
+//
+// On arm64, the support of PTRACE_SYSEMU was added in the 5.3 kernel, so
+// probeSeccomp can always return true.
+func probeSeccomp() bool {
+	return true
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 74968dfdf..2ce528601 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"syscall"
 
-	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
@@ -30,54 +29,6 @@ import (
 
 const syscallEvent syscall.Signal = 0x80
 
-// probeSeccomp returns true iff seccomp is run after ptrace notifications,
-// which is generally the case for kernel version >= 4.8. This check is dynamic
-// because kernels have be backported behavior.
-//
-// See createStub for more information.
-//
-// Precondition: the runtime OS thread must be locked.
-func probeSeccomp() bool {
-	// Create a completely new, destroyable process.
-	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
-	if err != nil {
-		panic(fmt.Sprintf("seccomp probe failed: %v", err))
-	}
-	defer t.destroy()
-
-	// Set registers to the yield system call. This call is not allowed
-	// by the filters specified in the attachThread function.
-	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
-	if err := t.setRegs(&regs); err != nil {
-		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
-	}
-
-	for {
-		// Attempt an emulation.
-		if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
-			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
-		}
-
-		sig := t.wait(stopped)
-		if sig == (syscallEvent | syscall.SIGTRAP) {
-			// Did the seccomp errno hook already run? This would
-			// indicate that seccomp is first in line and we're
-			// less than 4.8.
-			if err := t.getRegs(&regs); err != nil {
-				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
-			}
-			if _, err := syscallReturnValue(&regs); err == nil {
-				// The seccomp errno mode ran first, and reset
-				// the error in the registers.
-				return false
-			}
-			// The seccomp hook did not run yet, and therefore it
-			// is safe to use RET_KILL mode for dispatched calls.
-			return true
-		}
-	}
-}
-
 // createStub creates a fresh stub processes.
 //
 // Precondition: the runtime OS thread must be locked.
@@ -123,18 +74,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 	// stub and all its children. This is used to create child stubs
 	// (below), so we must include the ability to fork, but otherwise lock
 	// down available calls only to what is needed.
-	rules := []seccomp.RuleSet{
-		// Rules for trapping vsyscall access.
-		{
-			Rules: seccomp.SyscallRules{
-				syscall.SYS_GETTIMEOFDAY: {},
-				syscall.SYS_TIME:         {},
-				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
-			},
-			Action:   linux.SECCOMP_RET_TRAP,
-			Vsyscall: true,
-		},
-	}
+	rules := []seccomp.RuleSet{}
 	if defaultAction != linux.SECCOMP_RET_ALLOW {
 		rules = append(rules, seccomp.RuleSet{
 			Rules: seccomp.SyscallRules{
@@ -173,9 +113,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			},
 			Action: linux.SECCOMP_RET_ALLOW,
 		})
-
-		rules = appendArchSeccompRules(rules)
 	}
+	rules = appendArchSeccompRules(rules, defaultAction)
 	instrs, err := seccomp.BuildProgram(rules, defaultAction)
 	if err != nil {
 		return nil, err
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
index 2ae6b9f9d..0bee995e4 100644
--- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index 934b6fbcd..679b287c3 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -72,11 +72,14 @@ go_library(
         "lib_amd64.s",
         "lib_arm64.go",
         "lib_arm64.s",
+        "lib_arm64_unsafe.go",
         "ring0.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/cpuid",
+        "//pkg/safecopy",
+        "//pkg/sentry/arch",
         "//pkg/sentry/platform/ring0/pagetables",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index f6da41c27..8122ac6e2 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -27,26 +27,27 @@ const (
 	_PTE_PGT_BASE = 0x7000
 	_PTE_PGT_SIZE = 0x1000
 
-	_PSR_MODE_EL0t = 0x0
-	_PSR_MODE_EL1t = 0x4
-	_PSR_MODE_EL1h = 0x5
-	_PSR_EL_MASK   = 0xf
-
-	_PSR_D_BIT = 0x200
-	_PSR_A_BIT = 0x100
-	_PSR_I_BIT = 0x80
-	_PSR_F_BIT = 0x40
+	_PSR_D_BIT = 0x00000200
+	_PSR_A_BIT = 0x00000100
+	_PSR_I_BIT = 0x00000080
+	_PSR_F_BIT = 0x00000040
 )
 
 const (
+	// PSR bits
+	PSR_MODE_EL0t = 0x00000000
+	PSR_MODE_EL1t = 0x00000004
+	PSR_MODE_EL1h = 0x00000005
+	PSR_MODE_MASK = 0x0000000f
+
 	// KernelFlagsSet should always be set in the kernel.
-	KernelFlagsSet = _PSR_MODE_EL1h
+	KernelFlagsSet = PSR_MODE_EL1h
 
 	// UserFlagsSet are always set in userspace.
-	UserFlagsSet = _PSR_MODE_EL0t
+	UserFlagsSet = PSR_MODE_EL0t
 
-	KernelFlagsClear = _PSR_EL_MASK
-	UserFlagsClear   = _PSR_EL_MASK
+	KernelFlagsClear = PSR_MODE_MASK
+	UserFlagsClear   = PSR_MODE_MASK
 
 	PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
 )
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go
index 86fd5ed58..e6daf24df 100644
--- a/pkg/sentry/platform/ring0/defs.go
+++ b/pkg/sentry/platform/ring0/defs.go
@@ -15,8 +15,7 @@
 package ring0
 
 import (
-	"syscall"
-
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
 )
 
@@ -72,7 +71,7 @@ type CPU struct {
 
 	// registers is a set of registers; these may be used on kernel system
 	// calls and exceptions via the Registers function.
-	registers syscall.PtraceRegs
+	registers arch.Registers
 
 	// hooks are kernel hooks.
 	hooks Hooks
@@ -83,14 +82,14 @@ type CPU struct {
 // This is explicitly safe to call during KernelException and KernelSyscall.
 //
 //go:nosplit
-func (c *CPU) Registers() *syscall.PtraceRegs {
+func (c *CPU) Registers() *arch.Registers {
 	return &c.registers
 }
 
 // SwitchOpts are passed to the Switch function.
 type SwitchOpts struct {
 	// Registers are the user register state.
-	Registers *syscall.PtraceRegs
+	Registers *arch.Registers
 
 	// FloatingPointState is a byte pointer where floating point state is
 	// saved and restored.
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index 1583dda12..0e2ab716c 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -124,6 +124,12 @@ func (c *CPU) SetAppAddr(value uintptr) {
 	c.appAddr = value
 }
 
+// GetLazyVFP returns the value of cpacr_el1.
+//go:nosplit
+func (c *CPU) GetLazyVFP() (value uintptr) {
+	return c.lazyVFP
+}
+
 // SwitchArchOpts are embedded in SwitchOpts.
 type SwitchArchOpts struct {
 	// UserASID indicates that the application ASID to be used on switch,
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go
index a5ce67885..7fa43c2f5 100644
--- a/pkg/sentry/platform/ring0/entry_amd64.go
+++ b/pkg/sentry/platform/ring0/entry_amd64.go
@@ -17,7 +17,7 @@
 package ring0
 
 import (
-	"syscall"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // This is an assembly function.
@@ -41,7 +41,7 @@ func swapgs()
 // The return code is the vector that interrupted execution.
 //
 // See stubs.go for a note regarding the frame size of this function.
-func sysret(*CPU, *syscall.PtraceRegs) Vector
+func sysret(*CPU, *arch.Registers) Vector
 
 // "iret is the cadillac of CPL switching."
 //
@@ -50,7 +50,7 @@ func sysret(*CPU, *syscall.PtraceRegs) Vector
 // iret is nearly identical to sysret, except an iret is used to fully restore
 // all user state. This must be called in cases where all registers need to be
 // restored.
-func iret(*CPU, *syscall.PtraceRegs) Vector
+func iret(*CPU, *arch.Registers) Vector
 
 // exception is the generic exception entry.
 //
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index baa6c4910..db6465663 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -25,10 +25,14 @@
 // not available for calls.
 //
 
+// ERET returns using the ELR and SPSR for the current exception level.
 #define ERET() \
   WORD $0xd69f03e0
 
+// RSV_REG is a register that holds el1 information temporarily.
 #define RSV_REG 	R18_PLATFORM
+
+// RSV_REG_APP is a register that holds el0 information temporarily.
 #define RSV_REG_APP 	R9
 
 #define FPEN_NOTRAP 	0x3
@@ -36,6 +40,12 @@
 
 #define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
 
+// Saves a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not saved: R9, R18.
 #define REGISTERS_SAVE(reg, offset) \
   MOVD R0, offset+PTRACE_R0(reg); \
   MOVD R1, offset+PTRACE_R1(reg); \
@@ -67,6 +77,12 @@
   MOVD R29, offset+PTRACE_R29(reg); \
   MOVD R30, offset+PTRACE_R30(reg);
 
+// Loads a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not loaded: R9, R18.
 #define REGISTERS_LOAD(reg, offset) \
   MOVD offset+PTRACE_R0(reg), R0; \
   MOVD offset+PTRACE_R1(reg), R1; \
@@ -98,7 +114,7 @@
   MOVD offset+PTRACE_R29(reg), R29; \
   MOVD offset+PTRACE_R30(reg), R30;
 
-//NOP
+// NOP-s
 #define nop31Instructions() \
         WORD $0xd503201f; \
         WORD $0xd503201f; \
@@ -254,6 +270,7 @@
 #define ESR_ELx_WFx_ISS_WFE	(UL(1) << 0)
 #define ESR_ELx_xVC_IMM_MASK	((1UL << 16) - 1)
 
+// LOAD_KERNEL_ADDRESS loads a kernel address.
 #define LOAD_KERNEL_ADDRESS(from, to) \
 	MOVD from, to; \
 	ORR $0xffff000000000000, to, to;
@@ -263,15 +280,18 @@
 	LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \
 	MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \
 	MOVD RSV_REG, RSP; \
+	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
 	ISB $15; \
 	DSB $15;
 
+// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application.
 #define SWITCH_TO_APP_PAGETABLE(from) \
 	MOVD CPU_TTBR0_APP(from), RSV_REG; \
 	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
 	ISB $15; \
 	DSB $15;
 
+// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable.
 #define SWITCH_TO_KVM_PAGETABLE(from) \
 	MOVD CPU_TTBR0_KVM(from), RSV_REG; \
 	WORD $0xd5182012; \	//        MSR R18, TTBR0_EL1
@@ -294,6 +314,7 @@
 	WORD $0xd5181040; \ //MSR R0, CPACR_EL1
 	ISB $15;
 
+// KERNEL_ENTRY_FROM_EL0 is the entry code of the vcpu from el0 to el1.
 #define KERNEL_ENTRY_FROM_EL0 \
 	SUB $16, RSP, RSP; \		// step1, save r18, r9 into kernel temporary stack.
 	STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
@@ -315,19 +336,22 @@
 	WORD $0xd5384103; \      //  MRS SP_EL0, R3
 	MOVD R3, PTRACE_SP(RSV_REG_APP);
 
+// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1.
 #define KERNEL_ENTRY_FROM_EL1 \
 	WORD $0xd538d092; \   //MRS   TPIDR_EL1, R18
-	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// save sentry context
+	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \	// Save sentry context.
 	MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \
 	WORD $0xd5384004; \    //    MRS SPSR_EL1, R4
 	MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \
 	MRS ELR_EL1, R4; \
 	MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \
 	MOVD RSP, R4; \
-	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG);
+	MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
+	LOAD_KERNEL_STACK(RSV_REG);  // Load the temporary stack.
 
+// Halt halts execution.
 TEXT ·Halt(SB),NOSPLIT,$0
-	// clear bluepill.
+	// Clear bluepill.
 	WORD $0xd538d092   //MRS   TPIDR_EL1, R18
 	CMP RSV_REG, R9
 	BNE mmio_exit
@@ -341,8 +365,22 @@ mmio_exit:
 	// MMIO_EXIT.
 	MOVD $0, R9
 	MOVD R0, 0xffff000000001000(R9)
-	B ·kernelExitToEl1(SB)
+	RET
+
+// HaltAndResume halts execution and point the pointer to the resume function.
+TEXT ·HaltAndResume(SB),NOSPLIT,$0
+	BL ·Halt(SB)
+	B ·kernelExitToEl1(SB) // Resume.
+
+// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume.
+TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0
+	WORD $0xd538d092            // MRS TPIDR_EL1, R18
+	MOVD CPU_SELF(RSV_REG), R3  // Load vCPU.
+	MOVD R3, 8(RSP)             // First argument (vCPU).
+	CALL ·kernelSyscall(SB)     // Call the trampoline.
+	B ·kernelExitToEl1(SB)      // Resume.
 
+// Shutdown stops the guest.
 TEXT ·Shutdown(SB),NOSPLIT,$0
 	// PSCI EVENT.
 	MOVD $0x84000009, R0
@@ -356,6 +394,8 @@ TEXT ·Current(SB),NOSPLIT,$0-8
 
 #define STACK_FRAME_SIZE 16
 
+// kernelExitToEl0 is the entrypoint for application in guest_el0.
+// Prepare the vcpu environment for container application.
 TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 	// Step1, save sentry context into memory.
 	REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
@@ -426,9 +466,26 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
 
 	ERET()
 
+// kernelExitToEl1 is the entrypoint for sentry in guest_el1.
+// Prepare the vcpu environment for sentry.
 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
+	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
+
+	MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1
+	WORD $0xd5184001  //MSR R1, SPSR_EL1
+
+	MOVD CPU_REGISTERS+PTRACE_PC(RSV_REG), R1
+	MSR R1, ELR_EL1
+
+	MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1
+	MOVD R1, RSP
+
+	REGISTERS_LOAD(RSV_REG, CPU_REGISTERS)
+	MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP
+
 	ERET()
 
+// Start is the CPU entrypoint.
 TEXT ·Start(SB),NOSPLIT,$0
 	IRQ_DISABLE
 	MOVD R8, RSV_REG
@@ -437,18 +494,23 @@ TEXT ·Start(SB),NOSPLIT,$0
 
 	B ·kernelExitToEl1(SB)
 
+// El1_sync_invalid is the handler for an invalid EL1_sync.
 TEXT ·El1_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_irq_invalid is the handler for an invalid El1_irq.
 TEXT ·El1_irq_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_fiq_invalid is the handler for an invalid El1_fiq.
 TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_error_invalid is the handler for an invalid El1_error.
 TEXT ·El1_error_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_sync is the handler for El1_sync.
 TEXT ·El1_sync(SB),NOSPLIT,$0
 	KERNEL_ENTRY_FROM_EL1
 	WORD $0xd5385219        // MRS ESR_EL1, R25
@@ -484,10 +546,10 @@ el1_da:
 	MOVD $PageFault, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el1_ia:
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el1_sp_pc:
 	B ·Shutdown(SB)
@@ -496,7 +558,9 @@ el1_undef:
 	B ·Shutdown(SB)
 
 el1_svc:
-	B ·Halt(SB)
+	MOVD $0, CPU_ERROR_CODE(RSV_REG)
+	MOVD $0, CPU_ERROR_TYPE(RSV_REG)
+	B ·HaltEl1SvcAndResume(SB)
 
 el1_dbg:
 	B ·Shutdown(SB)
@@ -508,15 +572,19 @@ el1_fpsimd_acc:
 el1_invalid:
 	B ·Shutdown(SB)
 
+// El1_irq is the handler for El1_irq.
 TEXT ·El1_irq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_fiq is the handler for El1_fiq.
 TEXT ·El1_fiq(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El1_error is the handler for El1_error.
 TEXT ·El1_error(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// El0_sync is the handler for El0_sync.
 TEXT ·El0_sync(SB),NOSPLIT,$0
 	KERNEL_ENTRY_FROM_EL0
 	WORD $0xd5385219	// MRS ESR_EL1, R25
@@ -554,7 +622,7 @@ el0_svc:
 	MOVD $Syscall, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el0_da:
 	WORD $0xd538d092     //MRS   TPIDR_EL1, R18
@@ -568,7 +636,7 @@ el0_da:
 	MOVD $PageFault, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 el0_ia:
 	B ·Shutdown(SB)
@@ -613,7 +681,7 @@ TEXT ·El0_error(SB),NOSPLIT,$0
 	MOVD $VirtualizationException, R3
 	MOVD R3, CPU_VECTOR_CODE(RSV_REG)
 
-	B ·Halt(SB)
+	B ·HaltAndResume(SB)
 
 TEXT ·El0_sync_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
@@ -627,6 +695,7 @@ TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0
 TEXT ·El0_error_invalid(SB),NOSPLIT,$0
 	B ·Shutdown(SB)
 
+// Vectors implements exception vector table.
 TEXT ·Vectors(SB),NOSPLIT,$0
 	B ·El1_sync_invalid(SB)
 	nop31Instructions()
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 4cae10459..549f3d228 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -27,6 +27,7 @@ go_binary(
     visibility = ["//pkg/sentry/platform/ring0:__pkg__"],
     deps = [
         "//pkg/cpuid",
+        "//pkg/sentry/arch",
         "//pkg/sentry/platform/ring0/pagetables",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 900c0bba7..021693791 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -31,23 +31,39 @@ type defaultHooks struct{}
 
 // KernelSyscall implements Hooks.KernelSyscall.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (defaultHooks) KernelSyscall() { Halt() }
+func (defaultHooks) KernelSyscall() {
+	Halt()
+}
 
 // KernelException implements Hooks.KernelException.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (defaultHooks) KernelException(Vector) { Halt() }
+func (defaultHooks) KernelException(Vector) {
+	Halt()
+}
 
 // kernelSyscall is a trampoline.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
-func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() }
+func kernelSyscall(c *CPU) {
+	c.hooks.KernelSyscall()
+}
 
 // kernelException is a trampoline.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
-func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) }
+func kernelException(c *CPU, vector Vector) {
+	c.hooks.KernelException(vector)
+}
 
 // Init initializes a new CPU.
 //
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index 0feff8778..d37981dbf 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -178,6 +178,8 @@ func IsCanonical(addr uint64) bool {
 //
 // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
@@ -192,9 +194,9 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 
 	// Perform the switch.
 	swapgs()                                         // GS will be swapped on return.
-	WriteFS(uintptr(regs.Fs_base))                   // Set application FS.
-	WriteGS(uintptr(regs.Gs_base))                   // Set application GS.
-	LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point.
+	WriteFS(uintptr(regs.Fs_base))                   // escapes: no. Set application FS.
+	WriteGS(uintptr(regs.Gs_base))                   // escapes: no. Set application GS.
+	LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
 	jumpToKernel()                                   // Switch to upper half.
 	writeCR3(uintptr(userCR3))                       // Change to user address space.
 	if switchOpts.FullRestore {
@@ -204,8 +206,8 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	}
 	writeCR3(uintptr(kernelCR3))                     // Return to kernel address space.
 	jumpToUser()                                     // Return to lower half.
-	SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point.
-	WriteFS(uintptr(c.registers.Fs_base))            // Restore kernel FS.
+	SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point.
+	WriteFS(uintptr(c.registers.Fs_base))            // escapes: no. Restore kernel FS.
 	return
 }
 
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index c3d341998..ccacaea6b 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -16,6 +16,14 @@
 
 package ring0
 
+// HaltAndResume halts execution and point the pointer to the resume function.
+//go:nosplit
+func HaltAndResume()
+
+// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume.
+//go:nosplit
+func HaltEl1SvcAndResume()
+
 // init initializes architecture-specific state.
 func (k *Kernel) init(opts KernelOpts) {
 	// Save the root page tables.
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index af075aae4..a6345010d 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -20,13 +20,13 @@ package ring0
 func CPACREL1() (value uintptr)
 
 // FPCR returns the value of FPCR register.
-func FPCR() (value uintptr)
+func GetFPCR() (value uintptr)
 
 // SetFPCR writes the FPCR value.
 func SetFPCR(value uintptr)
 
 // FPSR returns the value of FPSR register.
-func FPSR() (value uintptr)
+func GetFPSR() (value uintptr)
 
 // SetFPSR writes the FPSR value.
 func SetFPSR(value uintptr)
@@ -37,3 +37,16 @@ func SaveVRegs(*byte)
 
 // LoadVRegs loads V0-V31 registers.
 func LoadVRegs(*byte)
+
+// GetTLS returns the value of TPIDR_EL0 register.
+func GetTLS() (value uint64)
+
+// SetTLS writes the TPIDR_EL0 value.
+func SetTLS(value uint64)
+
+// Init sets function pointers based on architectural features.
+//
+// This must be called prior to using ring0.
+func Init() {
+	rewriteVectors()
+}
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index 0e6a6235b..b63e14b41 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -15,6 +15,16 @@
 #include "funcdata.h"
 #include "textflag.h"
 
+TEXT ·GetTLS(SB),NOSPLIT,$0-8
+	MRS TPIDR_EL0, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+TEXT ·SetTLS(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R1
+	MSR R1, TPIDR_EL0
+	RET
+
 TEXT ·CPACREL1(SB),NOSPLIT,$0-8
 	WORD $0xd5381041 	// MRS CPACR_EL1, R1
 	MOVD R1, ret+0(FP)
diff --git a/pkg/sentry/platform/ring0/lib_arm64_unsafe.go b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go
new file mode 100644
index 000000000..c05166fea
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go
@@ -0,0 +1,108 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ring0
+
+import (
+	"reflect"
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/safecopy"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	nopInstruction = 0xd503201f
+	instSize       = unsafe.Sizeof(uint32(0))
+	vectorsRawLen  = 0x800
+)
+
+func unsafeSlice(addr uintptr, length int) (slice []uint32) {
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
+	hdr.Data = addr
+	hdr.Len = length / int(instSize)
+	hdr.Cap = length / int(instSize)
+	return slice
+}
+
+// Work around: move ring0.Vectors() into a specific address with 11-bits alignment.
+//
+// According to the design documentation of Arm64,
+// the start address of exception vector table should be 11-bits aligned.
+// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S
+// But, we can't align a function's start address to a specific address by using golang.
+// We have raised this question in golang community:
+// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
+// This function will be removed when golang supports this feature.
+//
+// There are 2 jobs were implemented in this function:
+// 1, move the start address of exception vector table into the specific address.
+// 2, modify the offset of each instruction.
+func rewriteVectors() {
+	vectorsBegin := reflect.ValueOf(Vectors).Pointer()
+
+	// The exception-vector-table is required to be 11-bits aligned.
+	// And the size is 0x800.
+	// Please see the documentation as reference:
+	// https://developer.arm.com/docs/100933/0100/aarch64-exception-vector-table
+	//
+	// But, golang does not allow to set a function's address to a specific value.
+	// So, for gvisor, I defined the size of exception-vector-table as 4K,
+	// filled the 2nd 2K part with NOP-s.
+	// So that, I can safely move the 1st 2K part into the address with 11-bits alignment.
+	//
+	// So, the prerequisite for this function to work correctly is:
+	// vectorsSafeLen >= 0x1000
+	// vectorsRawLen  = 0x800
+	vectorsSafeLen := int(safecopy.FindEndAddress(vectorsBegin) - vectorsBegin)
+	if vectorsSafeLen < 2*vectorsRawLen {
+		panic("Can't update vectors")
+	}
+
+	vectorsSafeTable := unsafeSlice(vectorsBegin, vectorsSafeLen) // Now a []uint32
+	vectorsRawLen32 := vectorsRawLen / int(instSize)
+
+	offset := vectorsBegin & (1<<11 - 1)
+	if offset != 0 {
+		offset = 1<<11 - offset
+	}
+
+	pageBegin := (vectorsBegin + offset) & ^uintptr(usermem.PageSize-1)
+
+	_, _, errno := syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC))
+	if errno != 0 {
+		panic(errno.Error())
+	}
+
+	offset = offset / instSize // By index, not bytes.
+	// Move exception-vector-table into the specific address, should uses memmove here.
+	for i := 1; i <= vectorsRawLen32; i++ {
+		vectorsSafeTable[int(offset)+vectorsRawLen32-i] = vectorsSafeTable[vectorsRawLen32-i]
+	}
+
+	// Adjust branch since instruction was moved forward.
+	for i := 0; i < vectorsRawLen32; i++ {
+		if vectorsSafeTable[int(offset)+i] != nopInstruction {
+			vectorsSafeTable[int(offset)+i] -= uint32(offset)
+		}
+	}
+
+	_, _, errno = syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_EXEC))
+	if errno != 0 {
+		panic(errno.Error())
+	}
+}
diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go
index 85cc3fdad..b8ab120a0 100644
--- a/pkg/sentry/platform/ring0/offsets_amd64.go
+++ b/pkg/sentry/platform/ring0/offsets_amd64.go
@@ -20,7 +20,8 @@ import (
 	"fmt"
 	"io"
 	"reflect"
-	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // Emit prints architecture-specific offsets.
@@ -64,7 +65,7 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define SyscallInt80               0x%02x\n", SyscallInt80)
 	fmt.Fprintf(w, "#define Syscall                    0x%02x\n", Syscall)
 
-	p := &syscall.PtraceRegs{}
+	p := &arch.Registers{}
 	fmt.Fprintf(w, "\n// Ptrace registers.\n")
 	fmt.Fprintf(w, "#define PTRACE_R15      0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer())
 	fmt.Fprintf(w, "#define PTRACE_R14      0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer())
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index 057fb5c69..f3de962f0 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -20,7 +20,8 @@ import (
 	"fmt"
 	"io"
 	"reflect"
-	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 // Emit prints architecture-specific offsets.
@@ -87,7 +88,7 @@ func Emit(w io.Writer) {
 	fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
 	fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
 
-	p := &syscall.PtraceRegs{}
+	p := &arch.Registers{}
 	fmt.Fprintf(w, "\n// Ptrace registers.\n")
 	fmt.Fprintf(w, "#define PTRACE_R0       0x%02x\n", reflect.ValueOf(&p.Regs[0]).Pointer()-reflect.ValueOf(p).Pointer())
 	fmt.Fprintf(w, "#define PTRACE_R1       0x%02x\n", reflect.ValueOf(&p.Regs[1]).Pointer()-reflect.ValueOf(p).Pointer())
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 971eed7fa..16d5f478b 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -7,7 +7,7 @@ go_template(
     name = "generic_walker",
     srcs = select_arch(
         amd64 = ["walker_amd64.go"],
-        arm64 = ["walker_amd64.go"],
+        arm64 = ["walker_arm64.go"],
     ),
     opt_types = [
         "Visitor",
@@ -80,6 +80,9 @@ go_library(
         "pagetables_amd64.go",
         "pagetables_arm64.go",
         "pagetables_x86.go",
+        "pcids.go",
+        "pcids_aarch64.go",
+        "pcids_aarch64.s",
         "pcids_x86.go",
         "walker_amd64.go",
         "walker_arm64.go",
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go
index 23fd5c352..8d75b7599 100644
--- a/pkg/sentry/platform/ring0/pagetables/allocator.go
+++ b/pkg/sentry/platform/ring0/pagetables/allocator.go
@@ -53,9 +53,14 @@ type RuntimeAllocator struct {
 
 // NewRuntimeAllocator returns an allocator that uses runtime allocation.
 func NewRuntimeAllocator() *RuntimeAllocator {
-	return &RuntimeAllocator{
-		used: make(map[*PTEs]struct{}),
-	}
+	r := new(RuntimeAllocator)
+	r.Init()
+	return r
+}
+
+// Init initializes a RuntimeAllocator.
+func (r *RuntimeAllocator) Init() {
+	r.used = make(map[*PTEs]struct{})
 }
 
 // Recycle returns freed pages to the pool.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 87e88e97d..7f18ac296 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -86,6 +86,8 @@ func (*mapVisitor) requiresSplit() bool { return true }
 //
 // Precondition: addr & length must be page-aligned, their sum must not overflow.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
 	if !opts.AccessType.Any() {
@@ -128,6 +130,8 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 //
 // Precondition: addr & length must be page-aligned.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
 	w := unmapWalker{
@@ -162,6 +166,8 @@ func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 //
 // Precondition: addr & length must be page-aligned.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
 	w := emptyWalker{
@@ -197,6 +203,8 @@ func (*lookupVisitor) requiresSplit() bool { return false }
 
 // Lookup returns the physical address for the given virtual address.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
 	mask := uintptr(usermem.PageSize - 1)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
index dcf061df9..157438d9b 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package pagetables
 
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids.go b/pkg/sentry/platform/ring0/pagetables/pcids.go
new file mode 100644
index 000000000..964496aac
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids.go
@@ -0,0 +1,104 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// PCIDs is a simple PCID database.
+//
+// This is not protected by locks and is thus suitable for use only with a
+// single CPU at a time.
+type PCIDs struct {
+	// mu protects below.
+	mu sync.Mutex
+
+	// cache are the assigned page tables.
+	cache map[*PageTables]uint16
+
+	// avail are available PCIDs.
+	avail []uint16
+}
+
+// NewPCIDs returns a new PCID database.
+//
+// start is the first index to assign. Typically this will be one, as the zero
+// pcid will always be flushed on transition (see pagetables_x86.go). This may
+// be more than one if specific PCIDs are reserved.
+//
+// Nil is returned iff the start and size are out of range.
+func NewPCIDs(start, size uint16) *PCIDs {
+	if start+uint16(size) > limitPCID {
+		return nil // See comment.
+	}
+	p := &PCIDs{
+		cache: make(map[*PageTables]uint16),
+	}
+	for pcid := start; pcid < start+size; pcid++ {
+		p.avail = append(p.avail, pcid)
+	}
+	return p
+}
+
+// Assign assigns a PCID to the given PageTables.
+//
+// This may overwrite any previous assignment provided. If this in the case,
+// true is returned to indicate that the PCID should be flushed.
+func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		p.mu.Unlock()
+		return pcid, false // No flush.
+	}
+
+	// Is there something available?
+	if len(p.avail) > 0 {
+		pcid := p.avail[len(p.avail)-1]
+		p.avail = p.avail[:len(p.avail)-1]
+		p.cache[pt] = pcid
+
+		// We need to flush because while this is in the available
+		// pool, it may have been used previously.
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// Evict an existing table.
+	for old, pcid := range p.cache {
+		delete(p.cache, old)
+		p.cache[pt] = pcid
+
+		// A flush is definitely required in this case, these page
+		// tables may still be active. (They will just be assigned some
+		// other PCID if and when they hit the given CPU again.)
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// No PCID.
+	p.mu.Unlock()
+	return 0, false
+}
+
+// Drop drops references to a set of page tables.
+func (p *PCIDs) Drop(pt *PageTables) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		delete(p.cache, pt)
+		p.avail = append(p.avail, pcid)
+	}
+	p.mu.Unlock()
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go
new file mode 100644
index 000000000..fbfd41d83
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package pagetables
+
+// limitPCID is the maximum value of PCIDs.
+//
+// In VMSAv8-64, the PCID(ASID) size is an IMPLEMENTATION DEFINED choice
+// of 8 bits or 16 bits, and ID_AA64MMFR0_EL1.ASIDBits identifies the
+// supported size. When an implementation supports a 16-bit ASID, TCR_ELx.AS
+// selects whether the top 8 bits of the ASID are used.
+var limitPCID uint16
+
+// GetASIDBits return the system ASID bits, 8 or 16 bits.
+func GetASIDBits() uint8
+
+func init() {
+	limitPCID = uint16(1)<<GetASIDBits() - 1
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s
new file mode 100644
index 000000000..e9d62d768
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+#include "funcdata.h"
+#include "textflag.h"
+
+#define ID_AA64MMFR0_ASIDBITS_SHIFT 4
+#define ID_AA64MMFR0_ASIDBITS_16 2
+#define TCR_EL1_AS_BIT 36
+
+// GetASIDBits return the system ASID bits, 8 or 16 bits.
+//
+// func GetASIDBits() uint8
+TEXT ·GetASIDBits(SB),NOSPLIT,$0-1
+	// First, check whether 16bits ASID is supported.
+	// ID_AA64MMFR0_EL1.ASIDBITS[7:4] == 0010.
+	WORD $0xd5380700    // MRS ID_AA64MMFR0_EL1, R0
+	UBFX $ID_AA64MMFR0_ASIDBITS_SHIFT, R0, $4, R0
+	CMPW $ID_AA64MMFR0_ASIDBITS_16, R0
+	BNE bits_8
+
+	// Second, check whether 16bits ASID is enabled.
+	// TCR_EL1.AS[36] == 1.
+	WORD $0xd5382040    // MRS TCR_EL1, R0
+	TBZ  $TCR_EL1_AS_BIT, R0, bits_8
+	MOVD $16, R0
+	B done
+bits_8:
+	MOVD $8, R0
+done:
+	MOVB R0, ret+0(FP)
+	RET
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
index e199bae18..91fc5e8dd 100644
--- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,94 +16,5 @@
 
 package pagetables
 
-import (
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// limitPCID is the number of valid PCIDs.
-const limitPCID = 4096
-
-// PCIDs is a simple PCID database.
-//
-// This is not protected by locks and is thus suitable for use only with a
-// single CPU at a time.
-type PCIDs struct {
-	// mu protects below.
-	mu sync.Mutex
-
-	// cache are the assigned page tables.
-	cache map[*PageTables]uint16
-
-	// avail are available PCIDs.
-	avail []uint16
-}
-
-// NewPCIDs returns a new PCID database.
-//
-// start is the first index to assign. Typically this will be one, as the zero
-// pcid will always be flushed on transition (see pagetables_x86.go). This may
-// be more than one if specific PCIDs are reserved.
-//
-// Nil is returned iff the start and size are out of range.
-func NewPCIDs(start, size uint16) *PCIDs {
-	if start+uint16(size) >= limitPCID {
-		return nil // See comment.
-	}
-	p := &PCIDs{
-		cache: make(map[*PageTables]uint16),
-	}
-	for pcid := start; pcid < start+size; pcid++ {
-		p.avail = append(p.avail, pcid)
-	}
-	return p
-}
-
-// Assign assigns a PCID to the given PageTables.
-//
-// This may overwrite any previous assignment provided. If this in the case,
-// true is returned to indicate that the PCID should be flushed.
-func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
-	p.mu.Lock()
-	if pcid, ok := p.cache[pt]; ok {
-		p.mu.Unlock()
-		return pcid, false // No flush.
-	}
-
-	// Is there something available?
-	if len(p.avail) > 0 {
-		pcid := p.avail[len(p.avail)-1]
-		p.avail = p.avail[:len(p.avail)-1]
-		p.cache[pt] = pcid
-
-		// We need to flush because while this is in the available
-		// pool, it may have been used previously.
-		p.mu.Unlock()
-		return pcid, true
-	}
-
-	// Evict an existing table.
-	for old, pcid := range p.cache {
-		delete(p.cache, old)
-		p.cache[pt] = pcid
-
-		// A flush is definitely required in this case, these page
-		// tables may still be active. (They will just be assigned some
-		// other PCID if and when they hit the given CPU again.)
-		p.mu.Unlock()
-		return pcid, true
-	}
-
-	// No PCID.
-	p.mu.Unlock()
-	return 0, false
-}
-
-// Drop drops references to a set of page tables.
-func (p *PCIDs) Drop(pt *PageTables) {
-	p.mu.Lock()
-	if pcid, ok := p.cache[pt]; ok {
-		delete(p.cache, pt)
-		p.avail = append(p.avail, pcid)
-	}
-	p.mu.Unlock()
-}
+// limitPCID is the maximum value of valid PCIDs.
+const limitPCID = 4095
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
index 5f80d64e8..9da0ea685 100644
--- a/pkg/sentry/platform/ring0/x86.go
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package ring0
 
diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go
index ba1f9043d..83195d5a1 100644
--- a/pkg/sentry/sighandling/sighandling.go
+++ b/pkg/sentry/sighandling/sighandling.go
@@ -85,6 +85,11 @@ func StartSignalForwarding(handler func(linux.Signal)) func() {
 	for sig := 1; sig <= numSignals+1; sig++ {
 		sigchan := make(chan os.Signal, 1)
 		sigchans = append(sigchans, sigchan)
+
+		// SIGURG is used by Go's runtime scheduler.
+		if sig == int(linux.SIGURG) {
+			continue
+		}
 		signal.Notify(sigchan, syscall.Signal(sig))
 	}
 	// Start up our listener.
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 611fa22c3..c40c6d673 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -16,6 +16,7 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/usermem",
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 4d42d29cb..ca16d0381 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -4,7 +4,10 @@ package(licenses = ["notice"])
 
 go_library(
     name = "control",
-    srcs = ["control.go"],
+    srcs = [
+        "control.go",
+        "control_vfs2.go",
+    ],
     imports = [
         "gvisor.dev/gvisor/pkg/sentry/fs",
     ],
@@ -18,6 +21,7 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/usermem",
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4667373d2..8b439a078 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -41,6 +41,8 @@ type SCMCredentials interface {
 	Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID)
 }
 
+// LINT.IfChange
+
 // SCMRights represents a SCM_RIGHTS socket control message.
 type SCMRights interface {
 	transport.RightsControlMessage
@@ -142,6 +144,8 @@ func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flag
 	return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds)
 }
 
+// LINT.ThenChange(./control_vfs2.go)
+
 // scmCredentials represents an SCM_CREDENTIALS socket control message.
 //
 // +stateify savable
@@ -329,7 +333,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 }
 
 // PackTClass packs an IPV6_TCLASS socket control message.
-func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
+func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IPV6,
@@ -537,11 +541,19 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 	}
 
 	if len(fds) > 0 {
-		rights, err := NewSCMRights(t, fds)
-		if err != nil {
-			return socket.ControlMessages{}, err
+		if kernel.VFS2Enabled {
+			rights, err := NewSCMRightsVFS2(t, fds)
+			if err != nil {
+				return socket.ControlMessages{}, err
+			}
+			cmsgs.Unix.Rights = rights
+		} else {
+			rights, err := NewSCMRights(t, fds)
+			if err != nil {
+				return socket.ControlMessages{}, err
+			}
+			cmsgs.Unix.Rights = rights
 		}
-		cmsgs.Unix.Rights = rights
 	}
 
 	return cmsgs, nil
@@ -566,6 +578,8 @@ func MakeCreds(t *kernel.Task) SCMCredentials {
 	return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID}
 }
 
+// LINT.IfChange
+
 // New creates default control messages if needed.
 func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages {
 	return transport.ControlMessages{
@@ -573,3 +587,5 @@ func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transpo
 		Rights:      rights,
 	}
 }
+
+// LINT.ThenChange(./control_vfs2.go)
diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go
new file mode 100644
index 000000000..fd08179be
--- /dev/null
+++ b/pkg/sentry/socket/control/control_vfs2.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package control
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SCMRightsVFS2 represents a SCM_RIGHTS socket control message.
+type SCMRightsVFS2 interface {
+	transport.RightsControlMessage
+
+	// Files returns up to max RightsFiles.
+	//
+	// Returned files are consumed and ownership is transferred to the caller.
+	// Subsequent calls to Files will return the next files.
+	Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool)
+}
+
+// RightsFiles represents a SCM_RIGHTS socket control message. A reference is
+// maintained for each vfs.FileDescription and is release either when an FD is created or
+// when the Release method is called.
+type RightsFilesVFS2 []*vfs.FileDescription
+
+// NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message
+// representation using local sentry FDs.
+func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) {
+	files := make(RightsFilesVFS2, 0, len(fds))
+	for _, fd := range fds {
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			files.Release()
+			return nil, syserror.EBADF
+		}
+		files = append(files, file)
+	}
+	return &files, nil
+}
+
+// Files implements SCMRights.Files.
+func (fs *RightsFilesVFS2) Files(ctx context.Context, max int) (RightsFilesVFS2, bool) {
+	n := max
+	var trunc bool
+	if l := len(*fs); n > l {
+		n = l
+	} else if n < l {
+		trunc = true
+	}
+	rf := (*fs)[:n]
+	*fs = (*fs)[n:]
+	return rf, trunc
+}
+
+// Clone implements transport.RightsControlMessage.Clone.
+func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage {
+	nfs := append(RightsFilesVFS2(nil), *fs...)
+	for _, nf := range nfs {
+		nf.IncRef()
+	}
+	return &nfs
+}
+
+// Release implements transport.RightsControlMessage.Release.
+func (fs *RightsFilesVFS2) Release() {
+	for _, f := range *fs {
+		f.DecRef()
+	}
+	*fs = nil
+}
+
+// rightsFDsVFS2 gets up to the specified maximum number of FDs.
+func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) ([]int32, bool) {
+	files, trunc := rights.Files(t, max)
+	fds := make([]int32, 0, len(files))
+	for i := 0; i < max && len(files) > 0; i++ {
+		fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{
+			CloseOnExec: cloexec,
+		})
+		files[0].DecRef()
+		files = files[1:]
+		if err != nil {
+			t.Warningf("Error inserting FD: %v", err)
+			// This is what Linux does.
+			break
+		}
+
+		fds = append(fds, int32(fd))
+	}
+	return fds, trunc
+}
+
+// PackRightsVFS2 packs as many FDs as will fit into the unused capacity of buf.
+func PackRightsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, buf []byte, flags int) ([]byte, int) {
+	maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4
+	// Linux does not return any FDs if none fit.
+	if maxFDs <= 0 {
+		flags |= linux.MSG_CTRUNC
+		return buf, flags
+	}
+	fds, trunc := rightsFDsVFS2(t, rights, cloexec, maxFDs)
+	if trunc {
+		flags |= linux.MSG_CTRUNC
+	}
+	align := t.Arch().Width()
+	return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds)
+}
+
+// NewVFS2 creates default control messages if needed.
+func NewVFS2(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRightsVFS2) transport.ControlMessages {
+	return transport.ControlMessages{
+		Credentials: makeCreds(t, socketOrEndpoint),
+		Rights:      rights,
+	}
+}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 023bad156..e82d6cd1e 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -10,6 +10,7 @@ go_library(
         "save_restore.go",
         "socket.go",
         "socket_unsafe.go",
+        "socket_vfs2.go",
         "sockopt_impl.go",
         "stack.go",
     ],
@@ -25,11 +26,14 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsimpl/sockfs",
+        "//pkg/sentry/hostfd",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/control",
+        "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip/stack",
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 22f78d2e2..c11e82c10 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -49,6 +50,8 @@ const (
 	maxControlLen = 1024
 )
 
+// LINT.IfChange
+
 // socketOperations implements fs.FileOperations and socket.Socket for a socket
 // implemented using a host socket.
 type socketOperations struct {
@@ -59,23 +62,37 @@ type socketOperations struct {
 	fsutil.FileNoSplice             `state:"nosave"`
 	fsutil.FileNoopFlush            `state:"nosave"`
 	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	socketOpsCommon
+}
+
+// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
+//
+// +stateify savable
+type socketOpsCommon struct {
 	socket.SendReceiveTimeout
 
 	family   int            // Read-only.
 	stype    linux.SockType // Read-only.
 	protocol int            // Read-only.
-	fd       int            // must be O_NONBLOCK
 	queue    waiter.Queue
+
+	// fd is the host socket fd. It must have O_NONBLOCK, so that operations
+	// will return EWOULDBLOCK instead of blocking on the host. This allows us to
+	// handle blocking behavior independently in the sentry.
+	fd int
 }
 
 var _ = socket.Socket(&socketOperations{})
 
 func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) {
 	s := &socketOperations{
-		family:   family,
-		stype:    stype,
-		protocol: protocol,
-		fd:       fd,
+		socketOpsCommon: socketOpsCommon{
+			family:   family,
+			stype:    stype,
+			protocol: protocol,
+			fd:       fd,
+		},
 	}
 	if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
 		return nil, syserr.FromError(err)
@@ -86,28 +103,33 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc
 }
 
 // Release implements fs.FileOperations.Release.
-func (s *socketOperations) Release() {
+func (s *socketOpsCommon) Release() {
 	fdnotifier.RemoveFD(int32(s.fd))
 	syscall.Close(s.fd)
 }
 
 // Readiness implements waiter.Waitable.Readiness.
-func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return fdnotifier.NonBlockingPoll(int32(s.fd), mask)
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
-func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 	s.queue.EventRegister(e, mask)
 	fdnotifier.UpdateFD(int32(s.fd))
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
-func (s *socketOperations) EventUnregister(e *waiter.Entry) {
+func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
 	s.queue.EventUnregister(e)
 	fdnotifier.UpdateFD(int32(s.fd))
 }
 
+// Ioctl implements fs.FileOperations.Ioctl.
+func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return ioctl(ctx, s.fd, io, args)
+}
+
 // Read implements fs.FileOperations.Read.
 func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
 	n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
@@ -155,7 +177,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 }
 
 // Connect implements socket.Socket.Connect.
-func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
+func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
 	if len(sockaddr) > sizeofSockaddr {
 		sockaddr = sockaddr[:sizeofSockaddr]
 	}
@@ -195,7 +217,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 }
 
 // Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
 	var peerAddr linux.SockAddr
 	var peerAddrBuf []byte
 	var peerAddrlen uint32
@@ -209,7 +231,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 	}
 
 	// Conservatively ignore all flags specified by the application and add
-	// SOCK_NONBLOCK since socketOperations requires it.
+	// SOCK_NONBLOCK since socketOpsCommon requires it.
 	fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC)
 	if blocking {
 		var ch chan struct{}
@@ -235,23 +257,41 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
 		return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr)
 	}
 
-	f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0)
-	if err != nil {
-		syscall.Close(fd)
-		return 0, nil, 0, err
-	}
-	defer f.DecRef()
+	var (
+		kfd  int32
+		kerr error
+	)
+	if kernel.VFS2Enabled {
+		f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&syscall.SOCK_NONBLOCK))
+		if err != nil {
+			syscall.Close(fd)
+			return 0, nil, 0, err
+		}
+		defer f.DecRef()
 
-	kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{
-		CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
-	})
-	t.Kernel().RecordSocket(f)
+		kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{
+			CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
+		})
+		t.Kernel().RecordSocketVFS2(f)
+	} else {
+		f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0)
+		if err != nil {
+			syscall.Close(fd)
+			return 0, nil, 0, err
+		}
+		defer f.DecRef()
+
+		kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{
+			CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
+		})
+		t.Kernel().RecordSocket(f)
+	}
 
 	return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr)
 }
 
 // Bind implements socket.Socket.Bind.
-func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 	if len(sockaddr) > sizeofSockaddr {
 		sockaddr = sockaddr[:sizeofSockaddr]
 	}
@@ -264,12 +304,12 @@ func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 }
 
 // Listen implements socket.Socket.Listen.
-func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error {
+func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 	return syserr.FromError(syscall.Listen(s.fd, backlog))
 }
 
 // Shutdown implements socket.Socket.Shutdown.
-func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
+func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
 	switch how {
 	case syscall.SHUT_RD, syscall.SHUT_WR, syscall.SHUT_RDWR:
 		return syserr.FromError(syscall.Shutdown(s.fd, how))
@@ -279,7 +319,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
 }
 
 // GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
 	if outLen < 0 {
 		return nil, syserr.ErrInvalidArgument
 	}
@@ -328,7 +368,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 }
 
 // SetSockOpt implements socket.Socket.SetSockOpt.
-func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
+func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
 	// Whitelist options and constrain option length.
 	optlen := setSockOptLen(t, level, name)
 	switch level {
@@ -374,7 +414,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 }
 
 // RecvMsg implements socket.Socket.RecvMsg.
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
 	// Whitelist flags.
 	//
 	// FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary
@@ -496,7 +536,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 }
 
 // SendMsg implements socket.Socket.SendMsg.
-func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
+func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
 	// Whitelist flags.
 	if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 {
 		return 0, syserr.ErrInvalidArgument
@@ -515,7 +555,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 		if uint64(src.NumBytes()) != srcs.NumBytes() {
 			return 0, nil
 		}
-		if srcs.IsEmpty() {
+		if srcs.IsEmpty() && len(controlBuf) == 0 {
 			return 0, nil
 		}
 
@@ -585,7 +625,7 @@ func translateIOSyscallError(err error) error {
 }
 
 // State implements socket.Socket.State.
-func (s *socketOperations) State() uint32 {
+func (s *socketOpsCommon) State() uint32 {
 	info := linux.TCPInfo{}
 	buf, err := getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, linux.SizeOfTCPInfo)
 	if err != nil {
@@ -607,7 +647,7 @@ func (s *socketOperations) State() uint32 {
 }
 
 // Type implements socket.Socket.Type.
-func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) {
+func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
 	return s.family, s.stype, s.protocol
 }
 
@@ -663,8 +703,11 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int
 	return nil, nil, nil
 }
 
+// LINT.ThenChange(./socket_vfs2.go)
+
 func init() {
 	for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} {
 		socket.RegisterProvider(family, &socketProvider{family})
+		socket.RegisterProviderVFS2(family, &socketProviderVFS2{})
 	}
 }
diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go
index cd67234d2..3f420c2ec 100644
--- a/pkg/sentry/socket/hostinet/socket_unsafe.go
+++ b/pkg/sentry/socket/hostinet/socket_unsafe.go
@@ -21,7 +21,6 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/syserr"
@@ -54,12 +53,11 @@ func writev(fd int, srcs []syscall.Iovec) (uint64, error) {
 	return uint64(n), nil
 }
 
-// Ioctl implements fs.FileOperations.Ioctl.
-func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func ioctl(ctx context.Context, fd int, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	switch cmd := uintptr(args[1].Int()); cmd {
 	case syscall.TIOCINQ, syscall.TIOCOUTQ:
 		var val int32
-		if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s.fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 {
+		if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 {
 			return 0, translateIOSyscallError(errno)
 		}
 		var buf [4]byte
@@ -93,7 +91,7 @@ func getsockopt(fd int, level, name int, optlen int) ([]byte, error) {
 }
 
 // GetSockName implements socket.Socket.GetSockName.
-func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr := make([]byte, sizeofSockaddr)
 	addrlen := uint32(len(addr))
 	_, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen)))
@@ -104,7 +102,7 @@ func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32,
 }
 
 // GetPeerName implements socket.Socket.GetPeerName.
-func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr := make([]byte, sizeofSockaddr)
 	addrlen := uint32(len(addr))
 	_, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen)))
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
new file mode 100644
index 000000000..677743113
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -0,0 +1,184 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostinet
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+type socketVFS2 struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	// We store metadata for hostinet sockets internally. Technically, we should
+	// access metadata (e.g. through stat, chmod) on the host for correctness,
+	// but this is not very useful for inet socket fds, which do not belong to a
+	// concrete file anyway.
+	vfs.DentryMetadataFileDescriptionImpl
+
+	socketOpsCommon
+}
+
+var _ = socket.SocketVFS2(&socketVFS2{})
+
+func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) {
+	mnt := t.Kernel().SocketMount()
+	d := sockfs.NewDentry(t.Credentials(), mnt)
+
+	s := &socketVFS2{
+		socketOpsCommon: socketOpsCommon{
+			family:   family,
+			stype:    stype,
+			protocol: protocol,
+			fd:       fd,
+		},
+	}
+	if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
+		return nil, syserr.FromError(err)
+	}
+	vfsfd := &s.vfsfd
+	if err := vfsfd.Init(s, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, syserr.FromError(err)
+	}
+	return vfsfd, nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return s.socketOpsCommon.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (s *socketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	s.socketOpsCommon.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (s *socketVFS2) EventUnregister(e *waiter.Entry) {
+	s.socketOpsCommon.EventUnregister(e)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return ioctl(ctx, s.fd, uio, args)
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Read implements vfs.FileDescriptionImpl.
+func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags)
+	n, err := dst.CopyOutFrom(ctx, reader)
+	hostfd.PutReadWriterAt(reader)
+	return int64(n), err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (s *socketVFS2) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags)
+	n, err := src.CopyInTo(ctx, writer)
+	hostfd.PutReadWriterAt(writer)
+	return int64(n), err
+}
+
+type socketProviderVFS2 struct {
+	family int
+}
+
+// Socket implements socket.ProviderVFS2.Socket.
+func (p *socketProviderVFS2) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+	// Check that we are using the host network stack.
+	stack := t.NetworkContext()
+	if stack == nil {
+		return nil, nil
+	}
+	if _, ok := stack.(*Stack); !ok {
+		return nil, nil
+	}
+
+	// Only accept TCP and UDP.
+	stype := stypeflags & linux.SOCK_TYPE_MASK
+	switch stype {
+	case syscall.SOCK_STREAM:
+		switch protocol {
+		case 0, syscall.IPPROTO_TCP:
+			// ok
+		default:
+			return nil, nil
+		}
+	case syscall.SOCK_DGRAM:
+		switch protocol {
+		case 0, syscall.IPPROTO_UDP:
+			// ok
+		default:
+			return nil, nil
+		}
+	default:
+		return nil, nil
+	}
+
+	// Conservatively ignore all flags specified by the application and add
+	// SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0
+	// to simplify the syscall filters, since 0 and IPPROTO_* are equivalent.
+	fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		return nil, syserr.FromError(err)
+	}
+	return newVFS2Socket(t, p.family, stype, protocol, fd, uint32(stypeflags&syscall.SOCK_NONBLOCK))
+}
+
+// Pair implements socket.Provider.Pair.
+func (p *socketProviderVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+	// Not supported by AF_INET/AF_INET6.
+	return nil, nil, nil
+}
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index c91ec7494..721094bbf 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -7,6 +7,8 @@ go_library(
     srcs = [
         "extensions.go",
         "netfilter.go",
+        "owner_matcher.go",
+        "targets.go",
         "tcp_matcher.go",
         "udp_matcher.go",
     ],
@@ -21,7 +23,6 @@ go_library(
         "//pkg/syserr",
         "//pkg/tcpip",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index b4b244abf..0336a32d8 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -19,7 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -37,12 +37,12 @@ type matchMaker interface {
 	// name is the matcher name as stored in the xt_entry_match struct.
 	name() string
 
-	// marshal converts from an iptables.Matcher to an ABI struct.
-	marshal(matcher iptables.Matcher) []byte
+	// marshal converts from an stack.Matcher to an ABI struct.
+	marshal(matcher stack.Matcher) []byte
 
 	// unmarshal converts from the ABI matcher struct to an
-	// iptables.Matcher.
-	unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error)
+	// stack.Matcher.
+	unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error)
 }
 
 // matchMakers maps the name of supported matchers to the matchMaker that
@@ -58,7 +58,7 @@ func registerMatchMaker(mm matchMaker) {
 	matchMakers[mm.name()] = mm
 }
 
-func marshalMatcher(matcher iptables.Matcher) []byte {
+func marshalMatcher(matcher stack.Matcher) []byte {
 	matchMaker, ok := matchMakers[matcher.Name()]
 	if !ok {
 		panic(fmt.Sprintf("Unknown matcher of type %T.", matcher))
@@ -86,7 +86,7 @@ func marshalEntryMatch(name string, data []byte) []byte {
 	return append(buf, make([]byte, size-len(buf))...)
 }
 
-func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) {
+func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) {
 	matchMaker, ok := matchMakers[match.Name.String()]
 	if !ok {
 		return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String())
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3fc80e0de..66015e2bc 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -17,6 +17,7 @@
 package netfilter
 
 import (
+	"bytes"
 	"errors"
 	"fmt"
 
@@ -26,7 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -35,6 +36,11 @@ import (
 // shouldn't be reached - an error has occurred if we fall through to one.
 const errorTargetName = "ERROR"
 
+// redirectTargetName is used to mark targets as redirect targets. Redirect
+// targets should be reached for only NAT and Mangle tables. These targets will
+// change the destination port/destination IP for packets.
+const redirectTargetName = "REDIRECT"
+
 // Metadata is used to verify that we are correctly serializing and
 // deserializing iptables into structs consumable by the iptables tool. We save
 // a metadata struct when the tables are written, and when they are read out we
@@ -48,9 +54,23 @@ type metadata struct {
 	Size       uint32
 }
 
+// enableLogging controls whether to log the (de)serialization of netfilter
+// structs between userspace and netstack. These logs are useful when
+// developing iptables, but can pollute sentry logs otherwise.
+const enableLogging = false
+
+// emptyFilter is for comparison with a rule's filters to determine whether it
+// is also empty. It is immutable.
+var emptyFilter = stack.IPHeaderFilter{
+	Dst:     "\x00\x00\x00\x00",
+	DstMask: "\x00\x00\x00\x00",
+	Src:     "\x00\x00\x00\x00",
+	SrcMask: "\x00\x00\x00\x00",
+}
+
 // nflog logs messages related to the writing and reading of iptables.
 func nflog(format string, args ...interface{}) {
-	if log.IsLogging(log.Debug) {
+	if enableLogging && log.IsLogging(log.Debug) {
 		log.Debugf("netfilter: "+format, args...)
 	}
 }
@@ -123,39 +143,35 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 	return entries, nil
 }
 
-func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) {
-	ipt := stack.IPTables()
-	table, ok := ipt.Tables[tablename.String()]
+func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) {
+	table, ok := stk.IPTables().GetTable(tablename.String())
 	if !ok {
-		return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename)
+		return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename)
 	}
 	return table, nil
 }
 
-// FillDefaultIPTables sets stack's IPTables to the default tables and
-// populates them with metadata.
-func FillDefaultIPTables(stack *stack.Stack) {
-	ipt := iptables.DefaultTables()
-
-	// In order to fill in the metadata, we have to translate ipt from its
-	// netstack format to Linux's giant-binary-blob format.
-	for name, table := range ipt.Tables {
-		_, metadata, err := convertNetstackToBinary(name, table)
-		if err != nil {
-			panic(fmt.Errorf("Unable to set default IP tables: %v", err))
+// FillIPTablesMetadata populates stack's IPTables with metadata.
+func FillIPTablesMetadata(stk *stack.Stack) {
+	stk.IPTables().ModifyTables(func(tables map[string]stack.Table) {
+		// In order to fill in the metadata, we have to translate ipt from its
+		// netstack format to Linux's giant-binary-blob format.
+		for name, table := range tables {
+			_, metadata, err := convertNetstackToBinary(name, table)
+			if err != nil {
+				panic(fmt.Errorf("Unable to set default IP tables: %v", err))
+			}
+			table.SetMetadata(metadata)
+			tables[name] = table
 		}
-		table.SetMetadata(metadata)
-		ipt.Tables[name] = table
-	}
-
-	stack.SetIPTables(ipt)
+	})
 }
 
 // convertNetstackToBinary converts the iptables as stored in netstack to the
 // format expected by the iptables tool. Linux stores each table as a binary
 // blob that can only be traversed by parsing a bit, reading some offsets,
 // jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) {
+func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelIPTGetEntries, metadata, error) {
 	// Return values.
 	var entries linux.KernelIPTGetEntries
 	var meta metadata
@@ -194,6 +210,21 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 				TargetOffset: linux.SizeOfIPTEntry,
 			},
 		}
+		copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst)
+		copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask)
+		copy(entry.IPTEntry.IP.Src[:], rule.Filter.Src)
+		copy(entry.IPTEntry.IP.SrcMask[:], rule.Filter.SrcMask)
+		copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface)
+		copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
+		if rule.Filter.DstInvert {
+			entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP
+		}
+		if rule.Filter.SrcInvert {
+			entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_SRCIP
+		}
+		if rule.Filter.OutputInterfaceInvert {
+			entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
+		}
 
 		for _, matcher := range rule.Matchers {
 			// Serialize the matcher and add it to the
@@ -228,25 +259,29 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern
 	return entries, meta, nil
 }
 
-func marshalTarget(target iptables.Target) []byte {
+func marshalTarget(target stack.Target) []byte {
 	switch tg := target.(type) {
-	case iptables.AcceptTarget:
-		return marshalStandardTarget(iptables.RuleAccept)
-	case iptables.DropTarget:
-		return marshalStandardTarget(iptables.RuleDrop)
-	case iptables.ErrorTarget:
+	case stack.AcceptTarget:
+		return marshalStandardTarget(stack.RuleAccept)
+	case stack.DropTarget:
+		return marshalStandardTarget(stack.RuleDrop)
+	case stack.ErrorTarget:
 		return marshalErrorTarget(errorTargetName)
-	case iptables.UserChainTarget:
+	case stack.UserChainTarget:
 		return marshalErrorTarget(tg.Name)
-	case iptables.ReturnTarget:
-		return marshalStandardTarget(iptables.RuleReturn)
+	case stack.ReturnTarget:
+		return marshalStandardTarget(stack.RuleReturn)
+	case stack.RedirectTarget:
+		return marshalRedirectTarget(tg)
+	case JumpTarget:
+		return marshalJumpTarget(tg)
 	default:
 		panic(fmt.Errorf("unknown target of type %T", target))
 	}
 }
 
-func marshalStandardTarget(verdict iptables.RuleVerdict) []byte {
-	nflog("convert to binary: marshalling standard target with size %d", linux.SizeOfXTStandardTarget)
+func marshalStandardTarget(verdict stack.RuleVerdict) []byte {
+	nflog("convert to binary: marshalling standard target")
 
 	// The target's name will be the empty string.
 	target := linux.XTStandardTarget{
@@ -274,15 +309,55 @@ func marshalErrorTarget(errorName string) []byte {
 	return binary.Marshal(ret, usermem.ByteOrder, target)
 }
 
+func marshalRedirectTarget(rt stack.RedirectTarget) []byte {
+	// This is a redirect target named redirect
+	target := linux.XTRedirectTarget{
+		Target: linux.XTEntryTarget{
+			TargetSize: linux.SizeOfXTRedirectTarget,
+		},
+	}
+	copy(target.Target.Name[:], redirectTargetName)
+
+	ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
+	target.NfRange.RangeSize = 1
+	if rt.RangeProtoSpecified {
+		target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED
+	}
+	// Convert port from little endian to big endian.
+	port := make([]byte, 2)
+	binary.LittleEndian.PutUint16(port, rt.MinPort)
+	target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port)
+	binary.LittleEndian.PutUint16(port, rt.MaxPort)
+	target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+func marshalJumpTarget(jt JumpTarget) []byte {
+	nflog("convert to binary: marshalling jump target")
+
+	// The target's name will be the empty string.
+	target := linux.XTStandardTarget{
+		Target: linux.XTEntryTarget{
+			TargetSize: linux.SizeOfXTStandardTarget,
+		},
+		// Verdict is overloaded by the ABI. When positive, it holds
+		// the jump offset from the start of the table.
+		Verdict: int32(jt.Offset),
+	}
+
+	ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+	return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
 // translateFromStandardVerdict translates verdicts the same way as the iptables
 // tool.
-func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
+func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 {
 	switch verdict {
-	case iptables.RuleAccept:
+	case stack.RuleAccept:
 		return -linux.NF_ACCEPT - 1
-	case iptables.RuleDrop:
+	case stack.RuleDrop:
 		return -linux.NF_DROP - 1
-	case iptables.RuleReturn:
+	case stack.RuleReturn:
 		return linux.NF_RETURN
 	default:
 		// TODO(gvisor.dev/issue/170): Support Jump.
@@ -291,18 +366,18 @@ func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 {
 }
 
 // translateToStandardTarget translates from the value in a
-// linux.XTStandardTarget to an iptables.Verdict.
-func translateToStandardTarget(val int32) (iptables.Target, error) {
+// linux.XTStandardTarget to an stack.Verdict.
+func translateToStandardTarget(val int32) (stack.Target, error) {
 	// TODO(gvisor.dev/issue/170): Support other verdicts.
 	switch val {
 	case -linux.NF_ACCEPT - 1:
-		return iptables.AcceptTarget{}, nil
+		return stack.AcceptTarget{}, nil
 	case -linux.NF_DROP - 1:
-		return iptables.DropTarget{}, nil
+		return stack.DropTarget{}, nil
 	case -linux.NF_QUEUE - 1:
 		return nil, errors.New("unsupported iptables verdict QUEUE")
 	case linux.NF_RETURN:
-		return iptables.ReturnTarget{}, nil
+		return stack.ReturnTarget{}, nil
 	default:
 		return nil, fmt.Errorf("unknown iptables verdict %d", val)
 	}
@@ -310,7 +385,7 @@ func translateToStandardTarget(val int32) (iptables.Target, error) {
 
 // SetEntries sets iptables rules for a single table. See
 // net/ipv4/netfilter/ip_tables.c:translate_table for reference.
-func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
+func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// Get the basic rules data (struct ipt_replace).
 	if len(optVal) < linux.SizeOfIPTReplace {
 		nflog("optVal has insufficient size for replace %d", len(optVal))
@@ -322,10 +397,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
 
 	// TODO(gvisor.dev/issue/170): Support other tables.
-	var table iptables.Table
+	var table stack.Table
 	switch replace.Name.String() {
-	case iptables.TablenameFilter:
-		table = iptables.EmptyFilterTable()
+	case stack.TablenameFilter:
+		table = stack.EmptyFilterTable()
+	case stack.TablenameNat:
+		table = stack.EmptyNatTable()
 	default:
 		nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
 		return syserr.ErrInvalidArgument
@@ -335,7 +412,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Convert input into a list of rules and their offsets.
 	var offset uint32
-	var offsets []uint32
+	// offsets maps rule byte offsets to their position in table.Rules.
+	offsets := map[uint32]int{}
 	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
 		nflog("set entries: processing entry at offset %d", offset)
 
@@ -384,23 +462,24 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
 			return syserr.ErrInvalidArgument
 		}
-		target, err := parseTarget(optVal[:targetSize])
+		target, err := parseTarget(filter, optVal[:targetSize])
 		if err != nil {
 			nflog("failed to parse target: %v", err)
 			return syserr.ErrInvalidArgument
 		}
 		optVal = optVal[targetSize:]
 
-		table.Rules = append(table.Rules, iptables.Rule{
+		table.Rules = append(table.Rules, stack.Rule{
 			Filter:   filter,
 			Target:   target,
 			Matchers: matchers,
 		})
-		offsets = append(offsets, offset)
+		offsets[offset] = int(entryIdx)
 		offset += uint32(entry.NextOffset)
 
 		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
 			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
+			return syserr.ErrInvalidArgument
 		}
 	}
 
@@ -409,23 +488,23 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	for hook, _ := range replace.HookEntry {
 		if table.ValidHooks()&(1<<hook) != 0 {
 			hk := hookFromLinux(hook)
-			for ruleIdx, offset := range offsets {
+			for offset, ruleIdx := range offsets {
 				if offset == replace.HookEntry[hook] {
 					table.BuiltinChains[hk] = ruleIdx
 				}
 				if offset == replace.Underflow[hook] {
 					if !validUnderflow(table.Rules[ruleIdx]) {
-						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP.")
+						nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx)
 						return syserr.ErrInvalidArgument
 					}
 					table.Underflows[hk] = ruleIdx
 				}
 			}
-			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
+			if ruleIdx := table.BuiltinChains[hk]; ruleIdx == stack.HookUnset {
 				nflog("hook %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
-			if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
+			if ruleIdx := table.Underflows[hk]; ruleIdx == stack.HookUnset {
 				nflog("underflow %v is unset.", hk)
 				return syserr.ErrInvalidArgument
 			}
@@ -434,7 +513,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 
 	// Add the user chains.
 	for ruleIdx, rule := range table.Rules {
-		target, ok := rule.Target.(iptables.UserChainTarget)
+		target, ok := rule.Target.(stack.UserChainTarget)
 		if !ok {
 			continue
 		}
@@ -444,22 +523,41 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 		// - There's some other rule after it.
 		// - There are no matchers.
 		if ruleIdx == len(table.Rules)-1 {
-			nflog("user chain must have a rule or default policy.")
+			nflog("user chain must have a rule or default policy")
 			return syserr.ErrInvalidArgument
 		}
 		if len(table.Rules[ruleIdx].Matchers) != 0 {
-			nflog("user chain's first node must have no matcheres.")
+			nflog("user chain's first node must have no matchers")
 			return syserr.ErrInvalidArgument
 		}
 		table.UserChains[target.Name] = ruleIdx + 1
 	}
 
+	// Set each jump to point to the appropriate rule. Right now they hold byte
+	// offsets.
+	for ruleIdx, rule := range table.Rules {
+		jump, ok := rule.Target.(JumpTarget)
+		if !ok {
+			continue
+		}
+
+		// Find the rule corresponding to the jump rule offset.
+		jumpTo, ok := offsets[jump.Offset]
+		if !ok {
+			nflog("failed to find a rule to jump to")
+			return syserr.ErrInvalidArgument
+		}
+		jump.RuleNum = jumpTo
+		rule.Target = jump
+		table.Rules[ruleIdx] = rule
+	}
+
 	// TODO(gvisor.dev/issue/170): Support other chains.
-	// Since we only support modifying the INPUT chain right now, make sure
-	// all other chains point to ACCEPT rules.
+	// Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now,
+	// make sure all other chains point to ACCEPT rules.
 	for hook, ruleIdx := range table.BuiltinChains {
-		if hook != iptables.Input {
-			if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok {
+		if hook == stack.Forward || hook == stack.Postrouting {
+			if !isUnconditionalAccept(table.Rules[ruleIdx]) {
 				nflog("hook %d is unsupported.", hook)
 				return syserr.ErrInvalidArgument
 			}
@@ -471,24 +569,22 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
 	// - There are no chains without an unconditional final rule.
 	// - There are no chains without an unconditional underflow rule.
 
-	ipt := stack.IPTables()
 	table.SetMetadata(metadata{
 		HookEntry:  replace.HookEntry,
 		Underflow:  replace.Underflow,
 		NumEntries: replace.NumEntries,
 		Size:       replace.Size,
 	})
-	ipt.Tables[replace.Name.String()] = table
-	stack.SetIPTables(ipt)
+	stk.IPTables().ReplaceTable(replace.Name.String(), table)
 
 	return nil
 }
 
 // parseMatchers parses 0 or more matchers from optVal. optVal should contain
 // only the matchers.
-func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) {
+func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) {
 	nflog("set entries: parsing matchers of size %d", len(optVal))
-	var matchers []iptables.Matcher
+	var matchers []stack.Matcher
 	for len(optVal) > 0 {
 		nflog("set entries: optVal has len %d", len(optVal))
 
@@ -530,7 +626,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma
 
 // parseTarget parses a target from optVal. optVal should contain only the
 // target.
-func parseTarget(optVal []byte) (iptables.Target, error) {
+func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) {
 	nflog("set entries: parsing target of size %d", len(optVal))
 	if len(optVal) < linux.SizeOfXTEntryTarget {
 		return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal))
@@ -548,7 +644,12 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		buf = optVal[:linux.SizeOfXTStandardTarget]
 		binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
 
-		return translateToStandardTarget(standardTarget.Verdict)
+		if standardTarget.Verdict < 0 {
+			// A Verdict < 0 indicates a non-jump verdict.
+			return translateToStandardTarget(standardTarget.Verdict)
+		}
+		// A verdict >= 0 indicates a jump.
+		return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
 
 	case errorTargetName:
 		// Error target.
@@ -569,67 +670,155 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
 		switch name := errorTarget.Name.String(); name {
 		case errorTargetName:
 			nflog("set entries: error target")
-			return iptables.ErrorTarget{}, nil
+			return stack.ErrorTarget{}, nil
 		default:
 			// User defined chain.
 			nflog("set entries: user-defined target %q", name)
-			return iptables.UserChainTarget{Name: name}, nil
+			return stack.UserChainTarget{Name: name}, nil
+		}
+
+	case redirectTargetName:
+		// Redirect target.
+		if len(optVal) < linux.SizeOfXTRedirectTarget {
+			return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal))
+		}
+
+		if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber {
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+		}
+
+		var redirectTarget linux.XTRedirectTarget
+		buf = optVal[:linux.SizeOfXTRedirectTarget]
+		binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
+
+		// Copy linux.XTRedirectTarget to stack.RedirectTarget.
+		var target stack.RedirectTarget
+		nfRange := redirectTarget.NfRange
+
+		// RangeSize should be 1.
+		if nfRange.RangeSize != 1 {
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+		}
+
+		// TODO(gvisor.dev/issue/170): Check if the flags are valid.
+		// Also check if we need to map ports or IP.
+		// For now, redirect target only supports destination port change.
+		// Port range and IP range are not supported yet.
+		if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 {
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
 		}
+		target.RangeProtoSpecified = true
+
+		target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
+		target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:])
+
+		// TODO(gvisor.dev/issue/170): Port range is not supported yet.
+		if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
+			return nil, fmt.Errorf("netfilter.SetEntries: invalid argument")
+		}
+
+		// Convert port from big endian to little endian.
+		port := make([]byte, 2)
+		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort)
+		target.MinPort = binary.LittleEndian.Uint16(port)
+
+		binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort)
+		target.MaxPort = binary.LittleEndian.Uint16(port)
+		return target, nil
 	}
 
 	// Unknown target.
 	return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
 }
 
-func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) {
+func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
 	if containsUnsupportedFields(iptip) {
-		return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
+	}
+	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
+	}
+	if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+	}
+
+	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterface)
 	}
-	return iptables.IPHeaderFilter{
-		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
+	ifname := string(iptip.OutputInterface[:n])
+
+	n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0)
+	if n == -1 {
+		n = len(iptip.OutputInterfaceMask)
+	}
+	ifnameMask := string(iptip.OutputInterfaceMask[:n])
+
+	return stack.IPHeaderFilter{
+		Protocol:              tcpip.TransportProtocolNumber(iptip.Protocol),
+		Dst:                   tcpip.Address(iptip.Dst[:]),
+		DstMask:               tcpip.Address(iptip.DstMask[:]),
+		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
+		Src:                   tcpip.Address(iptip.Src[:]),
+		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
+		SrcInvert:             iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
+		OutputInterface:       ifname,
+		OutputInterfaceMask:   ifnameMask,
+		OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
 	}, nil
 }
 
 func containsUnsupportedFields(iptip linux.IPTIP) bool {
-	// Currently we check that everything except protocol is zeroed.
-	var emptyInetAddr = linux.InetAddr{}
+	// The following features are supported:
+	// - Protocol
+	// - Dst and DstMask
+	// - Src and SrcMask
+	// - The inverse destination IP check flag
+	// - OutputInterface, OutputInterfaceMask and its inverse.
 	var emptyInterface = [linux.IFNAMSIZ]byte{}
-	return iptip.Dst != emptyInetAddr ||
-		iptip.Src != emptyInetAddr ||
-		iptip.SrcMask != emptyInetAddr ||
-		iptip.DstMask != emptyInetAddr ||
-		iptip.InputInterface != emptyInterface ||
-		iptip.OutputInterface != emptyInterface ||
+	// Disable any supported inverse flags.
+	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
+	return iptip.InputInterface != emptyInterface ||
 		iptip.InputInterfaceMask != emptyInterface ||
-		iptip.OutputInterfaceMask != emptyInterface ||
 		iptip.Flags != 0 ||
-		iptip.InverseFlags != 0
+		iptip.InverseFlags&^inverseMask != 0
 }
 
-func validUnderflow(rule iptables.Rule) bool {
+func validUnderflow(rule stack.Rule) bool {
 	if len(rule.Matchers) != 0 {
 		return false
 	}
+	if rule.Filter != emptyFilter {
+		return false
+	}
 	switch rule.Target.(type) {
-	case iptables.AcceptTarget, iptables.DropTarget:
+	case stack.AcceptTarget, stack.DropTarget:
 		return true
 	default:
 		return false
 	}
 }
 
-func hookFromLinux(hook int) iptables.Hook {
+func isUnconditionalAccept(rule stack.Rule) bool {
+	if !validUnderflow(rule) {
+		return false
+	}
+	_, ok := rule.Target.(stack.AcceptTarget)
+	return ok
+}
+
+func hookFromLinux(hook int) stack.Hook {
 	switch hook {
 	case linux.NF_INET_PRE_ROUTING:
-		return iptables.Prerouting
+		return stack.Prerouting
 	case linux.NF_INET_LOCAL_IN:
-		return iptables.Input
+		return stack.Input
 	case linux.NF_INET_FORWARD:
-		return iptables.Forward
+		return stack.Forward
 	case linux.NF_INET_LOCAL_OUT:
-		return iptables.Output
+		return stack.Output
 	case linux.NF_INET_POST_ROUTING:
-		return iptables.Postrouting
+		return stack.Postrouting
 	}
 	panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
 }
diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go
new file mode 100644
index 000000000..1b4e0ad79
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/owner_matcher.go
@@ -0,0 +1,149 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const matcherNameOwner = "owner"
+
+func init() {
+	registerMatchMaker(ownerMarshaler{})
+}
+
+// ownerMarshaler implements matchMaker for owner matching.
+type ownerMarshaler struct{}
+
+// name implements matchMaker.name.
+func (ownerMarshaler) name() string {
+	return matcherNameOwner
+}
+
+// marshal implements matchMaker.marshal.
+func (ownerMarshaler) marshal(mr stack.Matcher) []byte {
+	matcher := mr.(*OwnerMatcher)
+	iptOwnerInfo := linux.IPTOwnerInfo{
+		UID: matcher.uid,
+		GID: matcher.gid,
+	}
+
+	// Support for UID and GID match.
+	if matcher.matchUID {
+		iptOwnerInfo.Match = linux.XT_OWNER_UID
+		if matcher.invertUID {
+			iptOwnerInfo.Invert = linux.XT_OWNER_UID
+		}
+	}
+	if matcher.matchGID {
+		iptOwnerInfo.Match |= linux.XT_OWNER_GID
+		if matcher.invertGID {
+			iptOwnerInfo.Invert |= linux.XT_OWNER_GID
+		}
+	}
+
+	buf := make([]byte, 0, linux.SizeOfIPTOwnerInfo)
+	return marshalEntryMatch(matcherNameOwner, binary.Marshal(buf, usermem.ByteOrder, iptOwnerInfo))
+}
+
+// unmarshal implements matchMaker.unmarshal.
+func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
+	if len(buf) < linux.SizeOfIPTOwnerInfo {
+		return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf))
+	}
+
+	// For alignment reasons, the match's total size may
+	// exceed what's strictly necessary to hold matchData.
+	var matchData linux.IPTOwnerInfo
+	binary.Unmarshal(buf[:linux.SizeOfIPTOwnerInfo], usermem.ByteOrder, &matchData)
+	nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData)
+
+	var owner OwnerMatcher
+	owner.uid = matchData.UID
+	owner.gid = matchData.GID
+
+	// Check flags.
+	if matchData.Match&linux.XT_OWNER_UID != 0 {
+		owner.matchUID = true
+		if matchData.Invert&linux.XT_OWNER_UID != 0 {
+			owner.invertUID = true
+		}
+	}
+	if matchData.Match&linux.XT_OWNER_GID != 0 {
+		owner.matchGID = true
+		if matchData.Invert&linux.XT_OWNER_GID != 0 {
+			owner.invertGID = true
+		}
+	}
+
+	return &owner, nil
+}
+
+type OwnerMatcher struct {
+	uid       uint32
+	gid       uint32
+	matchUID  bool
+	matchGID  bool
+	invertUID bool
+	invertGID bool
+}
+
+// Name implements Matcher.Name.
+func (*OwnerMatcher) Name() string {
+	return matcherNameOwner
+}
+
+// Match implements Matcher.Match.
+func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
+	// Support only for OUTPUT chain.
+	// TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also.
+	if hook != stack.Output {
+		return false, true
+	}
+
+	// If the packet owner is not set, drop the packet.
+	if pkt.Owner == nil {
+		return false, true
+	}
+
+	var matches bool
+	// Check for UID match.
+	if om.matchUID {
+		if pkt.Owner.UID() == om.uid {
+			matches = true
+		}
+		if matches == om.invertUID {
+			return false, false
+		}
+	}
+
+	// Check for GID match.
+	if om.matchGID {
+		matches = false
+		if pkt.Owner.GID() == om.gid {
+			matches = true
+		}
+		if matches == om.invertGID {
+			return false, false
+		}
+	}
+
+	return true, false
+}
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
new file mode 100644
index 000000000..84abe8d29
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netfilter
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// JumpTarget implements stack.Target.
+type JumpTarget struct {
+	// Offset is the byte offset of the rule to jump to. It is used for
+	// marshaling and unmarshaling.
+	Offset uint32
+
+	// RuleNum is the rule to jump to.
+	RuleNum int
+}
+
+// Action implements stack.Target.Action.
+func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrackTable, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
+	return stack.RuleJump, jt.RuleNum
+}
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index f9945e214..4f98ee2d5 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -19,9 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -40,7 +39,7 @@ func (tcpMarshaler) name() string {
 }
 
 // marshal implements matchMaker.marshal.
-func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
+func (tcpMarshaler) marshal(mr stack.Matcher) []byte {
 	matcher := mr.(*TCPMatcher)
 	xttcp := linux.XTTCP{
 		SourcePortStart:      matcher.sourcePortStart,
@@ -53,7 +52,7 @@ func (tcpMarshaler) marshal(mr iptables.Matcher) []byte {
 }
 
 // unmarshal implements matchMaker.unmarshal.
-func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
 	if len(buf) < linux.SizeOfXTTCP {
 		return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf))
 	}
@@ -97,7 +96,7 @@ func (*TCPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
@@ -112,22 +111,10 @@ func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfac
 		return false, false
 	}
 
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
-	// added.
-	var tcpHeader header.TCP
-	if pkt.TransportHeader != nil {
-		tcpHeader = header.TCP(pkt.TransportHeader)
-	} else {
-		// The TCP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.TCPMinimumSize {
-			// There's no valid TCP header here, so we hotdrop the
-			// packet.
-			return false, true
-		}
-		tcpHeader = header.TCP(pkt.Data.First())
+	tcpHeader := header.TCP(pkt.TransportHeader)
+	if len(tcpHeader) < header.TCPMinimumSize {
+		// There's no valid TCP header here, so we drop the packet immediately.
+		return false, true
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 86aa11696..3f20fc891 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -19,9 +19,8 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -40,7 +39,7 @@ func (udpMarshaler) name() string {
 }
 
 // marshal implements matchMaker.marshal.
-func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
+func (udpMarshaler) marshal(mr stack.Matcher) []byte {
 	matcher := mr.(*UDPMatcher)
 	xtudp := linux.XTUDP{
 		SourcePortStart:      matcher.sourcePortStart,
@@ -53,7 +52,7 @@ func (udpMarshaler) marshal(mr iptables.Matcher) []byte {
 }
 
 // unmarshal implements matchMaker.unmarshal.
-func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) {
+func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
 	if len(buf) < linux.SizeOfXTUDP {
 		return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf))
 	}
@@ -94,11 +93,11 @@ func (*UDPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) {
+func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
-	// into the iptables.Check codepath as matchers are added.
+	// into the stack.Check codepath as matchers are added.
 	if netHeader.TransportProtocol() != header.UDPProtocolNumber {
 		return false, false
 	}
@@ -111,22 +110,10 @@ func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfac
 		return false, false
 	}
 
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the iptables.Check codepath as matchers are
-	// added.
-	var udpHeader header.UDP
-	if pkt.TransportHeader != nil {
-		udpHeader = header.UDP(pkt.TransportHeader)
-	} else {
-		// The UDP header hasn't been parsed yet. We have to do it here.
-		if len(pkt.Data.First()) < header.UDPMinimumSize {
-			// There's no valid UDP header here, so we hotdrop the
-			// packet.
-			return false, true
-		}
-		udpHeader = header.UDP(pkt.Data.First())
+	udpHeader := header.UDP(pkt.TransportHeader)
+	if len(udpHeader) < header.UDPMinimumSize {
+		// There's no valid UDP header here, so we drop the packet immediately.
+		return false, true
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 1911cd9b8..7212d8644 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -7,7 +7,9 @@ go_library(
     srcs = [
         "message.go",
         "provider.go",
+        "provider_vfs2.go",
         "socket.go",
+        "socket_vfs2.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
@@ -18,6 +20,7 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
@@ -25,6 +28,7 @@ go_library(
         "//pkg/sentry/socket/netlink/port",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index b0dc70e5c..0d45e5053 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -67,6 +67,8 @@ func RegisterProvider(protocol int, provider Provider) {
 	protocols[protocol] = provider
 }
 
+// LINT.IfChange
+
 // socketProvider implements socket.Provider.
 type socketProvider struct {
 }
@@ -105,7 +107,10 @@ func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.Fi
 	return nil, nil, syserr.ErrNotSupported
 }
 
+// LINT.ThenChange(./provider_vfs2.go)
+
 // init registers the socket provider.
 func init() {
 	socket.RegisterProvider(linux.AF_NETLINK, &socketProvider{})
+	socket.RegisterProviderVFS2(linux.AF_NETLINK, &socketProviderVFS2{})
 }
diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go
new file mode 100644
index 000000000..bb205be0d
--- /dev/null
+++ b/pkg/sentry/socket/netlink/provider_vfs2.go
@@ -0,0 +1,69 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netlink
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+// socketProviderVFS2 implements socket.Provider.
+type socketProviderVFS2 struct {
+}
+
+// Socket implements socket.Provider.Socket.
+func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+	// Netlink sockets must be specified as datagram or raw, but they
+	// behave the same regardless of type.
+	if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW {
+		return nil, syserr.ErrSocketNotSupported
+	}
+
+	provider, ok := protocols[protocol]
+	if !ok {
+		return nil, syserr.ErrProtocolNotSupported
+	}
+
+	p, err := provider(t)
+	if err != nil {
+		return nil, err
+	}
+
+	s, err := NewVFS2(t, stype, p)
+	if err != nil {
+		return nil, err
+	}
+
+	vfsfd := &s.vfsfd
+	mnt := t.Kernel().SocketMount()
+	d := sockfs.NewDentry(t.Credentials(), mnt)
+	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, syserr.FromError(err)
+	}
+	return vfsfd, nil
+}
+
+// Pair implements socket.Provider.Pair by returning an error.
+func (*socketProviderVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+	// Netlink sockets never supports creating socket pairs.
+	return nil, nil, syserr.ErrNotSupported
+}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 2ca02567d..81f34c5a2 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -58,6 +58,8 @@ var errNoFilter = syserr.New("no filter attached", linux.ENOENT)
 // netlinkSocketDevice is the netlink socket virtual device.
 var netlinkSocketDevice = device.NewAnonDevice()
 
+// LINT.IfChange
+
 // Socket is the base socket type for netlink sockets.
 //
 // This implementation only supports userspace sending and receiving messages
@@ -74,6 +76,14 @@ type Socket struct {
 	fsutil.FileNoSplice             `state:"nosave"`
 	fsutil.FileNoopFlush            `state:"nosave"`
 	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	socketOpsCommon
+}
+
+// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
+//
+// +stateify savable
+type socketOpsCommon struct {
 	socket.SendReceiveTimeout
 
 	// ports provides netlink port allocation.
@@ -140,17 +150,19 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke
 	}
 
 	return &Socket{
-		ports:          t.Kernel().NetlinkPorts(),
-		protocol:       protocol,
-		skType:         skType,
-		ep:             ep,
-		connection:     connection,
-		sendBufferSize: defaultSendBufferSize,
+		socketOpsCommon: socketOpsCommon{
+			ports:          t.Kernel().NetlinkPorts(),
+			protocol:       protocol,
+			skType:         skType,
+			ep:             ep,
+			connection:     connection,
+			sendBufferSize: defaultSendBufferSize,
+		},
 	}, nil
 }
 
 // Release implements fs.FileOperations.Release.
-func (s *Socket) Release() {
+func (s *socketOpsCommon) Release() {
 	s.connection.Release()
 	s.ep.Close()
 
@@ -160,7 +172,7 @@ func (s *Socket) Release() {
 }
 
 // Readiness implements waiter.Waitable.Readiness.
-func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
 	// ep holds messages to be read and thus handles EventIn readiness.
 	ready := s.ep.Readiness(mask)
 
@@ -174,18 +186,18 @@ func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask {
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
-func (s *Socket) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 	s.ep.EventRegister(e, mask)
 	// Writable readiness never changes, so no registration is needed.
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
-func (s *Socket) EventUnregister(e *waiter.Entry) {
+func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
 	s.ep.EventUnregister(e)
 }
 
 // Passcred implements transport.Credentialer.Passcred.
-func (s *Socket) Passcred() bool {
+func (s *socketOpsCommon) Passcred() bool {
 	s.mu.Lock()
 	passcred := s.passcred
 	s.mu.Unlock()
@@ -193,7 +205,7 @@ func (s *Socket) Passcred() bool {
 }
 
 // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
-func (s *Socket) ConnectedPasscred() bool {
+func (s *socketOpsCommon) ConnectedPasscred() bool {
 	// This socket is connected to the kernel, which doesn't need creds.
 	//
 	// This is arbitrary, as ConnectedPasscred on this type has no callers.
@@ -227,7 +239,7 @@ func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) {
 // port of 0 defaults to the ThreadGroup ID.
 //
 // Preconditions: mu is held.
-func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error {
+func (s *socketOpsCommon) bindPort(t *kernel.Task, port int32) *syserr.Error {
 	if s.bound {
 		// Re-binding is only allowed if the port doesn't change.
 		if port != s.portID {
@@ -251,7 +263,7 @@ func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error {
 }
 
 // Bind implements socket.Socket.Bind.
-func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 	a, err := ExtractSockAddr(sockaddr)
 	if err != nil {
 		return err
@@ -269,7 +281,7 @@ func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 }
 
 // Connect implements socket.Socket.Connect.
-func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
+func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
 	a, err := ExtractSockAddr(sockaddr)
 	if err != nil {
 		return err
@@ -300,25 +312,25 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr
 }
 
 // Accept implements socket.Socket.Accept.
-func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
 	// Netlink sockets never support accept.
 	return 0, nil, 0, syserr.ErrNotSupported
 }
 
 // Listen implements socket.Socket.Listen.
-func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error {
+func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 	// Netlink sockets never support listen.
 	return syserr.ErrNotSupported
 }
 
 // Shutdown implements socket.Socket.Shutdown.
-func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error {
+func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
 	// Netlink sockets never support shutdown.
 	return syserr.ErrNotSupported
 }
 
 // GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
 	switch level {
 	case linux.SOL_SOCKET:
 		switch name {
@@ -369,7 +381,7 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.
 }
 
 // SetSockOpt implements socket.Socket.SetSockOpt.
-func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
+func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
 	switch level {
 	case linux.SOL_SOCKET:
 		switch name {
@@ -466,7 +478,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
 }
 
 // GetSockName implements socket.Socket.GetSockName.
-func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
@@ -478,7 +490,7 @@ func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Er
 }
 
 // GetPeerName implements socket.Socket.GetPeerName.
-func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	sa := &linux.SockAddrNetlink{
 		Family: linux.AF_NETLINK,
 		// TODO(b/68878065): Support non-kernel peers. For now the peer
@@ -489,7 +501,7 @@ func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Er
 }
 
 // RecvMsg implements socket.Socket.RecvMsg.
-func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
 	from := &linux.SockAddrNetlink{
 		Family: linux.AF_NETLINK,
 		PortID: 0,
@@ -590,7 +602,7 @@ func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID)
 var kernelCreds = &kernelSCM{}
 
 // sendResponse sends the response messages in ms back to userspace.
-func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error {
+func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error {
 	// Linux combines multiple netlink messages into a single datagram.
 	bufs := make([][]byte, 0, len(ms.Messages))
 	for _, m := range ms.Messages {
@@ -666,7 +678,7 @@ func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) {
 
 // processMessages handles each message in buf, passing it to the protocol
 // handler for final handling.
-func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
+func (s *socketOpsCommon) processMessages(ctx context.Context, buf []byte) *syserr.Error {
 	for len(buf) > 0 {
 		msg, rest, ok := ParseMessage(buf)
 		if !ok {
@@ -698,7 +710,7 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error
 }
 
 // sendMsg is the core of message send, used for SendMsg and Write.
-func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) {
+func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) {
 	dstPort := int32(0)
 
 	if len(to) != 0 {
@@ -745,7 +757,7 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte,
 }
 
 // SendMsg implements socket.Socket.SendMsg.
-func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
+func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
 	return s.sendMsg(t, src, to, flags, controlMessages)
 }
 
@@ -756,11 +768,13 @@ func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence,
 }
 
 // State implements socket.Socket.State.
-func (s *Socket) State() uint32 {
+func (s *socketOpsCommon) State() uint32 {
 	return s.ep.State()
 }
 
 // Type implements socket.Socket.Type.
-func (s *Socket) Type() (family int, skType linux.SockType, protocol int) {
+func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
 	return linux.AF_NETLINK, s.skType, s.protocol.Protocol()
 }
+
+// LINT.ThenChange(./socket_vfs2.go)
diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go
new file mode 100644
index 000000000..b854bf990
--- /dev/null
+++ b/pkg/sentry/socket/netlink/socket_vfs2.go
@@ -0,0 +1,138 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netlink
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// SocketVFS2 is the base VFS2 socket type for netlink sockets.
+//
+// This implementation only supports userspace sending and receiving messages
+// to/from the kernel.
+//
+// SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer.
+type SocketVFS2 struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	socketOpsCommon
+}
+
+var _ socket.SocketVFS2 = (*SocketVFS2)(nil)
+var _ transport.Credentialer = (*SocketVFS2)(nil)
+
+// NewVFS2 creates a new SocketVFS2.
+func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketVFS2, *syserr.Error) {
+	// Datagram endpoint used to buffer kernel -> user messages.
+	ep := transport.NewConnectionless(t)
+
+	// Bind the endpoint for good measure so we can connect to it. The
+	// bound address will never be exposed.
+	if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil {
+		ep.Close()
+		return nil, err
+	}
+
+	// Create a connection from which the kernel can write messages.
+	connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t)
+	if err != nil {
+		ep.Close()
+		return nil, err
+	}
+
+	return &SocketVFS2{
+		socketOpsCommon: socketOpsCommon{
+			ports:          t.Kernel().NetlinkPorts(),
+			protocol:       protocol,
+			skType:         skType,
+			ep:             ep,
+			connection:     connection,
+			sendBufferSize: defaultSendBufferSize,
+		},
+	}, nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return s.socketOpsCommon.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	s.socketOpsCommon.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (s *SocketVFS2) EventUnregister(e *waiter.Entry) {
+	s.socketOpsCommon.EventUnregister(e)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (*SocketVFS2) Ioctl(context.Context, usermem.IO, arch.SyscallArguments) (uintptr, error) {
+	// TODO(b/68878065): no ioctls supported.
+	return 0, syserror.ENOTTY
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Read implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	return dst.CopyOutFrom(ctx, &unix.EndpointReader{
+		Endpoint: s.ep,
+	})
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{})
+	return int64(n), err.ToError()
+}
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index ab01cb4fa..8f0f5466e 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -7,7 +7,9 @@ go_library(
     srcs = [
         "device.go",
         "netstack.go",
+        "netstack_vfs2.go",
         "provider.go",
+        "provider_vfs2.go",
         "save_restore.go",
         "stack.go",
     ],
@@ -16,6 +18,7 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/amutex",
         "//pkg/binary",
         "//pkg/context",
         "//pkg/log",
@@ -25,6 +28,7 @@ go_library(
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
@@ -32,13 +36,13 @@ go_library(
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/netfilter",
         "//pkg/sentry/unimpl",
+        "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
@@ -46,5 +50,6 @@ go_library(
         "//pkg/tcpip/transport/udp",
         "//pkg/usermem",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9757fbfba..e1e0c5931 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -29,10 +29,13 @@ import (
 	"io"
 	"math"
 	"reflect"
+	"sync/atomic"
 	"syscall"
 	"time"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
@@ -62,7 +65,13 @@ import (
 
 func mustCreateMetric(name, description string) *tcpip.StatCounter {
 	var cm tcpip.StatCounter
-	metric.MustRegisterCustomUint64Metric(name, false /* sync */, description, cm.Value)
+	metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, cm.Value)
+	return &cm
+}
+
+func mustCreateGauge(name, description string) *tcpip.StatCounter {
+	var cm tcpip.StatCounter
+	metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, cm.Value)
 	return &cm
 }
 
@@ -150,10 +159,10 @@ var Metrics = tcpip.Stats{
 	TCP: tcpip.TCPStats{
 		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
 		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
-		CurrentEstablished:                 mustCreateMetric("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
-		CurrentConnected:                   mustCreateMetric("/netstack/tcp/current_open", "Number of connections that are in connected state."),
+		CurrentEstablished:                 mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
+		CurrentConnected:                   mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."),
 		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
-		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "number of times established TCP connections made a transition to CLOSED state."),
+		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."),
 		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
 		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
 		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
@@ -245,6 +254,8 @@ type commonEndpoint interface {
 	GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
 }
 
+// LINT.IfChange
+
 // SocketOperations encapsulates all the state needed to represent a network stack
 // endpoint in the kernel context.
 //
@@ -256,6 +267,14 @@ type SocketOperations struct {
 	fsutil.FileNoFsync              `state:"nosave"`
 	fsutil.FileNoMMap               `state:"nosave"`
 	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	socketOpsCommon
+}
+
+// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
+//
+// +stateify savable
+type socketOpsCommon struct {
 	socket.SendReceiveTimeout
 	*waiter.Queue
 
@@ -264,6 +283,12 @@ type SocketOperations struct {
 	skType   linux.SockType
 	protocol int
 
+	// readViewHasData is 1 iff readView has data to be read, 0 otherwise.
+	// Must be accessed using atomic operations. It must only be written
+	// with readMu held but can be read without holding readMu. The latter
+	// is required to avoid deadlocks in epoll Readiness checks.
+	readViewHasData uint32
+
 	// readMu protects access to the below fields.
 	readMu sync.Mutex `state:"nosave"`
 	// readView contains the remaining payload from the last packet.
@@ -293,7 +318,7 @@ type SocketOperations struct {
 // New creates a new endpoint socket.
 func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
 	if skType == linux.SOCK_STREAM {
-		if err := endpoint.SetSockOptInt(tcpip.DelayOption, 1); err != nil {
+		if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 	}
@@ -301,11 +326,13 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue
 	dirent := socket.NewDirent(t, netstackDevice)
 	defer dirent.DecRef()
 	return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{
-		Queue:    queue,
-		family:   family,
-		Endpoint: endpoint,
-		skType:   skType,
-		protocol: protocol,
+		socketOpsCommon: socketOpsCommon{
+			Queue:    queue,
+			family:   family,
+			Endpoint: endpoint,
+			skType:   skType,
+			protocol: protocol,
+		},
 	}), nil
 }
 
@@ -404,33 +431,36 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
 	}
 }
 
-func (s *SocketOperations) isPacketBased() bool {
+func (s *socketOpsCommon) isPacketBased() bool {
 	return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW
 }
 
 // fetchReadView updates the readView field of the socket if it's currently
 // empty. It assumes that the socket is locked.
-func (s *SocketOperations) fetchReadView() *syserr.Error {
+//
+// Precondition: s.readMu must be held.
+func (s *socketOpsCommon) fetchReadView() *syserr.Error {
 	if len(s.readView) > 0 {
 		return nil
 	}
-
 	s.readView = nil
 	s.sender = tcpip.FullAddress{}
 
 	v, cms, err := s.Endpoint.Read(&s.sender)
 	if err != nil {
+		atomic.StoreUint32(&s.readViewHasData, 0)
 		return syserr.TranslateNetstackError(err)
 	}
 
 	s.readView = v
 	s.readCM = cms
+	atomic.StoreUint32(&s.readViewHasData, 1)
 
 	return nil
 }
 
 // Release implements fs.FileOperations.Release.
-func (s *SocketOperations) Release() {
+func (s *socketOpsCommon) Release() {
 	s.Endpoint.Close()
 }
 
@@ -525,11 +555,9 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 	}
 
 	if resCh != nil {
-		t := ctx.(*kernel.Task)
-		if err := t.Block(resCh); err != nil {
-			return 0, syserr.FromError(err).ToError()
+		if err := amutex.Block(ctx, resCh); err != nil {
+			return 0, err
 		}
-
 		n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{})
 	}
 
@@ -598,11 +626,9 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader
 	}
 
 	if resCh != nil {
-		t := ctx.(*kernel.Task)
-		if err := t.Block(resCh); err != nil {
-			return 0, syserr.FromError(err).ToError()
+		if err := amutex.Block(ctx, resCh); err != nil {
+			return 0, err
 		}
-
 		n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{
 			Atomic: true, // See above.
 		})
@@ -617,23 +643,21 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader
 }
 
 // Readiness returns a mask of ready events for socket s.
-func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
 	r := s.Endpoint.Readiness(mask)
 
 	// Check our cached value iff the caller asked for readability and the
 	// endpoint itself is currently not readable.
 	if (mask & ^r & waiter.EventIn) != 0 {
-		s.readMu.Lock()
-		if len(s.readView) > 0 {
+		if atomic.LoadUint32(&s.readViewHasData) == 1 {
 			r |= waiter.EventIn
 		}
-		s.readMu.Unlock()
 	}
 
 	return r
 }
 
-func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error {
+func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error {
 	if family == uint16(s.family) {
 		return nil
 	}
@@ -655,8 +679,8 @@ func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error
 // This is a hack to work around the fact that both IPv4 and IPv6 ANY are
 // represented by the empty string.
 //
-// TODO(gvisor.dev/issues/1556): remove this function.
-func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
+// TODO(gvisor.dev/issue/1556): remove this function.
+func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
 	if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
 		addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
 	}
@@ -665,7 +689,7 @@ func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpi
 
 // Connect implements the linux syscall connect(2) for sockets backed by
 // tpcip.Endpoint.
-func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
+func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
 	addr, family, err := AddressAndFamily(sockaddr)
 	if err != nil {
 		return err
@@ -696,6 +720,14 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 	defer s.EventUnregister(&e)
 
 	if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting {
+		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
+			// TCP unlike UDP returns EADDRNOTAVAIL when it can't
+			// find an available local ephemeral port.
+			if err == tcpip.ErrNoPortAvailable {
+				return syserr.ErrAddressNotAvailable
+			}
+		}
+
 		return syserr.TranslateNetstackError(err)
 	}
 
@@ -711,15 +743,45 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, family, err := AddressAndFamily(sockaddr)
-	if err != nil {
-		return err
+func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	if len(sockaddr) < 2 {
+		return syserr.ErrInvalidArgument
 	}
-	if err := s.checkFamily(family, true /* exact */); err != nil {
-		return err
+
+	family := usermem.ByteOrder.Uint16(sockaddr)
+	var addr tcpip.FullAddress
+
+	// Bind for AF_PACKET requires only family, protocol and ifindex.
+	// In function AddressAndFamily, we check the address length which is
+	// not needed for AF_PACKET bind.
+	if family == linux.AF_PACKET {
+		var a linux.SockAddrLink
+		if len(sockaddr) < sockAddrLinkSize {
+			return syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+
+		if a.Protocol != uint16(s.protocol) {
+			return syserr.ErrInvalidArgument
+		}
+
+		addr = tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}
+	} else {
+		var err *syserr.Error
+		addr, family, err = AddressAndFamily(sockaddr)
+		if err != nil {
+			return err
+		}
+
+		if err = s.checkFamily(family, true /* exact */); err != nil {
+			return err
+		}
+
+		addr = s.mapFamily(addr, family)
 	}
-	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
@@ -727,13 +789,13 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 
 // Listen implements the linux syscall listen(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error {
+func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 	return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog))
 }
 
 // blockingAccept implements a blocking version of accept(2), that is, if no
 // connections are ready to be accept, it will block until one becomes ready.
-func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
+func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
 	// Register for notifications.
 	e, ch := waiter.NewChannelEntry(nil)
 	s.EventRegister(&e, waiter.EventIn)
@@ -819,7 +881,7 @@ func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) {
 
 // Shutdown implements the linux syscall shutdown(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
+func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
 	f, err := ConvertShutdown(how)
 	if err != nil {
 		return err
@@ -902,7 +964,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 
 // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
 // sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
 	switch level {
 	case linux.SOL_SOCKET:
 		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -927,8 +989,15 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int,
 	return nil, syserr.ErrProtocolNotAvailable
 }
 
+func boolToInt32(v bool) int32 {
+	if v {
+		return 1
+	}
+	return 0
+}
+
 // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
-func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
 	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
 	switch name {
 	case linux.SO_ERROR:
@@ -960,12 +1029,11 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.PasscredOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.PasscredOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_SNDBUF:
 		if outLen < sizeOfInt32 {
@@ -1004,24 +1072,22 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.ReuseAddressOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.ReuseAddressOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_REUSEPORT:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.ReusePortOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.ReusePortOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_BINDTODEVICE:
 		var v tcpip.BindToDeviceOption
@@ -1051,24 +1117,22 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.BroadcastOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.BroadcastOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_KEEPALIVE:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.KeepaliveEnabledOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.KeepaliveEnabledOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.SO_LINGER:
 		if outLen < linux.SizeOfLinger {
@@ -1118,47 +1182,41 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		v, err := ep.GetSockOptInt(tcpip.DelayOption)
+		v, err := ep.GetSockOptBool(tcpip.DelayOption)
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		if v == 0 {
-			return int32(1), nil
-		}
-		return int32(0), nil
+		return boolToInt32(!v), nil
 
 	case linux.TCP_CORK:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.CorkOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.CorkOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.TCP_QUICKACK:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.QuickAckOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.QuickAckOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		return int32(v), nil
+		return boolToInt32(v), nil
 
 	case linux.TCP_MAXSEG:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.MaxSegOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.MaxSegOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1272,6 +1330,29 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_SYNCNT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(v), nil
+
+	case linux.TCP_WINDOW_CLAMP:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(v), nil
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1290,11 +1371,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o int32
-		if v {
-			o = 1
-		}
-		return o, nil
+		return boolToInt32(v), nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1304,8 +1381,8 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if outLen == 0 {
 			return make([]byte, 0), nil
 		}
-		var v tcpip.IPv6TrafficClassOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1318,6 +1395,17 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		}
 		return ib, nil
 
+	case linux.IPV6_RECVTCLASS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		return boolToInt32(v), nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -1332,8 +1420,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.TTLOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.TTLOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1349,8 +1437,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.MulticastTTLOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 
@@ -1375,23 +1463,19 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 			return nil, syserr.ErrInvalidArgument
 		}
 
-		var v tcpip.MulticastLoopOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptBool(tcpip.MulticastLoopOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-
-		if v {
-			return int32(1), nil
-		}
-		return int32(0), nil
+		return boolToInt32(v), nil
 
 	case linux.IP_TOS:
 		// Length handling for parity with Linux.
 		if outLen == 0 {
 			return []byte(nil), nil
 		}
-		var v tcpip.IPv4TOSOption
-		if err := ep.GetSockOpt(&v); err != nil {
+		v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
+		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
 		if outLen < sizeOfInt32 {
@@ -1408,11 +1492,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o int32
-		if v {
-			o = 1
-		}
-		return o, nil
+		return boolToInt32(v), nil
 
 	case linux.IP_PKTINFO:
 		if outLen < sizeOfInt32 {
@@ -1423,11 +1503,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o int32
-		if v {
-			o = 1
-		}
-		return o, nil
+		return boolToInt32(v), nil
 
 	default:
 		emitUnimplementedEventIP(t, name)
@@ -1487,7 +1563,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 
 // SetSockOpt can be used to implement the linux syscall setsockopt(2) for
 // sockets backed by a commonEndpoint.
-func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
+func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
 	switch level {
 	case linux.SOL_SOCKET:
 		return setSockOptSocket(t, s, ep, name, optVal)
@@ -1514,7 +1590,7 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n
 }
 
 // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
-func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
+func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
 	switch name {
 	case linux.SO_SNDBUF:
 		if len(optVal) < sizeOfInt32 {
@@ -1538,7 +1614,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReuseAddressOption, v != 0))
 
 	case linux.SO_REUSEPORT:
 		if len(optVal) < sizeOfInt32 {
@@ -1546,7 +1622,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReusePortOption, v != 0))
 
 	case linux.SO_BINDTODEVICE:
 		n := bytes.IndexByte(optVal, 0)
@@ -1574,7 +1650,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BroadcastOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.BroadcastOption, v != 0))
 
 	case linux.SO_PASSCRED:
 		if len(optVal) < sizeOfInt32 {
@@ -1582,7 +1658,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.PasscredOption, v != 0))
 
 	case linux.SO_KEEPALIVE:
 		if len(optVal) < sizeOfInt32 {
@@ -1590,7 +1666,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveEnabledOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.KeepaliveEnabledOption, v != 0))
 
 	case linux.SO_SNDTIMEO:
 		if len(optVal) < linux.SizeOfTimeval {
@@ -1662,11 +1738,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		var o int
-		if v == 0 {
-			o = 1
-		}
-		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.DelayOption, o))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.DelayOption, v == 0))
 
 	case linux.TCP_CORK:
 		if len(optVal) < sizeOfInt32 {
@@ -1674,7 +1746,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.CorkOption, v != 0))
 
 	case linux.TCP_QUICKACK:
 		if len(optVal) < sizeOfInt32 {
@@ -1682,7 +1754,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.QuickAckOption, v != 0))
 
 	case linux.TCP_MAXSEG:
 		if len(optVal) < sizeOfInt32 {
@@ -1690,7 +1762,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 
 		v := usermem.ByteOrder.Uint32(optVal)
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MaxSegOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v)))
 
 	case linux.TCP_KEEPIDLE:
 		if len(optVal) < sizeOfInt32 {
@@ -1750,6 +1822,22 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_SYNCNT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		v := usermem.ByteOrder.Uint32(optVal)
+
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v)))
+
+	case linux.TCP_WINDOW_CLAMP:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		v := usermem.ByteOrder.Uint32(optVal)
+
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v)))
+
 	case linux.TCP_REPAIR_OPTIONS:
 		t.Kernel().EmitUnimplementedEvent(t)
 
@@ -1801,7 +1889,15 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		if v == -1 {
 			v = 0
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v)))
+
+	case linux.IPV6_RECVTCLASS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
 
 	default:
 		emitUnimplementedEventIPv6(t, name)
@@ -1878,7 +1974,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		if v < 0 || v > 255 {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastTTLOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v)))
 
 	case linux.IP_ADD_MEMBERSHIP:
 		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
@@ -1925,9 +2021,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 			return err
 		}
 
-		return syserr.TranslateNetstackError(ep.SetSockOpt(
-			tcpip.MulticastLoopOption(v != 0),
-		))
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.MulticastLoopOption, v != 0))
 
 	case linux.MCAST_JOIN_GROUP:
 		// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
@@ -1946,7 +2040,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		} else if v < 1 || v > 255 {
 			return syserr.ErrInvalidArgument
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TTLOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TTLOption, int(v)))
 
 	case linux.IP_TOS:
 		if len(optVal) == 0 {
@@ -1956,7 +2050,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		if err != nil {
 			return err
 		}
-		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v)))
 
 	case linux.IP_RECVTOS:
 		v, err := parseIntOrChar(optVal)
@@ -2086,7 +2180,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) {
 		linux.IPV6_RECVPATHMTU,
 		linux.IPV6_RECVPKTINFO,
 		linux.IPV6_RECVRTHDR,
-		linux.IPV6_RECVTCLASS,
 		linux.IPV6_RTHDR,
 		linux.IPV6_RTHDRDSTOPTS,
 		linux.IPV6_TCLASS,
@@ -2222,7 +2315,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
 
 // GetSockName implements the linux syscall getsockname(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.Endpoint.GetLocalAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -2234,7 +2327,7 @@ func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32,
 
 // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.Endpoint.GetRemoteAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -2249,7 +2342,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32,
 // caller.
 //
 // Precondition: s.readMu must be locked.
-func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) {
+func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) {
 	var err *syserr.Error
 	var copied int
 
@@ -2281,6 +2374,10 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 		}
 		copied += n
 		s.readView.TrimFront(n)
+		if len(s.readView) == 0 {
+			atomic.StoreUint32(&s.readViewHasData, 0)
+		}
+
 		dst = dst.DropFirst(n)
 		if e != nil {
 			err = syserr.FromError(e)
@@ -2297,7 +2394,7 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 	return 0, err
 }
 
-func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
+func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) {
 	if !s.sockOptInq {
 		return
 	}
@@ -2312,7 +2409,7 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
 // nonBlockingRead issues a non-blocking read.
 //
 // TODO(b/78348848): Support timestamps for stream sockets.
-func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
 	isPacket := s.isPacketBased()
 
 	// Fast path for regular reads from stream (e.g., TCP) endpoints. Note
@@ -2327,9 +2424,9 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 		// caller-supplied  buffer.
 		s.readMu.Lock()
 		n, err := s.coalescingRead(ctx, dst, trunc)
-		s.readMu.Unlock()
 		cmsg := s.controlMessages()
 		s.fillCmsgInq(&cmsg)
+		s.readMu.Unlock()
 		return n, 0, nil, 0, cmsg, err
 	}
 
@@ -2403,6 +2500,10 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 		s.readView.TrimFront(int(n))
 	}
 
+	if len(s.readView) == 0 {
+		atomic.StoreUint32(&s.readViewHasData, 0)
+	}
+
 	var flags int
 	if msgLen > int(n) {
 		flags |= linux.MSG_TRUNC
@@ -2417,13 +2518,15 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 	return n, flags, addr, addrLen, cmsg, syserr.FromError(err)
 }
 
-func (s *SocketOperations) controlMessages() socket.ControlMessages {
+func (s *socketOpsCommon) controlMessages() socket.ControlMessages {
 	return socket.ControlMessages{
 		IP: tcpip.ControlMessages{
 			HasTimestamp:    s.readCM.HasTimestamp && s.sockOptTimestamp,
 			Timestamp:       s.readCM.Timestamp,
 			HasTOS:          s.readCM.HasTOS,
 			TOS:             s.readCM.TOS,
+			HasTClass:       s.readCM.HasTClass,
+			TClass:          s.readCM.TClass,
 			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
 			PacketInfo:      s.readCM.PacketInfo,
 		},
@@ -2434,7 +2537,7 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages {
 // successfully writing packet data out to userspace.
 //
 // Precondition: s.readMu must be locked.
-func (s *SocketOperations) updateTimestamp() {
+func (s *socketOpsCommon) updateTimestamp() {
 	// Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled.
 	if !s.sockOptTimestamp {
 		s.timestampValid = true
@@ -2444,7 +2547,7 @@ func (s *SocketOperations) updateTimestamp() {
 
 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
 	trunc := flags&linux.MSG_TRUNC != 0
 	peek := flags&linux.MSG_PEEK != 0
 	dontWait := flags&linux.MSG_DONTWAIT != 0
@@ -2512,7 +2615,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 
 // SendMsg implements the linux syscall sendmsg(2) for sockets backed by
 // tcpip.Endpoint.
-func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
+func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
 	// Reject Unix control messages.
 	if !controlMessages.Unix.Empty() {
 		return 0, syserr.ErrInvalidArgument
@@ -2588,6 +2691,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 
 // Ioctl implements fs.FileOperations.Ioctl.
 func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return s.socketOpsCommon.ioctl(ctx, io, args)
+}
+
+func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
 	// sockets.
 	// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
@@ -2612,13 +2719,15 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
 		}
 
 		// Add bytes removed from the endpoint but not yet sent to the caller.
+		s.readMu.Lock()
 		v += len(s.readView)
+		s.readMu.Unlock()
 
 		if v > math.MaxInt32 {
 			v = math.MaxInt32
 		}
 
-		// Copy result to user-space.
+		// Copy result to userspace.
 		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
 			AddressSpaceActive: true,
 		})
@@ -2687,7 +2796,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
 		if v > math.MaxInt32 {
 			v = math.MaxInt32
 		}
-		// Copy result to user-space.
+		// Copy result to userspace.
 		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
 			AddressSpaceActive: true,
 		})
@@ -2703,7 +2812,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
 			v = math.MaxInt32
 		}
 
-		// Copy result to user-space.
+		// Copy result to userspace.
 		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
 			AddressSpaceActive: true,
 		})
@@ -2925,7 +3034,7 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
 
 // State implements socket.Socket.State. State translates the internal state
 // returned by netstack to values defined by Linux.
-func (s *SocketOperations) State() uint32 {
+func (s *socketOpsCommon) State() uint32 {
 	if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
 		// States not implemented for this socket's family.
 		return 0
@@ -2985,6 +3094,8 @@ func (s *SocketOperations) State() uint32 {
 }
 
 // Type implements socket.Socket.Type.
-func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
+func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
 	return s.family, s.skType, s.protocol
 }
+
+// LINT.ThenChange(./netstack_vfs2.go)
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
new file mode 100644
index 000000000..fcd8013c0
--- /dev/null
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -0,0 +1,317 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netstack
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// SocketVFS2 encapsulates all the state needed to represent a network stack
+// endpoint in the kernel context.
+type SocketVFS2 struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	socketOpsCommon
+}
+
+var _ = socket.SocketVFS2(&SocketVFS2{})
+
+// NewVFS2 creates a new endpoint socket.
+func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) {
+	if skType == linux.SOCK_STREAM {
+		if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+	}
+
+	mnt := t.Kernel().SocketMount()
+	d := sockfs.NewDentry(t.Credentials(), mnt)
+
+	s := &SocketVFS2{
+		socketOpsCommon: socketOpsCommon{
+			Queue:    queue,
+			family:   family,
+			Endpoint: endpoint,
+			skType:   skType,
+			protocol: protocol,
+		},
+	}
+	vfsfd := &s.vfsfd
+	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, syserr.FromError(err)
+	}
+	return vfsfd, nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return s.socketOpsCommon.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	s.socketOpsCommon.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (s *SocketVFS2) EventUnregister(e *waiter.Entry) {
+	s.socketOpsCommon.EventUnregister(e)
+}
+
+// Read implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
+	if err == syserr.ErrWouldBlock {
+		return int64(n), syserror.ErrWouldBlock
+	}
+	if err != nil {
+		return 0, err.ToError()
+	}
+	return int64(n), nil
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	f := &ioSequencePayload{ctx: ctx, src: src}
+	n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{})
+	if err == tcpip.ErrWouldBlock {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	if resCh != nil {
+		if err := amutex.Block(ctx, resCh); err != nil {
+			return 0, err
+		}
+		n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{})
+	}
+
+	if err != nil {
+		return 0, syserr.TranslateNetstackError(err).ToError()
+	}
+
+	if int64(n) < src.NumBytes() {
+		return int64(n), syserror.ErrWouldBlock
+	}
+
+	return int64(n), nil
+}
+
+// Accept implements the linux syscall accept(2) for sockets backed by
+// tcpip.Endpoint.
+func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+	// Issue the accept request to get the new endpoint.
+	ep, wq, terr := s.Endpoint.Accept()
+	if terr != nil {
+		if terr != tcpip.ErrWouldBlock || !blocking {
+			return 0, nil, 0, syserr.TranslateNetstackError(terr)
+		}
+
+		var err *syserr.Error
+		ep, wq, err = s.blockingAccept(t)
+		if err != nil {
+			return 0, nil, 0, err
+		}
+	}
+
+	ns, err := NewVFS2(t, s.family, s.skType, s.protocol, wq, ep)
+	if err != nil {
+		return 0, nil, 0, err
+	}
+	defer ns.DecRef()
+
+	if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil {
+		return 0, nil, 0, syserr.FromError(err)
+	}
+
+	var addr linux.SockAddr
+	var addrLen uint32
+	if peerRequested {
+		// Get address of the peer and write it to peer slice.
+		var err *syserr.Error
+		addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
+		if err != nil {
+			return 0, nil, 0, err
+		}
+	}
+
+	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
+		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
+	})
+
+	t.Kernel().RecordSocketVFS2(ns)
+
+	return fd, addr, addrLen, syserr.FromError(e)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return s.socketOpsCommon.ioctl(ctx, uio, args)
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
+// tcpip.Endpoint.
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
+	// implemented specifically for netstack.SocketVFS2 rather than
+	// commonEndpoint. commonEndpoint should be extended to support socket
+	// options where the implementation is not shared, as unix sockets need
+	// their own support for SO_TIMESTAMP.
+	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+		val := int32(0)
+		s.readMu.Lock()
+		defer s.readMu.Unlock()
+		if s.sockOptTimestamp {
+			val = 1
+		}
+		return val, nil
+	}
+	if level == linux.SOL_TCP && name == linux.TCP_INQ {
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+		val := int32(0)
+		s.readMu.Lock()
+		defer s.readMu.Unlock()
+		if s.sockOptInq {
+			val = 1
+		}
+		return val, nil
+	}
+
+	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
+		switch name {
+		case linux.IPT_SO_GET_INFO:
+			if outLen < linux.SizeOfIPTGetinfo {
+				return nil, syserr.ErrInvalidArgument
+			}
+
+			stack := inet.StackFromContext(t)
+			if stack == nil {
+				return nil, syserr.ErrNoDevice
+			}
+			info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr)
+			if err != nil {
+				return nil, err
+			}
+			return info, nil
+
+		case linux.IPT_SO_GET_ENTRIES:
+			if outLen < linux.SizeOfIPTGetEntries {
+				return nil, syserr.ErrInvalidArgument
+			}
+
+			stack := inet.StackFromContext(t)
+			if stack == nil {
+				return nil, syserr.ErrNoDevice
+			}
+			entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen)
+			if err != nil {
+				return nil, err
+			}
+			return entries, nil
+
+		}
+	}
+
+	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
+// tcpip.Endpoint.
+func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
+	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
+	// implemented specifically for netstack.SocketVFS2 rather than
+	// commonEndpoint. commonEndpoint should be extended to support socket
+	// options where the implementation is not shared, as unix sockets need
+	// their own support for SO_TIMESTAMP.
+	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		s.readMu.Lock()
+		defer s.readMu.Unlock()
+		s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0
+		return nil
+	}
+	if level == linux.SOL_TCP && name == linux.TCP_INQ {
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		s.readMu.Lock()
+		defer s.readMu.Unlock()
+		s.sockOptInq = usermem.ByteOrder.Uint32(optVal) != 0
+		return nil
+	}
+
+	if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
+		switch name {
+		case linux.IPT_SO_SET_REPLACE:
+			if len(optVal) < linux.SizeOfIPTReplace {
+				return syserr.ErrInvalidArgument
+			}
+
+			stack := inet.StackFromContext(t)
+			if stack == nil {
+				return syserr.ErrNoDevice
+			}
+			// Stack must be a netstack stack.
+			return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
+
+		case linux.IPT_SO_SET_ADD_COUNTERS:
+			// TODO(gvisor.dev/issue/170): Counter support.
+			return nil
+		}
+	}
+
+	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
+}
diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go
index 5afff2564..ead3b2b79 100644
--- a/pkg/sentry/socket/netstack/provider.go
+++ b/pkg/sentry/socket/netstack/provider.go
@@ -33,6 +33,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // provider is an inet socket provider.
 type provider struct {
 	family   int
@@ -62,10 +64,6 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 		}
 
 	case linux.SOCK_RAW:
-		// TODO(b/142504697): "In order to create a raw socket, a
-		// process must have the CAP_NET_RAW capability in the user
-		// namespace that governs its network namespace." - raw(7)
-
 		// Raw sockets require CAP_NET_RAW.
 		creds := auth.CredentialsFromContext(ctx)
 		if !creds.HasCapability(linux.CAP_NET_RAW) {
@@ -75,6 +73,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in
 		switch protocol {
 		case syscall.IPPROTO_ICMP:
 			return header.ICMPv4ProtocolNumber, true, nil
+		case syscall.IPPROTO_ICMPV6:
+			return header.ICMPv6ProtocolNumber, true, nil
 		case syscall.IPPROTO_UDP:
 			return header.UDPProtocolNumber, true, nil
 		case syscall.IPPROTO_TCP:
@@ -124,6 +124,12 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*
 		ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated)
 	} else {
 		ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq)
+
+		// Assign task to PacketOwner interface to get the UID and GID for
+		// iptables owner matching.
+		if e == nil {
+			ep.SetOwner(t)
+		}
 	}
 	if e != nil {
 		return nil, syserr.TranslateNetstackError(e)
@@ -133,10 +139,6 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*
 }
 
 func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
-	// TODO(b/142504697): "In order to create a packet socket, a process
-	// must have the CAP_NET_RAW capability in the user namespace that
-	// governs its network namespace." - packet(7)
-
 	// Packet sockets require CAP_NET_RAW.
 	creds := auth.CredentialsFromContext(t)
 	if !creds.HasCapability(linux.CAP_NET_RAW) {
@@ -167,6 +169,8 @@ func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol
 	return New(t, linux.AF_PACKET, stype, protocol, wq, ep)
 }
 
+// LINT.ThenChange(./provider_vfs2.go)
+
 // Pair just returns nil sockets (not supported).
 func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
 	return nil, nil, nil
diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go
new file mode 100644
index 000000000..2a01143f6
--- /dev/null
+++ b/pkg/sentry/socket/netstack/provider_vfs2.go
@@ -0,0 +1,141 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package netstack
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// providerVFS2 is an inet socket provider.
+type providerVFS2 struct {
+	family   int
+	netProto tcpip.NetworkProtocolNumber
+}
+
+// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET
+// family.
+func (p *providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+	// Fail right away if we don't have a stack.
+	stack := t.NetworkContext()
+	if stack == nil {
+		// Don't propagate an error here. Instead, allow the socket
+		// code to continue searching for another provider.
+		return nil, nil
+	}
+	eps, ok := stack.(*Stack)
+	if !ok {
+		return nil, nil
+	}
+
+	// Packet sockets are handled separately, since they are neither INET
+	// nor INET6 specific.
+	if p.family == linux.AF_PACKET {
+		return packetSocketVFS2(t, eps, stype, protocol)
+	}
+
+	// Figure out the transport protocol.
+	transProto, associated, err := getTransportProtocol(t, stype, protocol)
+	if err != nil {
+		return nil, err
+	}
+
+	// Create the endpoint.
+	var ep tcpip.Endpoint
+	var e *tcpip.Error
+	wq := &waiter.Queue{}
+	if stype == linux.SOCK_RAW {
+		ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated)
+	} else {
+		ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq)
+
+		// Assign task to PacketOwner interface to get the UID and GID for
+		// iptables owner matching.
+		if e == nil {
+			ep.SetOwner(t)
+		}
+	}
+	if e != nil {
+		return nil, syserr.TranslateNetstackError(e)
+	}
+
+	return NewVFS2(t, p.family, stype, int(transProto), wq, ep)
+}
+
+func packetSocketVFS2(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+	// Packet sockets require CAP_NET_RAW.
+	creds := auth.CredentialsFromContext(t)
+	if !creds.HasCapability(linux.CAP_NET_RAW) {
+		return nil, syserr.ErrNotPermitted
+	}
+
+	// "cooked" packets don't contain link layer information.
+	var cooked bool
+	switch stype {
+	case linux.SOCK_DGRAM:
+		cooked = true
+	case linux.SOCK_RAW:
+		cooked = false
+	default:
+		return nil, syserr.ErrProtocolNotSupported
+	}
+
+	// protocol is passed in network byte order, but netstack wants it in
+	// host order.
+	netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol)))
+
+	wq := &waiter.Queue{}
+	ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq)
+	if err != nil {
+		return nil, syserr.TranslateNetstackError(err)
+	}
+
+	return NewVFS2(t, linux.AF_PACKET, stype, protocol, wq, ep)
+}
+
+// Pair just returns nil sockets (not supported).
+func (*providerVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+	return nil, nil, nil
+}
+
+// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET.
+func init() {
+	// Providers backed by netstack.
+	p := []providerVFS2{
+		{
+			family:   linux.AF_INET,
+			netProto: ipv4.ProtocolNumber,
+		},
+		{
+			family:   linux.AF_INET6,
+			netProto: ipv6.ProtocolNumber,
+		},
+		{
+			family: linux.AF_PACKET,
+		},
+	}
+
+	for i := range p {
+		socket.RegisterProviderVFS2(p[i].family, &p[i])
+	}
+}
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 0692482e9..9b44c2b89 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -23,7 +23,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -200,36 +199,66 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 // Statistics implements inet.Stack.Statistics.
 func (s *Stack) Statistics(stat interface{}, arg string) error {
 	switch stats := stat.(type) {
+	case *inet.StatDev:
+		for _, ni := range s.Stack.NICInfo() {
+			if ni.Name != arg {
+				continue
+			}
+			// TODO(gvisor.dev/issue/2103) Support stubbed stats.
+			*stats = inet.StatDev{
+				// Receive section.
+				ni.Stats.Rx.Bytes.Value(),   // bytes.
+				ni.Stats.Rx.Packets.Value(), // packets.
+				0,                           // errs.
+				0,                           // drop.
+				0,                           // fifo.
+				0,                           // frame.
+				0,                           // compressed.
+				0,                           // multicast.
+				// Transmit section.
+				ni.Stats.Tx.Bytes.Value(),   // bytes.
+				ni.Stats.Tx.Packets.Value(), // packets.
+				0,                           // errs.
+				0,                           // drop.
+				0,                           // fifo.
+				0,                           // colls.
+				0,                           // carrier.
+				0,                           // compressed.
+			}
+			break
+		}
 	case *inet.StatSNMPIP:
 		ip := Metrics.IP
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPIP{
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/Forwarding.
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL.
+			0,                          // Ip/Forwarding.
+			0,                          // Ip/DefaultTTL.
 			ip.PacketsReceived.Value(), // InReceives.
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors.
+			0,                          // Ip/InHdrErrors.
 			ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InDiscards.
+			0,                               // Ip/ForwDatagrams.
+			0,                               // Ip/InUnknownProtos.
+			0,                               // Ip/InDiscards.
 			ip.PacketsDelivered.Value(),     // InDelivers.
 			ip.PacketsSent.Value(),          // OutRequests.
 			ip.OutgoingPacketErrors.Value(), // OutDiscards.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragOKs.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragFails.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragCreates.
+			0,                               // Ip/OutNoRoutes.
+			0,                               // Support Ip/ReasmTimeout.
+			0,                               // Support Ip/ReasmReqds.
+			0,                               // Support Ip/ReasmOKs.
+			0,                               // Support Ip/ReasmFails.
+			0,                               // Support Ip/FragOKs.
+			0,                               // Support Ip/FragFails.
+			0,                               // Support Ip/FragCreates.
 		}
 	case *inet.StatSNMPICMP:
 		in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats
 		out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPICMP{
-			0, // TODO(gvisor.dev/issue/969): Support Icmp/InMsgs.
+			0, // Icmp/InMsgs.
 			Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors.
-			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/InCsumErrors.
+			0,                         // Icmp/InCsumErrors.
 			in.DstUnreachable.Value(), // InDestUnreachs.
 			in.TimeExceeded.Value(),   // InTimeExcds.
 			in.ParamProblem.Value(),   // InParmProbs.
@@ -241,7 +270,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 			in.TimestampReply.Value(), // InTimestampReps.
 			in.InfoRequest.Value(),    // InAddrMasks.
 			in.InfoReply.Value(),      // InAddrMaskReps.
-			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/OutMsgs.
+			0,                         // Icmp/OutMsgs.
 			Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors.
 			out.DstUnreachable.Value(),                     // OutDestUnreachs.
 			out.TimeExceeded.Value(),                       // OutTimeExcds.
@@ -277,15 +306,16 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 		}
 	case *inet.StatSNMPUDP:
 		udp := Metrics.UDP
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPUDP{
 			udp.PacketsReceived.Value(),     // InDatagrams.
 			udp.UnknownPortErrors.Value(),   // NoPorts.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InErrors.
+			0,                               // Udp/InErrors.
 			udp.PacketsSent.Value(),         // OutDatagrams.
 			udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/SndbufErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InCsumErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/IgnoredMulti.
+			0,                               // Udp/SndbufErrors.
+			0,                               // Udp/InCsumErrors.
+			0,                               // Udp/IgnoredMulti.
 		}
 	default:
 		return syserr.ErrEndpointOperation.ToError()
@@ -332,14 +362,13 @@ func (s *Stack) RouteTable() []inet.Route {
 }
 
 // IPTables returns the stack's iptables.
-func (s *Stack) IPTables() (iptables.IPTables, error) {
+func (s *Stack) IPTables() (*stack.IPTables, error) {
 	return s.Stack.IPTables(), nil
 }
 
-// FillDefaultIPTables sets the stack's iptables to the default tables, which
-// allow and do not modify all traffic.
-func (s *Stack) FillDefaultIPTables() {
-	netfilter.FillDefaultIPTables(s.Stack)
+// FillIPTablesMetadata populates stack's IPTables with metadata.
+func (s *Stack) FillIPTablesMetadata() {
+	netfilter.FillIPTablesMetadata(s.Stack)
 }
 
 // Resume implements inet.Stack.Resume.
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 50d9744e6..6580bd6e9 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -48,11 +49,25 @@ func (c *ControlMessages) Release() {
 	c.Unix.Release()
 }
 
-// Socket is the interface containing socket syscalls used by the syscall layer
-// to redirect them to the appropriate implementation.
+// Socket is an interface combining fs.FileOperations and SocketOps,
+// representing a VFS1 socket file.
 type Socket interface {
 	fs.FileOperations
+	SocketOps
+}
+
+// SocketVFS2 is an interface combining vfs.FileDescription and SocketOps,
+// representing a VFS2 socket file.
+type SocketVFS2 interface {
+	vfs.FileDescriptionImpl
+	SocketOps
+}
 
+// SocketOps is the interface containing socket syscalls used by the syscall
+// layer to redirect them to the appropriate implementation.
+//
+// It is implemented by both Socket and SocketVFS2.
+type SocketOps interface {
 	// Connect implements the connect(2) linux syscall.
 	Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error
 
@@ -153,6 +168,8 @@ var families = make(map[int][]Provider)
 // RegisterProvider registers the provider of a given address family so that
 // sockets of that type can be created via socket() and/or socketpair()
 // syscalls.
+//
+// This should only be called during the initialization of the address family.
 func RegisterProvider(family int, provider Provider) {
 	families[family] = append(families[family], provider)
 }
@@ -216,6 +233,74 @@ func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent {
 	return fs.NewDirent(ctx, inode, fmt.Sprintf("socket:[%d]", ino))
 }
 
+// ProviderVFS2 is the vfs2 interface implemented by providers of sockets for
+// specific address families (e.g., AF_INET).
+type ProviderVFS2 interface {
+	// Socket creates a new socket.
+	//
+	// If a nil Socket _and_ a nil error is returned, it means that the
+	// protocol is not supported. A non-nil error should only be returned
+	// if the protocol is supported, but an error occurs during creation.
+	Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error)
+
+	// Pair creates a pair of connected sockets.
+	//
+	// See Socket for error information.
+	Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error)
+}
+
+// familiesVFS2 holds a map of all known address families and their providers.
+var familiesVFS2 = make(map[int][]ProviderVFS2)
+
+// RegisterProviderVFS2 registers the provider of a given address family so that
+// sockets of that type can be created via socket() and/or socketpair()
+// syscalls.
+//
+// This should only be called during the initialization of the address family.
+func RegisterProviderVFS2(family int, provider ProviderVFS2) {
+	familiesVFS2[family] = append(familiesVFS2[family], provider)
+}
+
+// NewVFS2 creates a new socket with the given family, type and protocol.
+func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+	for _, p := range familiesVFS2[family] {
+		s, err := p.Socket(t, stype, protocol)
+		if err != nil {
+			return nil, err
+		}
+		if s != nil {
+			t.Kernel().RecordSocketVFS2(s)
+			return s, nil
+		}
+	}
+
+	return nil, syserr.ErrAddressFamilyNotSupported
+}
+
+// PairVFS2 creates a new connected socket pair with the given family, type and
+// protocol.
+func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+	providers, ok := familiesVFS2[family]
+	if !ok {
+		return nil, nil, syserr.ErrAddressFamilyNotSupported
+	}
+
+	for _, p := range providers {
+		s1, s2, err := p.Pair(t, stype, protocol)
+		if err != nil {
+			return nil, nil, err
+		}
+		if s1 != nil && s2 != nil {
+			k := t.Kernel()
+			k.RecordSocketVFS2(s1)
+			k.RecordSocketVFS2(s2)
+			return s1, s2, nil
+		}
+	}
+
+	return nil, nil, syserr.ErrSocketNotSupported
+}
+
 // SendReceiveTimeout stores timeouts for send and receive calls.
 //
 // It is meant to be embedded into Socket implementations to help satisfy the
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index 08743deba..de2cc4bdf 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -8,23 +8,27 @@ go_library(
         "device.go",
         "io.go",
         "unix.go",
+        "unix_vfs2.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/fspath",
         "//pkg/refs",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 74bcd6300..c708b6030 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -30,6 +30,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/ilist",
+        "//pkg/log",
         "//pkg/refs",
         "//pkg/sync",
         "//pkg/syserr",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index ce5b94ee7..09c6d3b27 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -252,7 +252,7 @@ func (e *connectionedEndpoint) Close() {
 // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
 func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error {
 	if ce.Type() != e.stype {
-		return syserr.ErrConnectionRefused
+		return syserr.ErrWrongProtocolForSocket
 	}
 
 	// Check if ce is e to avoid a deadlock.
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 2ef654235..2f1b127df 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -20,6 +20,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -838,24 +839,43 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
 
 // SetSockOpt sets a socket option. Currently not supported.
 func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch v := opt.(type) {
-	case tcpip.PasscredOption:
-		e.setPasscred(v != 0)
-		return nil
-	}
 	return nil
 }
 
 func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	switch opt {
+	case tcpip.BroadcastOption:
+	case tcpip.PasscredOption:
+		e.setPasscred(v)
+	case tcpip.ReuseAddressOption:
+	default:
+		log.Warningf("Unsupported socket option: %d", opt)
+	}
 	return nil
 }
 
 func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	switch opt {
+	case tcpip.SendBufferSizeOption:
+	case tcpip.ReceiveBufferSizeOption:
+	default:
+		log.Warningf("Unsupported socket option: %d", opt)
+	}
 	return nil
 }
 
 func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	return false, tcpip.ErrUnknownProtocolOption
+	switch opt {
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	case tcpip.PasscredOption:
+		return e.Passcred(), nil
+
+	default:
+		log.Warningf("Unsupported socket option: %d", opt)
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
@@ -914,29 +934,19 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		return int(v), nil
 
 	default:
+		log.Warningf("Unsupported socket option: %d", opt)
 		return -1, tcpip.ErrUnknownProtocolOption
 	}
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
+	switch opt.(type) {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.PasscredOption:
-		if e.Passcred() {
-			*o = tcpip.PasscredOption(1)
-		} else {
-			*o = tcpip.PasscredOption(0)
-		}
-		return nil
-
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
 	default:
+		log.Warningf("Unsupported socket option: %T", opt)
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 4d30aa714..c4c9db81b 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -23,6 +23,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -33,6 +34,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -52,11 +54,8 @@ type SocketOperations struct {
 	fsutil.FileNoSplice             `state:"nosave"`
 	fsutil.FileNoopFlush            `state:"nosave"`
 	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
-	refs.AtomicRefCount
-	socket.SendReceiveTimeout
 
-	ep    transport.Endpoint
-	stype linux.SockType
+	socketOpsCommon
 }
 
 // New creates a new unix socket.
@@ -75,29 +74,42 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
 	}
 
 	s := SocketOperations{
-		ep:    ep,
-		stype: stype,
+		socketOpsCommon: socketOpsCommon{
+			ep:    ep,
+			stype: stype,
+		},
 	}
 	s.EnableLeakCheck("unix.SocketOperations")
 
 	return fs.NewFile(ctx, d, flags, &s)
 }
 
+// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
+//
+// +stateify savable
+type socketOpsCommon struct {
+	refs.AtomicRefCount
+	socket.SendReceiveTimeout
+
+	ep    transport.Endpoint
+	stype linux.SockType
+}
+
 // DecRef implements RefCounter.DecRef.
-func (s *SocketOperations) DecRef() {
+func (s *socketOpsCommon) DecRef() {
 	s.DecRefWithDestructor(func() {
 		s.ep.Close()
 	})
 }
 
 // Release implemements fs.FileOperations.Release.
-func (s *SocketOperations) Release() {
+func (s *socketOpsCommon) Release() {
 	// Release only decrements a reference on s because s may be referenced in
 	// the abstract socket namespace.
 	s.DecRef()
 }
 
-func (s *SocketOperations) isPacket() bool {
+func (s *socketOpsCommon) isPacket() bool {
 	switch s.stype {
 	case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
 		return true
@@ -110,7 +122,7 @@ func (s *SocketOperations) isPacket() bool {
 }
 
 // Endpoint extracts the transport.Endpoint.
-func (s *SocketOperations) Endpoint() transport.Endpoint {
+func (s *socketOpsCommon) Endpoint() transport.Endpoint {
 	return s.ep
 }
 
@@ -143,7 +155,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) {
 
 // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.ep.GetRemoteAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -155,7 +167,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32,
 
 // GetSockName implements the linux syscall getsockname(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
 	addr, err := s.ep.GetLocalAddress()
 	if err != nil {
 		return nil, 0, syserr.TranslateNetstackError(err)
@@ -178,7 +190,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us
 
 // Listen implements the linux syscall listen(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error {
+func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
 	return s.ep.Listen(backlog)
 }
 
@@ -310,6 +322,11 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
 			}
 
 			// Create the socket.
+			//
+			// Note that the file permissions here are not set correctly (see
+			// gvisor.dev/issue/2324). There is no convenient way to get permissions
+			// on the socket referred to by s, so we will leave this discrepancy
+			// unresolved until VFS2 replaces this code.
 			childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}})
 			if err != nil {
 				return syserr.ErrPortInUse
@@ -345,6 +362,31 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
 		return ep, nil
 	}
 
+	if kernel.VFS2Enabled {
+		p := fspath.Parse(path)
+		root := t.FSContext().RootDirectoryVFS2()
+		start := root
+		relPath := !p.Absolute
+		if relPath {
+			start = t.FSContext().WorkingDirectoryVFS2()
+		}
+		pop := vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               p,
+			FollowFinalSymlink: true,
+		}
+		ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path})
+		root.DecRef()
+		if relPath {
+			start.DecRef()
+		}
+		if e != nil {
+			return nil, syserr.FromError(e)
+		}
+		return ep, nil
+	}
+
 	// Find the node in the filesystem.
 	root := t.FSContext().RootDirectory()
 	cwd := t.FSContext().WorkingDirectory()
@@ -363,12 +405,11 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint,
 		// No socket!
 		return nil, syserr.ErrConnectionRefused
 	}
-
 	return ep, nil
 }
 
 // Connect implements the linux syscall connect(2) for unix sockets.
-func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
+func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
 	ep, err := extractEndpoint(t, sockaddr)
 	if err != nil {
 		return err
@@ -376,10 +417,21 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 	defer ep.Release()
 
 	// Connect the server endpoint.
-	return s.ep.Connect(t, ep)
+	err = s.ep.Connect(t, ep)
+
+	if err == syserr.ErrWrongProtocolForSocket {
+		// Linux for abstract sockets returns ErrConnectionRefused
+		// instead of ErrWrongProtocolForSocket.
+		path, _ := extractPath(sockaddr)
+		if len(path) > 0 && path[0] == 0 {
+			err = syserr.ErrConnectionRefused
+		}
+	}
+
+	return err
 }
 
-// Writev implements fs.FileOperations.Write.
+// Write implements fs.FileOperations.Write.
 func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
 	t := kernel.TaskFromContext(ctx)
 	ctrl := control.New(t, s.ep, nil)
@@ -399,7 +451,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
 
 // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
+func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
 	w := EndpointWriter{
 		Ctx:      t,
 		Endpoint: s.ep,
@@ -453,27 +505,27 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 }
 
 // Passcred implements transport.Credentialer.Passcred.
-func (s *SocketOperations) Passcred() bool {
+func (s *socketOpsCommon) Passcred() bool {
 	return s.ep.Passcred()
 }
 
 // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
-func (s *SocketOperations) ConnectedPasscred() bool {
+func (s *socketOpsCommon) ConnectedPasscred() bool {
 	return s.ep.ConnectedPasscred()
 }
 
 // Readiness implements waiter.Waitable.Readiness.
-func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return s.ep.Readiness(mask)
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
-func (s *SocketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 	s.ep.EventRegister(e, mask)
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
-func (s *SocketOperations) EventUnregister(e *waiter.Entry) {
+func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
 	s.ep.EventUnregister(e)
 }
 
@@ -485,7 +537,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
 
 // Shutdown implements the linux syscall shutdown(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
+func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
 	f, err := netstack.ConvertShutdown(how)
 	if err != nil {
 		return err
@@ -511,7 +563,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 
 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
 // a transport.Endpoint.
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
 	trunc := flags&linux.MSG_TRUNC != 0
 	peek := flags&linux.MSG_PEEK != 0
 	dontWait := flags&linux.MSG_DONTWAIT != 0
@@ -648,12 +700,12 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 }
 
 // State implements socket.Socket.State.
-func (s *SocketOperations) State() uint32 {
+func (s *socketOpsCommon) State() uint32 {
 	return s.ep.State()
 }
 
 // Type implements socket.Socket.Type.
-func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
+func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
 	// Unix domain sockets always have a protocol of 0.
 	return linux.AF_UNIX, s.stype, 0
 }
@@ -706,4 +758,5 @@ func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.F
 
 func init() {
 	socket.RegisterProvider(linux.AF_UNIX, &provider{})
+	socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{})
 }
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
new file mode 100644
index 000000000..45e109361
--- /dev/null
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -0,0 +1,358 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package unix
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// SocketVFS2 implements socket.SocketVFS2 (and by extension,
+// vfs.FileDescriptionImpl) for Unix sockets.
+type SocketVFS2 struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	socketOpsCommon
+}
+
+var _ = socket.SocketVFS2(&SocketVFS2{})
+
+// NewSockfsFile creates a new socket file in the global sockfs mount and
+// returns a corresponding file description.
+func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) {
+	mnt := t.Kernel().SocketMount()
+	d := sockfs.NewDentry(t.Credentials(), mnt)
+
+	fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d)
+	if err != nil {
+		return nil, syserr.FromError(err)
+	}
+	return fd, nil
+}
+
+// NewFileDescription creates and returns a socket file description
+// corresponding to the given mount and dentry.
+func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) {
+	// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
+	// SOCK_DGRAM and don't require CAP_NET_RAW.
+	if stype == linux.SOCK_RAW {
+		stype = linux.SOCK_DGRAM
+	}
+
+	sock := &SocketVFS2{
+		socketOpsCommon: socketOpsCommon{
+			ep:    ep,
+			stype: stype,
+		},
+	}
+	vfsfd := &sock.vfsfd
+	if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return vfsfd, nil
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
+	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
+}
+
+// blockingAccept implements a blocking version of accept(2), that is, if no
+// connections are ready to be accept, it will block until one becomes ready.
+func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) {
+	// Register for notifications.
+	e, ch := waiter.NewChannelEntry(nil)
+	s.socketOpsCommon.EventRegister(&e, waiter.EventIn)
+	defer s.socketOpsCommon.EventUnregister(&e)
+
+	// Try to accept the connection; if it fails, then wait until we get a
+	// notification.
+	for {
+		if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock {
+			return ep, err
+		}
+
+		if err := t.Block(ch); err != nil {
+			return nil, syserr.FromError(err)
+		}
+	}
+}
+
+// Accept implements the linux syscall accept(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+	// Issue the accept request to get the new endpoint.
+	ep, err := s.ep.Accept()
+	if err != nil {
+		if err != syserr.ErrWouldBlock || !blocking {
+			return 0, nil, 0, err
+		}
+
+		var err *syserr.Error
+		ep, err = s.blockingAccept(t)
+		if err != nil {
+			return 0, nil, 0, err
+		}
+	}
+
+	ns, err := NewSockfsFile(t, ep, s.stype)
+	if err != nil {
+		return 0, nil, 0, err
+	}
+	defer ns.DecRef()
+
+	if flags&linux.SOCK_NONBLOCK != 0 {
+		ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK)
+	}
+
+	var addr linux.SockAddr
+	var addrLen uint32
+	if peerRequested {
+		// Get address of the peer.
+		var err *syserr.Error
+		addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t)
+		if err != nil {
+			return 0, nil, 0, err
+		}
+	}
+
+	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
+		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
+	})
+	if e != nil {
+		return 0, nil, 0, syserr.FromError(e)
+	}
+
+	t.Kernel().RecordSocketVFS2(ns)
+	return fd, addr, addrLen, nil
+}
+
+// Bind implements the linux syscall bind(2) for unix sockets.
+func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	p, e := extractPath(sockaddr)
+	if e != nil {
+		return e
+	}
+
+	bep, ok := s.ep.(transport.BoundEndpoint)
+	if !ok {
+		// This socket can't be bound.
+		return syserr.ErrInvalidArgument
+	}
+
+	return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error {
+		// Is it abstract?
+		if p[0] == 0 {
+			if t.IsNetworkNamespaced() {
+				return syserr.ErrInvalidEndpointState
+			}
+			if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil {
+				// syserr.ErrPortInUse corresponds to EADDRINUSE.
+				return syserr.ErrPortInUse
+			}
+		} else {
+			path := fspath.Parse(p)
+			root := t.FSContext().RootDirectoryVFS2()
+			defer root.DecRef()
+			start := root
+			relPath := !path.Absolute
+			if relPath {
+				start = t.FSContext().WorkingDirectoryVFS2()
+				defer start.DecRef()
+			}
+			pop := vfs.PathOperation{
+				Root:  root,
+				Start: start,
+				Path:  path,
+			}
+			stat, err := s.vfsfd.Stat(t, vfs.StatOptions{Mask: linux.STATX_MODE})
+			if err != nil {
+				return syserr.FromError(err)
+			}
+			err = t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{
+				// File permissions correspond to net/unix/af_unix.c:unix_bind.
+				Mode:     linux.FileMode(linux.S_IFSOCK | uint(stat.Mode)&^t.FSContext().Umask()),
+				Endpoint: bep,
+			})
+			if err == syserror.EEXIST {
+				return syserr.ErrAddressInUse
+			}
+			return syserr.FromError(err)
+		}
+
+		return nil
+	})
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return netstack.Ioctl(ctx, s.ep, uio, args)
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Read implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	return dst.CopyOutFrom(ctx, &EndpointReader{
+		Ctx:       ctx,
+		Endpoint:  s.ep,
+		NumRights: 0,
+		Peek:      false,
+		From:      nil,
+	})
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// All flags other than RWF_NOWAIT should be ignored.
+	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
+	if opts.Flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	t := kernel.TaskFromContext(ctx)
+	ctrl := control.New(t, s.ep, nil)
+
+	if src.NumBytes() == 0 {
+		nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil)
+		return int64(nInt), err.ToError()
+	}
+
+	return src.CopyInTo(ctx, &EndpointWriter{
+		Ctx:      ctx,
+		Endpoint: s.ep,
+		Control:  ctrl,
+		To:       nil,
+	})
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return s.socketOpsCommon.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	s.socketOpsCommon.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (s *SocketVFS2) EventUnregister(e *waiter.Entry) {
+	s.socketOpsCommon.EventUnregister(e)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
+// a transport.Endpoint.
+func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
+	return netstack.SetSockOpt(t, s, s.ep, level, name, optVal)
+}
+
+// providerVFS2 is a unix domain socket provider for VFS2.
+type providerVFS2 struct{}
+
+func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
+	// Check arguments.
+	if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
+		return nil, syserr.ErrProtocolNotSupported
+	}
+
+	// Create the endpoint and socket.
+	var ep transport.Endpoint
+	switch stype {
+	case linux.SOCK_DGRAM, linux.SOCK_RAW:
+		ep = transport.NewConnectionless(t)
+	case linux.SOCK_SEQPACKET, linux.SOCK_STREAM:
+		ep = transport.NewConnectioned(t, stype, t.Kernel())
+	default:
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	f, err := NewSockfsFile(t, ep, stype)
+	if err != nil {
+		ep.Close()
+		return nil, err
+	}
+	return f, nil
+}
+
+// Pair creates a new pair of AF_UNIX connected sockets.
+func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
+	// Check arguments.
+	if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
+		return nil, nil, syserr.ErrProtocolNotSupported
+	}
+
+	switch stype {
+	case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW:
+		// Ok
+	default:
+		return nil, nil, syserr.ErrInvalidArgument
+	}
+
+	// Create the endpoints and sockets.
+	ep1, ep2 := transport.NewPair(t, stype, t.Kernel())
+	s1, err := NewSockfsFile(t, ep1, stype)
+	if err != nil {
+		ep1.Close()
+		ep2.Close()
+		return nil, nil, err
+	}
+	s2, err := NewSockfsFile(t, ep2, stype)
+	if err != nil {
+		s1.DecRef()
+		ep2.Close()
+		return nil, nil, err
+	}
+
+	return s1, s2, nil
+}
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 2f39a6f2b..88d5db9fc 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "capability.go",
         "clone.go",
+        "epoll.go",
         "futex.go",
         "linux64_amd64.go",
         "linux64_arm64.go",
diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go
new file mode 100644
index 000000000..a6e48b836
--- /dev/null
+++ b/pkg/sentry/strace/epoll.go
@@ -0,0 +1,89 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package strace
+
+import (
+	"fmt"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string {
+	var e linux.EpollEvent
+	if _, err := t.CopyIn(eventAddr, &e); err != nil {
+		return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err)
+	}
+	var sb strings.Builder
+	fmt.Fprintf(&sb, "%#x ", eventAddr)
+	writeEpollEvent(&sb, e)
+	return sb.String()
+}
+
+func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes uint64) string {
+	var sb strings.Builder
+	fmt.Fprintf(&sb, "%#x {", eventsAddr)
+	addr := eventsAddr
+	for i := uint64(0); i < numEvents; i++ {
+		var e linux.EpollEvent
+		if _, err := t.CopyIn(addr, &e); err != nil {
+			fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err)
+			continue
+		}
+		writeEpollEvent(&sb, e)
+		if uint64(sb.Len()) >= maxBytes {
+			sb.WriteString("...")
+			break
+		}
+		if _, ok := addr.AddLength(uint64(linux.SizeOfEpollEvent)); !ok {
+			fmt.Fprintf(&sb, "{error reading event at %#x: EFAULT}", addr)
+			continue
+		}
+	}
+	sb.WriteString("}")
+	return sb.String()
+}
+
+func writeEpollEvent(sb *strings.Builder, e linux.EpollEvent) {
+	events := epollEventEvents.Parse(uint64(e.Events))
+	fmt.Fprintf(sb, "{events=%s data=[%#x, %#x]}", events, e.Data[0], e.Data[1])
+}
+
+var epollCtlOps = abi.ValueSet{
+	linux.EPOLL_CTL_ADD: "EPOLL_CTL_ADD",
+	linux.EPOLL_CTL_DEL: "EPOLL_CTL_DEL",
+	linux.EPOLL_CTL_MOD: "EPOLL_CTL_MOD",
+}
+
+var epollEventEvents = abi.FlagSet{
+	{Flag: linux.EPOLLIN, Name: "EPOLLIN"},
+	{Flag: linux.EPOLLPRI, Name: "EPOLLPRI"},
+	{Flag: linux.EPOLLOUT, Name: "EPOLLOUT"},
+	{Flag: linux.EPOLLERR, Name: "EPOLLERR"},
+	{Flag: linux.EPOLLHUP, Name: "EPULLHUP"},
+	{Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"},
+	{Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"},
+	{Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"},
+	{Flag: linux.EPOLLWRBAND, Name: "EPOLLWRBAND"},
+	{Flag: linux.EPOLLMSG, Name: "EPOLLMSG"},
+	{Flag: linux.EPOLLRDHUP, Name: "EPOLLRDHUP"},
+	{Flag: linux.EPOLLEXCLUSIVE, Name: "EPOLLEXCLUSIVE"},
+	{Flag: linux.EPOLLWAKEUP, Name: "EPOLLWAKEUP"},
+	{Flag: linux.EPOLLONESHOT, Name: "EPOLLONESHOT"},
+	{Flag: linux.EPOLLET, Name: "EPOLLET"},
+}
diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index a4de545e9..71b92eaee 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -256,8 +256,8 @@ var linuxAMD64 = SyscallMap{
 	229: makeSyscallInfo("clock_getres", Hex, PostTimespec),
 	230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec),
 	231: makeSyscallInfo("exit_group", Hex),
-	232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex),
-	233: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
+	232: makeSyscallInfo("epoll_wait", FD, EpollEvents, Hex, Hex),
+	233: makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent),
 	234: makeSyscallInfo("tgkill", Hex, Hex, Signal),
 	235: makeSyscallInfo("utimes", Path, Timeval),
 	// 236: vserver (not implemented in the Linux kernel)
@@ -305,7 +305,7 @@ var linuxAMD64 = SyscallMap{
 	278: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex),
 	279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex),
 	280: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex),
-	281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+	281: makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex),
 	282: makeSyscallInfo("signalfd", Hex, Hex, Hex),
 	283: makeSyscallInfo("timerfd_create", Hex, Hex),
 	284: makeSyscallInfo("eventfd", Hex),
diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go
index 8bc38545f..bd7361a52 100644
--- a/pkg/sentry/strace/linux64_arm64.go
+++ b/pkg/sentry/strace/linux64_arm64.go
@@ -45,8 +45,8 @@ var linuxARM64 = SyscallMap{
 	18:  makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex),
 	19:  makeSyscallInfo("eventfd2", Hex, Hex),
 	20:  makeSyscallInfo("epoll_create1", Hex),
-	21:  makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
-	22:  makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+	21:  makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent),
+	22:  makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex),
 	23:  makeSyscallInfo("dup", FD),
 	24:  makeSyscallInfo("dup3", FD, FD, Hex),
 	25:  makeSyscallInfo("fcntl", FD, Hex, Hex),
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 51e6d81b2..c0512de89 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -632,4 +632,13 @@ var sockOptNames = map[uint64]abi.ValueSet{
 		linux.MCAST_MSFILTER:           "MCAST_MSFILTER",
 		linux.IPV6_ADDRFORM:            "IPV6_ADDRFORM",
 	},
+	linux.SOL_NETLINK: {
+		linux.NETLINK_BROADCAST_ERROR:  "NETLINK_BROADCAST_ERROR",
+		linux.NETLINK_CAP_ACK:          "NETLINK_CAP_ACK",
+		linux.NETLINK_DUMP_STRICT_CHK:  "NETLINK_DUMP_STRICT_CHK",
+		linux.NETLINK_EXT_ACK:          "NETLINK_EXT_ACK",
+		linux.NETLINK_LIST_MEMBERSHIPS: "NETLINK_LIST_MEMBERSHIPS",
+		linux.NETLINK_NO_ENOBUFS:       "NETLINK_NO_ENOBUFS",
+		linux.NETLINK_PKTINFO:          "NETLINK_PKTINFO",
+	},
 }
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index a796b2396..68ca537c8 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -141,6 +141,10 @@ func path(t *kernel.Task, addr usermem.Addr) string {
 }
 
 func fd(t *kernel.Task, fd int32) string {
+	if kernel.VFS2Enabled {
+		return fdVFS2(t, fd)
+	}
+
 	root := t.FSContext().RootDirectory()
 	if root != nil {
 		defer root.DecRef()
@@ -169,6 +173,30 @@ func fd(t *kernel.Task, fd int32) string {
 	return fmt.Sprintf("%#x %s", fd, name)
 }
 
+func fdVFS2(t *kernel.Task, fd int32) string {
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+
+	vfsObj := root.Mount().Filesystem().VirtualFilesystem()
+	if fd == linux.AT_FDCWD {
+		wd := t.FSContext().WorkingDirectoryVFS2()
+		defer wd.DecRef()
+
+		name, _ := vfsObj.PathnameWithDeleted(t, root, wd)
+		return fmt.Sprintf("AT_FDCWD %s", name)
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		// Cast FD to uint64 to avoid printing negative hex.
+		return fmt.Sprintf("%#x (bad FD)", uint64(fd))
+	}
+	defer file.DecRef()
+
+	name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry())
+	return fmt.Sprintf("%#x %s", fd, name)
+}
+
 func fdpair(t *kernel.Task, addr usermem.Addr) string {
 	var fds [2]int32
 	_, err := t.CopyIn(addr, &fds)
@@ -453,6 +481,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 			output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer()))
 		case PollFDs:
 			output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false))
+		case EpollCtlOp:
+			output = append(output, epollCtlOps.Parse(uint64(args[arg].Int())))
+		case EpollEvent:
+			output = append(output, epollEvent(t, args[arg].Pointer()))
+		case EpollEvents:
+			output = append(output, epollEvents(t, args[arg].Pointer(), 0 /* numEvents */, uint64(maximumBlobSize)))
 		case SelectFDSet:
 			output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer()))
 		case Oct:
@@ -521,6 +555,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint
 			output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer())
 		case PollFDs:
 			output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true)
+		case EpollEvents:
+			output[arg] = epollEvents(t, args[arg].Pointer(), uint64(rval), uint64(maximumBlobSize))
 		case GetSockOptVal:
 			output[arg] = getSockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Pointer() /* optLen */, maximumBlobSize, rval)
 		case SetSockOptVal:
@@ -683,7 +719,7 @@ func (s SyscallMap) SyscallEnter(t *kernel.Task, sysno uintptr, args arch.Syscal
 // SyscallExit implements kernel.Stracer.SyscallExit. It logs the syscall
 // exit trace.
 func (s SyscallMap) SyscallExit(context interface{}, t *kernel.Task, sysno, rval uintptr, err error) {
-	errno := t.ExtractErrno(err, int(sysno))
+	errno := kernel.ExtractErrno(err, int(sysno))
 	c := context.(*syscallContext)
 
 	elapsed := time.Since(c.start)
@@ -742,9 +778,6 @@ func (s SyscallMap) Name(sysno uintptr) string {
 //
 // N.B. This is not in an init function because we can't be sure all syscall
 // tables are registered with the kernel when init runs.
-//
-// TODO(gvisor.dev/issue/155): remove kernel package dependencies from this
-// package and have the kernel package self-initialize all syscall tables.
 func Initialize() {
 	for _, table := range kernel.SyscallTables() {
 		// Is this known?
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index 446d1e0f6..7e69b9279 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -228,6 +228,16 @@ const (
 	// SockOptLevel is the optname argument in getsockopt(2) and
 	// setsockopt(2).
 	SockOptName
+
+	// EpollCtlOp is the op argument to epoll_ctl(2).
+	EpollCtlOp
+
+	// EpollEvent is the event argument in epoll_ctl(2).
+	EpollEvent
+
+	// EpollEvents is an array of struct epoll_event. It is the events
+	// argument in epoll_wait(2)/epoll_pwait(2).
+	EpollEvents
 )
 
 // defaultFormat is the syscall argument format to use if the actual format is
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go
index 87dcad18b..d9fb808c0 100644
--- a/pkg/sentry/syscalls/epoll.go
+++ b/pkg/sentry/syscalls/epoll.go
@@ -17,6 +17,7 @@ package syscalls
 import (
 	"time"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -118,7 +119,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error {
 }
 
 // WaitEpoll implements the epoll_wait(2) linux syscall.
-func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]epoll.Event, error) {
+func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEvent, error) {
 	// Get epoll from the file descriptor.
 	epollfile := t.GetFile(fd)
 	if epollfile == nil {
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index be16ee686..217fcfef2 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -8,8 +8,6 @@ go_library(
         "error.go",
         "flags.go",
         "linux64.go",
-        "linux64_amd64.go",
-        "linux64_arm64.go",
         "sigset.go",
         "sys_aio.go",
         "sys_capability.go",
@@ -51,7 +49,8 @@ go_library(
         "sys_time.go",
         "sys_timer.go",
         "sys_timerfd.go",
-        "sys_tls.go",
+        "sys_tls_amd64.go",
+        "sys_tls_arm64.go",
         "sys_utsname.go",
         "sys_write.go",
         "sys_xattr.go",
@@ -74,6 +73,7 @@ go_library(
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/epoll",
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 68589a377..ea4f9b1a7 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -15,6 +15,16 @@
 // Package linux provides syscall tables for amd64 Linux.
 package linux
 
+import (
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
 const (
 	// LinuxSysname is the OS name advertised by gVisor.
 	LinuxSysname = "Linux"
@@ -25,3 +35,702 @@ const (
 	// LinuxVersion is the version info advertised by gVisor.
 	LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016"
 )
+
+// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
+// numbers from Linux 4.4.
+var AMD64 = &kernel.SyscallTable{
+	OS:   abi.Linux,
+	Arch: arch.AMD64,
+	Version: kernel.Version{
+		// Version 4.4 is chosen as a stable, longterm version of Linux, which
+		// guides the interface provided by this syscall table. The build
+		// version is that for a clean build with default kernel config, at 5
+		// minutes after v4.4 was tagged.
+		Sysname: LinuxSysname,
+		Release: LinuxRelease,
+		Version: LinuxVersion,
+	},
+	AuditNumber: linux.AUDIT_ARCH_X86_64,
+	Table: map[uintptr]kernel.Syscall{
+		0:   syscalls.Supported("read", Read),
+		1:   syscalls.Supported("write", Write),
+		2:   syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil),
+		3:   syscalls.Supported("close", Close),
+		4:   syscalls.Supported("stat", Stat),
+		5:   syscalls.Supported("fstat", Fstat),
+		6:   syscalls.Supported("lstat", Lstat),
+		7:   syscalls.Supported("poll", Poll),
+		8:   syscalls.Supported("lseek", Lseek),
+		9:   syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+		10:  syscalls.Supported("mprotect", Mprotect),
+		11:  syscalls.Supported("munmap", Munmap),
+		12:  syscalls.Supported("brk", Brk),
+		13:  syscalls.Supported("rt_sigaction", RtSigaction),
+		14:  syscalls.Supported("rt_sigprocmask", RtSigprocmask),
+		15:  syscalls.Supported("rt_sigreturn", RtSigreturn),
+		16:  syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
+		17:  syscalls.Supported("pread64", Pread64),
+		18:  syscalls.Supported("pwrite64", Pwrite64),
+		19:  syscalls.Supported("readv", Readv),
+		20:  syscalls.Supported("writev", Writev),
+		21:  syscalls.Supported("access", Access),
+		22:  syscalls.Supported("pipe", Pipe),
+		23:  syscalls.Supported("select", Select),
+		24:  syscalls.Supported("sched_yield", SchedYield),
+		25:  syscalls.Supported("mremap", Mremap),
+		26:  syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
+		27:  syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
+		28:  syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
+		29:  syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
+		30:  syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
+		31:  syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
+		32:  syscalls.Supported("dup", Dup),
+		33:  syscalls.Supported("dup2", Dup2),
+		34:  syscalls.Supported("pause", Pause),
+		35:  syscalls.Supported("nanosleep", Nanosleep),
+		36:  syscalls.Supported("getitimer", Getitimer),
+		37:  syscalls.Supported("alarm", Alarm),
+		38:  syscalls.Supported("setitimer", Setitimer),
+		39:  syscalls.Supported("getpid", Getpid),
+		40:  syscalls.Supported("sendfile", Sendfile),
+		41:  syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
+		42:  syscalls.Supported("connect", Connect),
+		43:  syscalls.Supported("accept", Accept),
+		44:  syscalls.Supported("sendto", SendTo),
+		45:  syscalls.Supported("recvfrom", RecvFrom),
+		46:  syscalls.Supported("sendmsg", SendMsg),
+		47:  syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
+		48:  syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
+		49:  syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
+		50:  syscalls.Supported("listen", Listen),
+		51:  syscalls.Supported("getsockname", GetSockName),
+		52:  syscalls.Supported("getpeername", GetPeerName),
+		53:  syscalls.Supported("socketpair", SocketPair),
+		54:  syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
+		55:  syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
+		56:  syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
+		57:  syscalls.Supported("fork", Fork),
+		58:  syscalls.Supported("vfork", Vfork),
+		59:  syscalls.Supported("execve", Execve),
+		60:  syscalls.Supported("exit", Exit),
+		61:  syscalls.Supported("wait4", Wait4),
+		62:  syscalls.Supported("kill", Kill),
+		63:  syscalls.Supported("uname", Uname),
+		64:  syscalls.Supported("semget", Semget),
+		65:  syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
+		66:  syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+		67:  syscalls.Supported("shmdt", Shmdt),
+		68:  syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+		69:  syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+		70:  syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+		71:  syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+		72:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
+		73:  syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
+		74:  syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
+		75:  syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
+		76:  syscalls.Supported("truncate", Truncate),
+		77:  syscalls.Supported("ftruncate", Ftruncate),
+		78:  syscalls.Supported("getdents", Getdents),
+		79:  syscalls.Supported("getcwd", Getcwd),
+		80:  syscalls.Supported("chdir", Chdir),
+		81:  syscalls.Supported("fchdir", Fchdir),
+		82:  syscalls.Supported("rename", Rename),
+		83:  syscalls.Supported("mkdir", Mkdir),
+		84:  syscalls.Supported("rmdir", Rmdir),
+		85:  syscalls.Supported("creat", Creat),
+		86:  syscalls.Supported("link", Link),
+		87:  syscalls.Supported("unlink", Unlink),
+		88:  syscalls.Supported("symlink", Symlink),
+		89:  syscalls.Supported("readlink", Readlink),
+		90:  syscalls.Supported("chmod", Chmod),
+		91:  syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
+		92:  syscalls.Supported("chown", Chown),
+		93:  syscalls.Supported("fchown", Fchown),
+		94:  syscalls.Supported("lchown", Lchown),
+		95:  syscalls.Supported("umask", Umask),
+		96:  syscalls.Supported("gettimeofday", Gettimeofday),
+		97:  syscalls.Supported("getrlimit", Getrlimit),
+		98:  syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
+		99:  syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
+		100: syscalls.Supported("times", Times),
+		101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
+		102: syscalls.Supported("getuid", Getuid),
+		103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
+		104: syscalls.Supported("getgid", Getgid),
+		105: syscalls.Supported("setuid", Setuid),
+		106: syscalls.Supported("setgid", Setgid),
+		107: syscalls.Supported("geteuid", Geteuid),
+		108: syscalls.Supported("getegid", Getegid),
+		109: syscalls.Supported("setpgid", Setpgid),
+		110: syscalls.Supported("getppid", Getppid),
+		111: syscalls.Supported("getpgrp", Getpgrp),
+		112: syscalls.Supported("setsid", Setsid),
+		113: syscalls.Supported("setreuid", Setreuid),
+		114: syscalls.Supported("setregid", Setregid),
+		115: syscalls.Supported("getgroups", Getgroups),
+		116: syscalls.Supported("setgroups", Setgroups),
+		117: syscalls.Supported("setresuid", Setresuid),
+		118: syscalls.Supported("getresuid", Getresuid),
+		119: syscalls.Supported("setresgid", Setresgid),
+		120: syscalls.Supported("getresgid", Getresgid),
+		121: syscalls.Supported("getpgid", Getpgid),
+		122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+		123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+		124: syscalls.Supported("getsid", Getsid),
+		125: syscalls.Supported("capget", Capget),
+		126: syscalls.Supported("capset", Capset),
+		127: syscalls.Supported("rt_sigpending", RtSigpending),
+		128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
+		129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
+		130: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
+		131: syscalls.Supported("sigaltstack", Sigaltstack),
+		132: syscalls.Supported("utime", Utime),
+		133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil),
+		134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil),
+		135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
+		136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil),
+		137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
+		138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+		139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
+		140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
+		141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
+		142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
+		143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
+		144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
+		145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
+		146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
+		147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
+		148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
+		149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
+		154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil),
+		155: syscalls.Error("pivot_root", syserror.EPERM, "", nil),
+		156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil),
+		157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
+		158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil),
+		159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
+		160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
+		161: syscalls.Supported("chroot", Chroot),
+		162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
+		163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
+		164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
+		165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
+		166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
+		167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
+		168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
+		169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
+		170: syscalls.Supported("sethostname", Sethostname),
+		171: syscalls.Supported("setdomainname", Setdomainname),
+		172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil),
+		173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil),
+		174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil),
+		175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
+		176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
+		177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
+		178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
+		179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
+		180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+		181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
+		186: syscalls.Supported("gettid", Gettid),
+		187: syscalls.Supported("readahead", Readahead),
+		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
+		200: syscalls.Supported("tkill", Tkill),
+		201: syscalls.Supported("time", Time),
+		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
+		203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
+		204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
+		205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+		206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+		212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
+		213: syscalls.Supported("epoll_create", EpollCreate),
+		214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil),
+		215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil),
+		216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
+		217: syscalls.Supported("getdents64", Getdents64),
+		218: syscalls.Supported("set_tid_address", SetTidAddress),
+		219: syscalls.Supported("restart_syscall", RestartSyscall),
+		220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
+		221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
+		222: syscalls.Supported("timer_create", TimerCreate),
+		223: syscalls.Supported("timer_settime", TimerSettime),
+		224: syscalls.Supported("timer_gettime", TimerGettime),
+		225: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
+		226: syscalls.Supported("timer_delete", TimerDelete),
+		227: syscalls.Supported("clock_settime", ClockSettime),
+		228: syscalls.Supported("clock_gettime", ClockGettime),
+		229: syscalls.Supported("clock_getres", ClockGetres),
+		230: syscalls.Supported("clock_nanosleep", ClockNanosleep),
+		231: syscalls.Supported("exit_group", ExitGroup),
+		232: syscalls.Supported("epoll_wait", EpollWait),
+		233: syscalls.Supported("epoll_ctl", EpollCtl),
+		234: syscalls.Supported("tgkill", Tgkill),
+		235: syscalls.Supported("utimes", Utimes),
+		236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil),
+		237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
+		238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
+		239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
+		240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),         // TODO(b/29354921)
+		241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
+		242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),    // TODO(b/29354921)
+		243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+		244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
+		245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),   // TODO(b/29354921)
+		246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
+		247: syscalls.Supported("waitid", Waitid),
+		248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
+		249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
+		250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
+		251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+		252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+		253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
+		254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
+		255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+		256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
+		257: syscalls.Supported("openat", Openat),
+		258: syscalls.Supported("mkdirat", Mkdirat),
+		259: syscalls.Supported("mknodat", Mknodat),
+		260: syscalls.Supported("fchownat", Fchownat),
+		261: syscalls.Supported("futimesat", Futimesat),
+		262: syscalls.Supported("fstatat", Fstatat),
+		263: syscalls.Supported("unlinkat", Unlinkat),
+		264: syscalls.Supported("renameat", Renameat),
+		265: syscalls.Supported("linkat", Linkat),
+		266: syscalls.Supported("symlinkat", Symlinkat),
+		267: syscalls.Supported("readlinkat", Readlinkat),
+		268: syscalls.Supported("fchmodat", Fchmodat),
+		269: syscalls.Supported("faccessat", Faccessat),
+		270: syscalls.Supported("pselect", Pselect),
+		271: syscalls.Supported("ppoll", Ppoll),
+		272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
+		273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		275: syscalls.Supported("splice", Splice),
+		276: syscalls.Supported("tee", Tee),
+		277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
+		278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+		279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil),                               // requires cap_sys_nice (mostly)
+		280: syscalls.Supported("utimensat", Utimensat),
+		281: syscalls.Supported("epoll_pwait", EpollPwait),
+		282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+		283: syscalls.Supported("timerfd_create", TimerfdCreate),
+		284: syscalls.Supported("eventfd", Eventfd),
+		285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
+		286: syscalls.Supported("timerfd_settime", TimerfdSettime),
+		287: syscalls.Supported("timerfd_gettime", TimerfdGettime),
+		288: syscalls.Supported("accept4", Accept4),
+		289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+		290: syscalls.Supported("eventfd2", Eventfd2),
+		291: syscalls.Supported("epoll_create1", EpollCreate1),
+		292: syscalls.Supported("dup3", Dup3),
+		293: syscalls.Supported("pipe2", Pipe2),
+		294: syscalls.Supported("inotify_init1", InotifyInit1),
+		295: syscalls.Supported("preadv", Preadv),
+		296: syscalls.Supported("pwritev", Pwritev),
+		297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+		298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
+		299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
+		300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+		301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+		302: syscalls.Supported("prlimit64", Prlimit64),
+		303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+		304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+		305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
+		306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
+		307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
+		308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+		309: syscalls.Supported("getcpu", Getcpu),
+		310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+		311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+		312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
+		313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
+		314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+		315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+		316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}),                                           // TODO(b/118902772)
+		317: syscalls.Supported("seccomp", Seccomp),
+		318: syscalls.Supported("getrandom", GetRandom),
+		319: syscalls.Supported("memfd_create", MemfdCreate),
+		320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
+		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
+		322: syscalls.Supported("execveat", Execveat),
+		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
+		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+
+		// Syscalls implemented after 325 are "backports" from versions
+		// of Linux after 4.4.
+		326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
+		327: syscalls.Supported("preadv2", Preadv2),
+		328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+		329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+		330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+		331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
+		332: syscalls.Supported("statx", Statx),
+		333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+		334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
+
+		// Linux skips ahead to syscall 424 to sync numbers between arches.
+		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
+	},
+	Emulate: map[usermem.Addr]uintptr{
+		0xffffffffff600000: 96,  // vsyscall gettimeofday(2)
+		0xffffffffff600400: 201, // vsyscall time(2)
+		0xffffffffff600800: 309, // vsyscall getcpu(2)
+	},
+	Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, syserror.ENOSYS
+	},
+}
+
+// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall
+// numbers from Linux 4.4.
+var ARM64 = &kernel.SyscallTable{
+	OS:   abi.Linux,
+	Arch: arch.ARM64,
+	Version: kernel.Version{
+		Sysname: LinuxSysname,
+		Release: LinuxRelease,
+		Version: LinuxVersion,
+	},
+	AuditNumber: linux.AUDIT_ARCH_AARCH64,
+	Table: map[uintptr]kernel.Syscall{
+		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		11:  syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		12:  syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		13:  syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		14:  syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		15:  syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		16:  syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
+		17:  syscalls.Supported("getcwd", Getcwd),
+		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
+		19:  syscalls.Supported("eventfd2", Eventfd2),
+		20:  syscalls.Supported("epoll_create1", EpollCreate1),
+		21:  syscalls.Supported("epoll_ctl", EpollCtl),
+		22:  syscalls.Supported("epoll_pwait", EpollPwait),
+		23:  syscalls.Supported("dup", Dup),
+		24:  syscalls.Supported("dup3", Dup3),
+		25:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
+		26:  syscalls.Supported("inotify_init1", InotifyInit1),
+		27:  syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
+		28:  syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
+		29:  syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
+		30:  syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+		31:  syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+		32:  syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
+		33:  syscalls.Supported("mknodat", Mknodat),
+		34:  syscalls.Supported("mkdirat", Mkdirat),
+		35:  syscalls.Supported("unlinkat", Unlinkat),
+		36:  syscalls.Supported("symlinkat", Symlinkat),
+		37:  syscalls.Supported("linkat", Linkat),
+		38:  syscalls.Supported("renameat", Renameat),
+		39:  syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
+		40:  syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
+		41:  syscalls.Error("pivot_root", syserror.EPERM, "", nil),
+		42:  syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
+		43:  syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
+		44:  syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
+		45:  syscalls.Supported("truncate", Truncate),
+		46:  syscalls.Supported("ftruncate", Ftruncate),
+		47:  syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
+		48:  syscalls.Supported("faccessat", Faccessat),
+		49:  syscalls.Supported("chdir", Chdir),
+		50:  syscalls.Supported("fchdir", Fchdir),
+		51:  syscalls.Supported("chroot", Chroot),
+		52:  syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
+		53:  syscalls.Supported("fchmodat", Fchmodat),
+		54:  syscalls.Supported("fchownat", Fchownat),
+		55:  syscalls.Supported("fchown", Fchown),
+		56:  syscalls.Supported("openat", Openat),
+		57:  syscalls.Supported("close", Close),
+		58:  syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
+		59:  syscalls.Supported("pipe2", Pipe2),
+		60:  syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
+		61:  syscalls.Supported("getdents64", Getdents64),
+		62:  syscalls.Supported("lseek", Lseek),
+		63:  syscalls.Supported("read", Read),
+		64:  syscalls.Supported("write", Write),
+		65:  syscalls.Supported("readv", Readv),
+		66:  syscalls.Supported("writev", Writev),
+		67:  syscalls.Supported("pread64", Pread64),
+		68:  syscalls.Supported("pwrite64", Pwrite64),
+		69:  syscalls.Supported("preadv", Preadv),
+		70:  syscalls.Supported("pwritev", Pwritev),
+		71:  syscalls.Supported("sendfile", Sendfile),
+		72:  syscalls.Supported("pselect", Pselect),
+		73:  syscalls.Supported("ppoll", Ppoll),
+		74:  syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
+		75:  syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}),              // TODO(b/29354098)
+		76:  syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+		77:  syscalls.Supported("tee", Tee),
+		78:  syscalls.Supported("readlinkat", Readlinkat),
+		79:  syscalls.Supported("fstatat", Fstatat),
+		80:  syscalls.Supported("fstat", Fstat),
+		81:  syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
+		82:  syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
+		83:  syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
+		84:  syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
+		85:  syscalls.Supported("timerfd_create", TimerfdCreate),
+		86:  syscalls.Supported("timerfd_settime", TimerfdSettime),
+		87:  syscalls.Supported("timerfd_gettime", TimerfdGettime),
+		88:  syscalls.Supported("utimensat", Utimensat),
+		89:  syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
+		90:  syscalls.Supported("capget", Capget),
+		91:  syscalls.Supported("capset", Capset),
+		92:  syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
+		93:  syscalls.Supported("exit", Exit),
+		94:  syscalls.Supported("exit_group", ExitGroup),
+		95:  syscalls.Supported("waitid", Waitid),
+		96:  syscalls.Supported("set_tid_address", SetTidAddress),
+		97:  syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
+		98:  syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
+		99:  syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
+		101: syscalls.Supported("nanosleep", Nanosleep),
+		102: syscalls.Supported("getitimer", Getitimer),
+		103: syscalls.Supported("setitimer", Setitimer),
+		104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
+		105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
+		106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
+		107: syscalls.Supported("timer_create", TimerCreate),
+		108: syscalls.Supported("timer_gettime", TimerGettime),
+		109: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
+		110: syscalls.Supported("timer_settime", TimerSettime),
+		111: syscalls.Supported("timer_delete", TimerDelete),
+		112: syscalls.Supported("clock_settime", ClockSettime),
+		113: syscalls.Supported("clock_gettime", ClockGettime),
+		114: syscalls.Supported("clock_getres", ClockGetres),
+		115: syscalls.Supported("clock_nanosleep", ClockNanosleep),
+		116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
+		117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
+		118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
+		119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
+		120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
+		121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
+		122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
+		123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
+		124: syscalls.Supported("sched_yield", SchedYield),
+		125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
+		126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
+		127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
+		128: syscalls.Supported("restart_syscall", RestartSyscall),
+		129: syscalls.Supported("kill", Kill),
+		130: syscalls.Supported("tkill", Tkill),
+		131: syscalls.Supported("tgkill", Tgkill),
+		132: syscalls.Supported("sigaltstack", Sigaltstack),
+		133: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
+		134: syscalls.Supported("rt_sigaction", RtSigaction),
+		135: syscalls.Supported("rt_sigprocmask", RtSigprocmask),
+		136: syscalls.Supported("rt_sigpending", RtSigpending),
+		137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
+		138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
+		139: syscalls.Supported("rt_sigreturn", RtSigreturn),
+		140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
+		141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
+		142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
+		143: syscalls.Supported("setregid", Setregid),
+		144: syscalls.Supported("setgid", Setgid),
+		145: syscalls.Supported("setreuid", Setreuid),
+		146: syscalls.Supported("setuid", Setuid),
+		147: syscalls.Supported("setresuid", Setresuid),
+		148: syscalls.Supported("getresuid", Getresuid),
+		149: syscalls.Supported("setresgid", Setresgid),
+		150: syscalls.Supported("getresgid", Getresgid),
+		151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+		152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+		153: syscalls.Supported("times", Times),
+		154: syscalls.Supported("setpgid", Setpgid),
+		155: syscalls.Supported("getpgid", Getpgid),
+		156: syscalls.Supported("getsid", Getsid),
+		157: syscalls.Supported("setsid", Setsid),
+		158: syscalls.Supported("getgroups", Getgroups),
+		159: syscalls.Supported("setgroups", Setgroups),
+		160: syscalls.Supported("uname", Uname),
+		161: syscalls.Supported("sethostname", Sethostname),
+		162: syscalls.Supported("setdomainname", Setdomainname),
+		163: syscalls.Supported("getrlimit", Getrlimit),
+		164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
+		165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
+		166: syscalls.Supported("umask", Umask),
+		167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
+		168: syscalls.Supported("getcpu", Getcpu),
+		169: syscalls.Supported("gettimeofday", Gettimeofday),
+		170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
+		171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
+		172: syscalls.Supported("getpid", Getpid),
+		173: syscalls.Supported("getppid", Getppid),
+		174: syscalls.Supported("getuid", Getuid),
+		175: syscalls.Supported("geteuid", Geteuid),
+		176: syscalls.Supported("getgid", Getgid),
+		177: syscalls.Supported("getegid", Getegid),
+		178: syscalls.Supported("gettid", Gettid),
+		179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
+		180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),         // TODO(b/29354921)
+		181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
+		182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),    // TODO(b/29354921)
+		183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+		184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
+		185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),   // TODO(b/29354921)
+		186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
+		187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
+		188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
+		189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
+		190: syscalls.Supported("semget", Semget),
+		191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
+		193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
+		194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
+		195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
+		196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
+		197: syscalls.Supported("shmdt", Shmdt),
+		198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
+		199: syscalls.Supported("socketpair", SocketPair),
+		200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
+		201: syscalls.Supported("listen", Listen),
+		202: syscalls.Supported("accept", Accept),
+		203: syscalls.Supported("connect", Connect),
+		204: syscalls.Supported("getsockname", GetSockName),
+		205: syscalls.Supported("getpeername", GetPeerName),
+		206: syscalls.Supported("sendto", SendTo),
+		207: syscalls.Supported("recvfrom", RecvFrom),
+		208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
+		209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
+		210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
+		211: syscalls.Supported("sendmsg", SendMsg),
+		212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
+		213: syscalls.Supported("readahead", Readahead),
+		214: syscalls.Supported("brk", Brk),
+		215: syscalls.Supported("munmap", Munmap),
+		216: syscalls.Supported("mremap", Mremap),
+		217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
+		218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
+		219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
+		220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
+		221: syscalls.Supported("execve", Execve),
+		222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
+		223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
+		224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
+		225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
+		226: syscalls.Supported("mprotect", Mprotect),
+		227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
+		228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+		232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
+		233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
+		234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
+		235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
+		236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
+		237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
+		238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
+		239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
+		240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+		241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
+		242: syscalls.Supported("accept4", Accept4),
+		243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
+		260: syscalls.Supported("wait4", Wait4),
+		261: syscalls.Supported("prlimit64", Prlimit64),
+		262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+		263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+		264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+		265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
+		266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
+		267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
+		268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+		269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
+		270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+		271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+		272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
+		273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
+		274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+		275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+		276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}),                                           // TODO(b/118902772)
+		277: syscalls.Supported("seccomp", Seccomp),
+		278: syscalls.Supported("getrandom", GetRandom),
+		279: syscalls.Supported("memfd_create", MemfdCreate),
+		280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
+		281: syscalls.Supported("execveat", Execveat),
+		282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+		283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
+		284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
+
+		// Syscalls after 284 are "backports" from versions of Linux after 4.4.
+		285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
+		286: syscalls.Supported("preadv2", Preadv2),
+		287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+		288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+		289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+		290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
+		291: syscalls.Supported("statx", Statx),
+		292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+		293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
+
+		// Linux skips ahead to syscall 424 to sync numbers between arches.
+		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
+	},
+	Emulate: map[usermem.Addr]uintptr{},
+	Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
+		t.Kernel().EmitUnimplementedEvent(t)
+		return 0, syserror.ENOSYS
+	},
+}
+
+func init() {
+	kernel.RegisterSyscallTable(AMD64)
+	kernel.RegisterSyscallTable(ARM64)
+}
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
deleted file mode 100644
index 79066ad2a..000000000
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ /dev/null
@@ -1,406 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package linux
-
-import (
-	"gvisor.dev/gvisor/pkg/abi"
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
-// numbers from Linux 4.4.
-var AMD64 = &kernel.SyscallTable{
-	OS:   abi.Linux,
-	Arch: arch.AMD64,
-	Version: kernel.Version{
-		// Version 4.4 is chosen as a stable, longterm version of Linux, which
-		// guides the interface provided by this syscall table. The build
-		// version is that for a clean build with default kernel config, at 5
-		// minutes after v4.4 was tagged.
-		Sysname: LinuxSysname,
-		Release: LinuxRelease,
-		Version: LinuxVersion,
-	},
-	AuditNumber: linux.AUDIT_ARCH_X86_64,
-	Table: map[uintptr]kernel.Syscall{
-		0:   syscalls.Supported("read", Read),
-		1:   syscalls.Supported("write", Write),
-		2:   syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil),
-		3:   syscalls.Supported("close", Close),
-		4:   syscalls.Supported("stat", Stat),
-		5:   syscalls.Supported("fstat", Fstat),
-		6:   syscalls.Supported("lstat", Lstat),
-		7:   syscalls.Supported("poll", Poll),
-		8:   syscalls.Supported("lseek", Lseek),
-		9:   syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
-		10:  syscalls.Supported("mprotect", Mprotect),
-		11:  syscalls.Supported("munmap", Munmap),
-		12:  syscalls.Supported("brk", Brk),
-		13:  syscalls.Supported("rt_sigaction", RtSigaction),
-		14:  syscalls.Supported("rt_sigprocmask", RtSigprocmask),
-		15:  syscalls.Supported("rt_sigreturn", RtSigreturn),
-		16:  syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
-		17:  syscalls.Supported("pread64", Pread64),
-		18:  syscalls.Supported("pwrite64", Pwrite64),
-		19:  syscalls.Supported("readv", Readv),
-		20:  syscalls.Supported("writev", Writev),
-		21:  syscalls.Supported("access", Access),
-		22:  syscalls.Supported("pipe", Pipe),
-		23:  syscalls.Supported("select", Select),
-		24:  syscalls.Supported("sched_yield", SchedYield),
-		25:  syscalls.Supported("mremap", Mremap),
-		26:  syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
-		27:  syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
-		28:  syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
-		29:  syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
-		30:  syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
-		31:  syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
-		32:  syscalls.Supported("dup", Dup),
-		33:  syscalls.Supported("dup2", Dup2),
-		34:  syscalls.Supported("pause", Pause),
-		35:  syscalls.Supported("nanosleep", Nanosleep),
-		36:  syscalls.Supported("getitimer", Getitimer),
-		37:  syscalls.Supported("alarm", Alarm),
-		38:  syscalls.Supported("setitimer", Setitimer),
-		39:  syscalls.Supported("getpid", Getpid),
-		40:  syscalls.Supported("sendfile", Sendfile),
-		41:  syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
-		42:  syscalls.Supported("connect", Connect),
-		43:  syscalls.Supported("accept", Accept),
-		44:  syscalls.Supported("sendto", SendTo),
-		45:  syscalls.Supported("recvfrom", RecvFrom),
-		46:  syscalls.Supported("sendmsg", SendMsg),
-		47:  syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
-		48:  syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
-		49:  syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
-		50:  syscalls.Supported("listen", Listen),
-		51:  syscalls.Supported("getsockname", GetSockName),
-		52:  syscalls.Supported("getpeername", GetPeerName),
-		53:  syscalls.Supported("socketpair", SocketPair),
-		54:  syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
-		55:  syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
-		56:  syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
-		57:  syscalls.Supported("fork", Fork),
-		58:  syscalls.Supported("vfork", Vfork),
-		59:  syscalls.Supported("execve", Execve),
-		60:  syscalls.Supported("exit", Exit),
-		61:  syscalls.Supported("wait4", Wait4),
-		62:  syscalls.Supported("kill", Kill),
-		63:  syscalls.Supported("uname", Uname),
-		64:  syscalls.Supported("semget", Semget),
-		65:  syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
-		66:  syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
-		67:  syscalls.Supported("shmdt", Shmdt),
-		68:  syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
-		69:  syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
-		70:  syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
-		71:  syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
-		72:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
-		73:  syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
-		74:  syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
-		75:  syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
-		76:  syscalls.Supported("truncate", Truncate),
-		77:  syscalls.Supported("ftruncate", Ftruncate),
-		78:  syscalls.Supported("getdents", Getdents),
-		79:  syscalls.Supported("getcwd", Getcwd),
-		80:  syscalls.Supported("chdir", Chdir),
-		81:  syscalls.Supported("fchdir", Fchdir),
-		82:  syscalls.Supported("rename", Rename),
-		83:  syscalls.Supported("mkdir", Mkdir),
-		84:  syscalls.Supported("rmdir", Rmdir),
-		85:  syscalls.Supported("creat", Creat),
-		86:  syscalls.Supported("link", Link),
-		87:  syscalls.Supported("unlink", Unlink),
-		88:  syscalls.Supported("symlink", Symlink),
-		89:  syscalls.Supported("readlink", Readlink),
-		90:  syscalls.Supported("chmod", Chmod),
-		91:  syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
-		92:  syscalls.Supported("chown", Chown),
-		93:  syscalls.Supported("fchown", Fchown),
-		94:  syscalls.Supported("lchown", Lchown),
-		95:  syscalls.Supported("umask", Umask),
-		96:  syscalls.Supported("gettimeofday", Gettimeofday),
-		97:  syscalls.Supported("getrlimit", Getrlimit),
-		98:  syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
-		99:  syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
-		100: syscalls.Supported("times", Times),
-		101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
-		102: syscalls.Supported("getuid", Getuid),
-		103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
-		104: syscalls.Supported("getgid", Getgid),
-		105: syscalls.Supported("setuid", Setuid),
-		106: syscalls.Supported("setgid", Setgid),
-		107: syscalls.Supported("geteuid", Geteuid),
-		108: syscalls.Supported("getegid", Getegid),
-		109: syscalls.Supported("setpgid", Setpgid),
-		110: syscalls.Supported("getppid", Getppid),
-		111: syscalls.Supported("getpgrp", Getpgrp),
-		112: syscalls.Supported("setsid", Setsid),
-		113: syscalls.Supported("setreuid", Setreuid),
-		114: syscalls.Supported("setregid", Setregid),
-		115: syscalls.Supported("getgroups", Getgroups),
-		116: syscalls.Supported("setgroups", Setgroups),
-		117: syscalls.Supported("setresuid", Setresuid),
-		118: syscalls.Supported("getresuid", Getresuid),
-		119: syscalls.Supported("setresgid", Setresgid),
-		120: syscalls.Supported("getresgid", Getresgid),
-		121: syscalls.Supported("getpgid", Getpgid),
-		122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
-		123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
-		124: syscalls.Supported("getsid", Getsid),
-		125: syscalls.Supported("capget", Capget),
-		126: syscalls.Supported("capset", Capset),
-		127: syscalls.Supported("rt_sigpending", RtSigpending),
-		128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
-		129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
-		130: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
-		131: syscalls.Supported("sigaltstack", Sigaltstack),
-		132: syscalls.Supported("utime", Utime),
-		133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil),
-		134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil),
-		135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
-		136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil),
-		137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
-		138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
-		139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
-		140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
-		141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
-		142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
-		143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
-		144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
-		145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
-		146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
-		147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
-		148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
-		149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-		150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-		151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-		152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-		153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
-		154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil),
-		155: syscalls.Error("pivot_root", syserror.EPERM, "", nil),
-		156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil),
-		157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
-		158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil),
-		159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
-		160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
-		161: syscalls.Supported("chroot", Chroot),
-		162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
-		163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
-		164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
-		165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
-		166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
-		167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
-		168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
-		169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
-		170: syscalls.Supported("sethostname", Sethostname),
-		171: syscalls.Supported("setdomainname", Setdomainname),
-		172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil),
-		173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil),
-		174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil),
-		175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
-		176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
-		177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
-		178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil),
-		179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
-		180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
-		181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
-		182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil),
-		183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil),
-		184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil),
-		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
-		186: syscalls.Supported("gettid", Gettid),
-		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
-		195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
-		196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
-		197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
-		198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
-		199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
-		200: syscalls.Supported("tkill", Tkill),
-		201: syscalls.Supported("time", Time),
-		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
-		203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
-		204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
-		205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
-		206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
-		212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
-		213: syscalls.Supported("epoll_create", EpollCreate),
-		214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil),
-		215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil),
-		216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
-		217: syscalls.Supported("getdents64", Getdents64),
-		218: syscalls.Supported("set_tid_address", SetTidAddress),
-		219: syscalls.Supported("restart_syscall", RestartSyscall),
-		220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
-		221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
-		222: syscalls.Supported("timer_create", TimerCreate),
-		223: syscalls.Supported("timer_settime", TimerSettime),
-		224: syscalls.Supported("timer_gettime", TimerGettime),
-		225: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
-		226: syscalls.Supported("timer_delete", TimerDelete),
-		227: syscalls.Supported("clock_settime", ClockSettime),
-		228: syscalls.Supported("clock_gettime", ClockGettime),
-		229: syscalls.Supported("clock_getres", ClockGetres),
-		230: syscalls.Supported("clock_nanosleep", ClockNanosleep),
-		231: syscalls.Supported("exit_group", ExitGroup),
-		232: syscalls.Supported("epoll_wait", EpollWait),
-		233: syscalls.Supported("epoll_ctl", EpollCtl),
-		234: syscalls.Supported("tgkill", Tgkill),
-		235: syscalls.Supported("utimes", Utimes),
-		236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil),
-		237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
-		238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
-		239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
-		240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),         // TODO(b/29354921)
-		241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
-		242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),    // TODO(b/29354921)
-		243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
-		244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
-		245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),   // TODO(b/29354921)
-		246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
-		247: syscalls.Supported("waitid", Waitid),
-		248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
-		249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
-		250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
-		251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
-		252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
-		253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil),
-		254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
-		255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
-		256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
-		257: syscalls.Supported("openat", Openat),
-		258: syscalls.Supported("mkdirat", Mkdirat),
-		259: syscalls.Supported("mknodat", Mknodat),
-		260: syscalls.Supported("fchownat", Fchownat),
-		261: syscalls.Supported("futimesat", Futimesat),
-		262: syscalls.Supported("fstatat", Fstatat),
-		263: syscalls.Supported("unlinkat", Unlinkat),
-		264: syscalls.Supported("renameat", Renameat),
-		265: syscalls.Supported("linkat", Linkat),
-		266: syscalls.Supported("symlinkat", Symlinkat),
-		267: syscalls.Supported("readlinkat", Readlinkat),
-		268: syscalls.Supported("fchmodat", Fchmodat),
-		269: syscalls.Supported("faccessat", Faccessat),
-		270: syscalls.Supported("pselect", Pselect),
-		271: syscalls.Supported("ppoll", Ppoll),
-		272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
-		273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
-		274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
-		275: syscalls.Supported("splice", Splice),
-		276: syscalls.Supported("tee", Tee),
-		277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
-		278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
-		279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil),                               // requires cap_sys_nice (mostly)
-		280: syscalls.Supported("utimensat", Utimensat),
-		281: syscalls.Supported("epoll_pwait", EpollPwait),
-		282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
-		283: syscalls.Supported("timerfd_create", TimerfdCreate),
-		284: syscalls.Supported("eventfd", Eventfd),
-		285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
-		286: syscalls.Supported("timerfd_settime", TimerfdSettime),
-		287: syscalls.Supported("timerfd_gettime", TimerfdGettime),
-		288: syscalls.Supported("accept4", Accept4),
-		289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
-		290: syscalls.Supported("eventfd2", Eventfd2),
-		291: syscalls.Supported("epoll_create1", EpollCreate1),
-		292: syscalls.Supported("dup3", Dup3),
-		293: syscalls.Supported("pipe2", Pipe2),
-		294: syscalls.Supported("inotify_init1", InotifyInit1),
-		295: syscalls.Supported("preadv", Preadv),
-		296: syscalls.Supported("pwritev", Pwritev),
-		297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
-		298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
-		299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
-		300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
-		301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
-		302: syscalls.Supported("prlimit64", Prlimit64),
-		303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
-		304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
-		305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
-		306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
-		307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
-		308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
-		309: syscalls.Supported("getcpu", Getcpu),
-		310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
-		311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
-		312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
-		313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
-		314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
-		315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
-		316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}),                                           // TODO(b/118902772)
-		317: syscalls.Supported("seccomp", Seccomp),
-		318: syscalls.Supported("getrandom", GetRandom),
-		319: syscalls.Supported("memfd_create", MemfdCreate),
-		320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
-		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
-		322: syscalls.Supported("execveat", Execveat),
-		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
-		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
-		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-
-		// Syscalls implemented after 325 are "backports" from versions
-		// of Linux after 4.4.
-		326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
-		327: syscalls.Supported("preadv2", Preadv2),
-		328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
-		329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
-		330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
-		331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
-		332: syscalls.Supported("statx", Statx),
-		333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
-		334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
-
-		// Linux skips ahead to syscall 424 to sync numbers between arches.
-		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
-		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
-		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
-		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
-		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
-		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
-		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
-		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
-		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
-		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
-		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
-		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
-	},
-
-	Emulate: map[usermem.Addr]uintptr{
-		0xffffffffff600000: 96,  // vsyscall gettimeofday(2)
-		0xffffffffff600400: 201, // vsyscall time(2)
-		0xffffffffff600800: 309, // vsyscall getcpu(2)
-	},
-	Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
-		t.Kernel().EmitUnimplementedEvent(t)
-		return 0, syserror.ENOSYS
-	},
-}
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
deleted file mode 100644
index 7421619de..000000000
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ /dev/null
@@ -1,340 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package linux
-
-import (
-	"gvisor.dev/gvisor/pkg/abi"
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall
-// numbers from Linux 4.4.
-var ARM64 = &kernel.SyscallTable{
-	OS:   abi.Linux,
-	Arch: arch.ARM64,
-	Version: kernel.Version{
-		Sysname: LinuxSysname,
-		Release: LinuxRelease,
-		Version: LinuxVersion,
-	},
-	AuditNumber: linux.AUDIT_ARCH_AARCH64,
-	Table: map[uintptr]kernel.Syscall{
-		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		11:  syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
-		12:  syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
-		13:  syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
-		14:  syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
-		15:  syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
-		16:  syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
-		17:  syscalls.Supported("getcwd", Getcwd),
-		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
-		19:  syscalls.Supported("eventfd2", Eventfd2),
-		20:  syscalls.Supported("epoll_create1", EpollCreate1),
-		21:  syscalls.Supported("epoll_ctl", EpollCtl),
-		22:  syscalls.Supported("epoll_pwait", EpollPwait),
-		23:  syscalls.Supported("dup", Dup),
-		24:  syscalls.Supported("dup3", Dup3),
-		25:  syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
-		26:  syscalls.Supported("inotify_init1", InotifyInit1),
-		27:  syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil),
-		28:  syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil),
-		29:  syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil),
-		30:  syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
-		31:  syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
-		32:  syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
-		33:  syscalls.Supported("mknodat", Mknodat),
-		34:  syscalls.Supported("mkdirat", Mkdirat),
-		35:  syscalls.Supported("unlinkat", Unlinkat),
-		36:  syscalls.Supported("symlinkat", Symlinkat),
-		37:  syscalls.Supported("linkat", Linkat),
-		38:  syscalls.Supported("renameat", Renameat),
-		39:  syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil),
-		40:  syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil),
-		41:  syscalls.Error("pivot_root", syserror.EPERM, "", nil),
-		42:  syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil),
-		43:  syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil),
-		44:  syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil),
-		45:  syscalls.Supported("truncate", Truncate),
-		46:  syscalls.Supported("ftruncate", Ftruncate),
-		47:  syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil),
-		48:  syscalls.Supported("faccessat", Faccessat),
-		49:  syscalls.Supported("chdir", Chdir),
-		50:  syscalls.Supported("fchdir", Fchdir),
-		51:  syscalls.Supported("chroot", Chroot),
-		52:  syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil),
-		53:  syscalls.Supported("fchmodat", Fchmodat),
-		54:  syscalls.Supported("fchownat", Fchownat),
-		55:  syscalls.Supported("fchown", Fchown),
-		56:  syscalls.Supported("openat", Openat),
-		57:  syscalls.Supported("close", Close),
-		58:  syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
-		59:  syscalls.Supported("pipe2", Pipe2),
-		60:  syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
-		61:  syscalls.Supported("getdents64", Getdents64),
-		62:  syscalls.Supported("lseek", Lseek),
-		63:  syscalls.Supported("read", Read),
-		64:  syscalls.Supported("write", Write),
-		65:  syscalls.Supported("readv", Readv),
-		66:  syscalls.Supported("writev", Writev),
-		67:  syscalls.Supported("pread64", Pread64),
-		68:  syscalls.Supported("pwrite64", Pwrite64),
-		69:  syscalls.Supported("preadv", Preadv),
-		70:  syscalls.Supported("pwritev", Pwritev),
-		71:  syscalls.Supported("sendfile", Sendfile),
-		72:  syscalls.Supported("pselect", Pselect),
-		73:  syscalls.Supported("ppoll", Ppoll),
-		74:  syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}),
-		75:  syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}),              // TODO(b/29354098)
-		76:  syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
-		77:  syscalls.Supported("tee", Tee),
-		78:  syscalls.Supported("readlinkat", Readlinkat),
-		79:  syscalls.Supported("fstatat", Fstatat),
-		80:  syscalls.Supported("fstat", Fstat),
-		81:  syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil),
-		82:  syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
-		83:  syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil),
-		84:  syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil),
-		85:  syscalls.Supported("timerfd_create", TimerfdCreate),
-		86:  syscalls.Supported("timerfd_settime", TimerfdSettime),
-		87:  syscalls.Supported("timerfd_gettime", TimerfdGettime),
-		88:  syscalls.Supported("utimensat", Utimensat),
-		89:  syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
-		90:  syscalls.Supported("capget", Capget),
-		91:  syscalls.Supported("capset", Capset),
-		92:  syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil),
-		93:  syscalls.Supported("exit", Exit),
-		94:  syscalls.Supported("exit_group", ExitGroup),
-		95:  syscalls.Supported("waitid", Waitid),
-		96:  syscalls.Supported("set_tid_address", SetTidAddress),
-		97:  syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil),
-		98:  syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
-		99:  syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil),
-		100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil),
-		101: syscalls.Supported("nanosleep", Nanosleep),
-		102: syscalls.Supported("getitimer", Getitimer),
-		103: syscalls.Supported("setitimer", Setitimer),
-		104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
-		105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
-		106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
-		107: syscalls.Supported("timer_create", TimerCreate),
-		108: syscalls.Supported("timer_gettime", TimerGettime),
-		109: syscalls.Supported("timer_getoverrun", TimerGetoverrun),
-		110: syscalls.Supported("timer_settime", TimerSettime),
-		111: syscalls.Supported("timer_delete", TimerDelete),
-		112: syscalls.Supported("clock_settime", ClockSettime),
-		113: syscalls.Supported("clock_gettime", ClockGettime),
-		114: syscalls.Supported("clock_getres", ClockGetres),
-		115: syscalls.Supported("clock_nanosleep", ClockNanosleep),
-		116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil),
-		117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil),
-		118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
-		119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil),
-		120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil),
-		121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil),
-		122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil),
-		123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil),
-		124: syscalls.Supported("sched_yield", SchedYield),
-		125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil),
-		126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil),
-		127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil),
-		128: syscalls.Supported("restart_syscall", RestartSyscall),
-		129: syscalls.Supported("kill", Kill),
-		130: syscalls.Supported("tkill", Tkill),
-		131: syscalls.Supported("tgkill", Tgkill),
-		132: syscalls.Supported("sigaltstack", Sigaltstack),
-		133: syscalls.Supported("rt_sigsuspend", RtSigsuspend),
-		134: syscalls.Supported("rt_sigaction", RtSigaction),
-		135: syscalls.Supported("rt_sigprocmask", RtSigprocmask),
-		136: syscalls.Supported("rt_sigpending", RtSigpending),
-		137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait),
-		138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo),
-		139: syscalls.Supported("rt_sigreturn", RtSigreturn),
-		140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil),
-		141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil),
-		142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
-		143: syscalls.Supported("setregid", Setregid),
-		144: syscalls.Supported("setgid", Setgid),
-		145: syscalls.Supported("setreuid", Setreuid),
-		146: syscalls.Supported("setuid", Setuid),
-		147: syscalls.Supported("setresuid", Setresuid),
-		148: syscalls.Supported("getresuid", Getresuid),
-		149: syscalls.Supported("setresgid", Setresgid),
-		150: syscalls.Supported("getresgid", Getresgid),
-		151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
-		152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
-		153: syscalls.Supported("times", Times),
-		154: syscalls.Supported("setpgid", Setpgid),
-		155: syscalls.Supported("getpgid", Getpgid),
-		156: syscalls.Supported("getsid", Getsid),
-		157: syscalls.Supported("setsid", Setsid),
-		158: syscalls.Supported("getgroups", Getgroups),
-		159: syscalls.Supported("setgroups", Setgroups),
-		160: syscalls.Supported("uname", Uname),
-		161: syscalls.Supported("sethostname", Sethostname),
-		162: syscalls.Supported("setdomainname", Setdomainname),
-		163: syscalls.Supported("getrlimit", Getrlimit),
-		164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil),
-		165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil),
-		166: syscalls.Supported("umask", Umask),
-		167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil),
-		168: syscalls.Supported("getcpu", Getcpu),
-		169: syscalls.Supported("gettimeofday", Gettimeofday),
-		170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
-		171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
-		172: syscalls.Supported("getpid", Getpid),
-		173: syscalls.Supported("getppid", Getppid),
-		174: syscalls.Supported("getuid", Getuid),
-		175: syscalls.Supported("geteuid", Geteuid),
-		176: syscalls.Supported("getgid", Getgid),
-		177: syscalls.Supported("getegid", Getegid),
-		178: syscalls.Supported("gettid", Gettid),
-		179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil),
-		180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),         // TODO(b/29354921)
-		181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
-		182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),    // TODO(b/29354921)
-		183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
-		184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),       // TODO(b/29354921)
-		185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}),   // TODO(b/29354921)
-		186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
-		187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
-		188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
-		189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
-		190: syscalls.Supported("semget", Semget),
-		191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
-		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
-		193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
-		194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
-		195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
-		196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil),
-		197: syscalls.Supported("shmdt", Shmdt),
-		198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil),
-		199: syscalls.Supported("socketpair", SocketPair),
-		200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil),
-		201: syscalls.Supported("listen", Listen),
-		202: syscalls.Supported("accept", Accept),
-		203: syscalls.Supported("connect", Connect),
-		204: syscalls.Supported("getsockname", GetSockName),
-		205: syscalls.Supported("getpeername", GetPeerName),
-		206: syscalls.Supported("sendto", SendTo),
-		207: syscalls.Supported("recvfrom", RecvFrom),
-		208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil),
-		209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil),
-		210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil),
-		211: syscalls.Supported("sendmsg", SendMsg),
-		212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil),
-		213: syscalls.Supported("readahead", Readahead),
-		214: syscalls.Supported("brk", Brk),
-		215: syscalls.Supported("munmap", Munmap),
-		216: syscalls.Supported("mremap", Mremap),
-		217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil),
-		218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil),
-		219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil),
-		220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil),
-		221: syscalls.Supported("execve", Execve),
-		222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil),
-		223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
-		224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
-		225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
-		226: syscalls.Supported("mprotect", Mprotect),
-		227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil),
-		228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-		229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-		230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-		231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-		232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil),
-		233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil),
-		234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil),
-		235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
-		236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil),
-		237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil),
-		238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
-		239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
-		240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo),
-		241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil),
-		242: syscalls.Supported("accept4", Accept4),
-		243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil),
-		260: syscalls.Supported("wait4", Wait4),
-		261: syscalls.Supported("prlimit64", Prlimit64),
-		262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
-		263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
-		264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
-		265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil),
-		266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
-		267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil),
-		268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
-		269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil),
-		270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
-		271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
-		272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
-		273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
-		274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
-		275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
-		276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}),                                           // TODO(b/118902772)
-		277: syscalls.Supported("seccomp", Seccomp),
-		278: syscalls.Supported("getrandom", GetRandom),
-		279: syscalls.Supported("memfd_create", MemfdCreate),
-		280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
-		281: syscalls.Supported("execveat", Execveat),
-		282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
-		283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
-		284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
-
-		// Syscalls after 284 are "backports" from versions of Linux after 4.4.
-		285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
-		286: syscalls.Supported("preadv2", Preadv2),
-		287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
-		288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
-		289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
-		290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
-		291: syscalls.Supported("statx", Statx),
-		292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
-		293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
-
-		// Linux skips ahead to syscall 424 to sync numbers between arches.
-		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
-		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
-		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
-		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
-		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
-		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
-		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
-		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
-		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
-		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
-		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
-		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
-	},
-	Emulate: map[usermem.Addr]uintptr{},
-
-	Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
-		t.Kernel().EmitUnimplementedEvent(t)
-		return 0, syserror.ENOSYS
-	},
-}
diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go
index 2ddb2b146..434559b80 100644
--- a/pkg/sentry/syscalls/linux/sigset.go
+++ b/pkg/sentry/syscalls/linux/sigset.go
@@ -21,9 +21,13 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and
+// CopyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and
 // STOP are clear.
-func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) {
+//
+// TODO(gvisor.dev/issue/1624): This is only exported because
+// syscalls/vfs2/signal.go depends on it. Once vfs1 is deleted and the vfs2
+// syscalls are moved into this package, then they can be unexported.
+func CopyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) {
 	if size != linux.SignalSetSize {
 		return 0, syserror.EINVAL
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index b401978db..d781d6a04 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -114,14 +114,28 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	id := args[0].Uint64()
 
-	// Destroy the given context.
-	if !t.MemoryManager().DestroyAIOContext(t, id) {
+	ctx := t.MemoryManager().DestroyAIOContext(t, id)
+	if ctx == nil {
 		// Does not exist.
 		return 0, nil, syserror.EINVAL
 	}
-	// FIXME(fvoznika): Linux blocks until all AIO to the destroyed context is
-	// done.
-	return 0, nil, nil
+
+	// Drain completed requests amd wait for pending requests until there are no
+	// more.
+	for {
+		ctx.Drain()
+
+		ch := ctx.WaitChannel()
+		if ch == nil {
+			// No more requests, we're done.
+			return 0, nil, nil
+		}
+		// The task cannot be interrupted during the wait. Equivalent to
+		// TASK_UNINTERRUPTIBLE in Linux.
+		t.UninterruptibleSleepStart(true /* deactivate */)
+		<-ch
+		t.UninterruptibleSleepFinish(true /* activate */)
+	}
 }
 
 // IoGetevents implements linux syscall io_getevents(2).
@@ -200,13 +214,13 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) {
 	for {
 		if v, ok := ctx.PopRequest(); ok {
-			// Request was readly available. Just return it.
+			// Request was readily available. Just return it.
 			return v, nil
 		}
 
 		// Need to wait for request completion.
-		done, active := ctx.WaitChannel()
-		if !active {
+		done := ctx.WaitChannel()
+		if done == nil {
 			// Context has been destroyed.
 			return nil, syserror.EINVAL
 		}
@@ -248,6 +262,10 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
 }
 
 func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) {
+	if ctx.Dead() {
+		ctx.CancelPendingRequest()
+		return
+	}
 	ev := &ioEvent{
 		Data: cb.Data,
 		Obj:  uint64(cbAddr),
@@ -272,7 +290,7 @@ func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioC
 	// Update the result.
 	if err != nil {
 		err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file)
-		ev.Result = -int64(t.ExtractErrno(err, 0))
+		ev.Result = -int64(kernel.ExtractErrno(err, 0))
 	}
 
 	file.DecRef()
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 5f11b496c..7f460d30b 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -21,10 +21,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
 	"gvisor.dev/gvisor/pkg/sentry/syscalls"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 // EpollCreate1 implements the epoll_create1(2) linux syscall.
 func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	flags := args[0].Int()
@@ -70,7 +71,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	var data [2]int32
 	if op != linux.EPOLL_CTL_DEL {
 		var e linux.EpollEvent
-		if _, err := t.CopyIn(eventAddr, &e); err != nil {
+		if _, err := e.CopyIn(t, eventAddr); err != nil {
 			return 0, nil, err
 		}
 
@@ -83,8 +84,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		}
 
 		mask = waiter.EventMaskFromLinux(e.Events)
-		data[0] = e.Fd
-		data[1] = e.Data
+		data = e.Data
 	}
 
 	// Perform the requested operations.
@@ -104,28 +104,6 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 }
 
-// copyOutEvents copies epoll events from the kernel to user memory.
-func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error {
-	const itemLen = 12
-	buffLen := len(e) * itemLen
-	if _, ok := addr.AddLength(uint64(buffLen)); !ok {
-		return syserror.EFAULT
-	}
-
-	b := t.CopyScratchBuffer(buffLen)
-	for i := range e {
-		usermem.ByteOrder.PutUint32(b[i*itemLen:], e[i].Events)
-		usermem.ByteOrder.PutUint32(b[i*itemLen+4:], uint32(e[i].Data[0]))
-		usermem.ByteOrder.PutUint32(b[i*itemLen+8:], uint32(e[i].Data[1]))
-	}
-
-	if _, err := t.CopyOutBytes(addr, b); err != nil {
-		return err
-	}
-
-	return nil
-}
-
 // EpollWait implements the epoll_wait(2) linux syscall.
 func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	epfd := args[0].Int()
@@ -139,7 +117,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	}
 
 	if len(r) != 0 {
-		if err := copyOutEvents(t, eventsAddr, r); err != nil {
+		if _, err := linux.CopyEpollEventSliceOut(t, eventsAddr, r); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -153,7 +131,7 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	maskSize := uint(args[5].Uint())
 
 	if maskAddr != 0 {
-		mask, err := copyInSigSet(t, maskAddr, maskSize)
+		mask, err := CopyInSigSet(t, maskAddr, maskSize)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -165,3 +143,5 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 	return EpollWait(t, args)
 }
+
+// LINT.ThenChange(vfs2/epoll.go)
diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go
index 8a34c4e99..ed3413ca6 100644
--- a/pkg/sentry/syscalls/linux/sys_eventfd.go
+++ b/pkg/sentry/syscalls/linux/sys_eventfd.go
@@ -15,6 +15,7 @@
 package linux
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -22,32 +23,24 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-const (
-	// EFD_SEMAPHORE is a flag used in syscall eventfd(2) and eventfd2(2). Please
-	// see its man page for more information.
-	EFD_SEMAPHORE = 1
-	EFD_NONBLOCK  = 0x800
-	EFD_CLOEXEC   = 0x80000
-)
-
 // Eventfd2 implements linux syscall eventfd2(2).
 func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	initVal := args[0].Int()
 	flags := uint(args[1].Uint())
-	allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC)
+	allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC)
 
 	if flags & ^allOps != 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
-	event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0)
+	event := eventfd.New(t, uint64(initVal), flags&linux.EFD_SEMAPHORE != 0)
 	event.SetFlags(fs.SettableFileFlags{
-		NonBlocking: flags&EFD_NONBLOCK != 0,
+		NonBlocking: flags&linux.EFD_NONBLOCK != 0,
 	})
 	defer event.DecRef()
 
 	fd, err := t.NewFDFrom(0, event, kernel.FDFlags{
-		CloseOnExec: flags&EFD_CLOEXEC != 0,
+		CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
 	})
 	if err != nil {
 		return 0, nil, err
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 421845ebb..35a98212a 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -130,6 +130,8 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string
 	return path, dirPath, nil
 }
 
+// LINT.IfChange
+
 func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -512,7 +514,7 @@ func (ac accessContext) Value(key interface{}) interface{} {
 	}
 }
 
-func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode uint) error {
+func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode uint) error {
 	const rOK = 4
 	const wOK = 2
 	const xOK = 1
@@ -527,7 +529,7 @@ func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode
 		return syserror.EINVAL
 	}
 
-	return fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
 		// access(2) and faccessat(2) check permissions using real
 		// UID/GID, not effective UID/GID.
 		//
@@ -562,19 +564,29 @@ func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	addr := args[0].Pointer()
 	mode := args[1].ModeT()
 
-	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, true, mode)
+	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode)
 }
 
 // Faccessat implements linux syscall faccessat(2).
+//
+// Note that the faccessat() system call does not take a flags argument:
+// "The raw faccessat() system call takes only the first three arguments. The
+// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
+// the glibc wrapper function for faccessat().  If either of these flags is
+// specified, then the wrapper function employs fstatat(2) to determine access
+// permissions." - faccessat(2)
 func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	dirFD := args[0].Int()
 	addr := args[1].Pointer()
 	mode := args[2].ModeT()
-	flags := args[3].Int()
 
-	return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
+	return 0, nil, accessAt(t, dirFD, addr, mode)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 // Ioctl implements linux syscall ioctl(2).
 func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -650,6 +662,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
+// LINT.ThenChange(vfs2/ioctl.go)
+
+// LINT.IfChange
+
 // Getcwd implements the linux syscall getcwd(2).
 func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -760,6 +776,10 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/fscontext.go)
+
+// LINT.IfChange
+
 // Close implements linux syscall close(2).
 func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -1094,6 +1114,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 }
 
+// LINT.ThenChange(vfs2/fd.go)
+
 const (
 	_FADV_NORMAL     = 0
 	_FADV_RANDOM     = 1
@@ -1141,6 +1163,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.IfChange
+
 func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
 	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1218,7 +1242,7 @@ func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return syserror.ENOTEMPTY
 		}
 
-		if err := fs.MayDelete(t, root, d, name); err != nil {
+		if err := d.MayDelete(t, root, name); err != nil {
 			return err
 		}
 
@@ -1421,6 +1445,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1480,6 +1508,10 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	return n, nil, err
 }
 
+// LINT.ThenChange(vfs2/stat.go)
+
+// LINT.IfChange
+
 func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
 	if err != nil {
@@ -1491,7 +1523,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
 			return syserror.ENOTDIR
 		}
 
-		if err := fs.MayDelete(t, root, d, name); err != nil {
+		if err := d.MayDelete(t, root, name); err != nil {
 			return err
 		}
 
@@ -1516,6 +1548,10 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return 0, nil, unlinkAt(t, dirFD, addr)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
+// LINT.IfChange
+
 // Truncate implements linux syscall truncate(2).
 func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -1614,6 +1650,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, nil
 }
 
+// LINT.ThenChange(vfs2/setstat.go)
+
 // Umask implements linux syscall umask(2).
 func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	mask := args[0].ModeT()
@@ -1621,6 +1659,8 @@ func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	return uintptr(mask), nil, nil
 }
 
+// LINT.IfChange
+
 // Change ownership of a file.
 //
 // uid and gid may be -1, in which case they will not be changed.
@@ -1987,6 +2027,10 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
 }
 
+// LINT.ThenChange(vfs2/setstat.go)
+
+// LINT.IfChange
+
 func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
 	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
 	if err != nil {
@@ -2042,6 +2086,8 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
 }
 
+// LINT.ThenChange(vfs2/filesystem.go)
+
 // Fallocate implements linux system call fallocate(2).
 func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index f66f4ffde..b126fecc0 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -27,6 +27,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // Getdents implements linux syscall getdents(2) for 64bit systems.
 func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -244,3 +246,5 @@ func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error {
 func (ds *direntSerializer) Written() int {
 	return ds.written
 }
+
+// LINT.ThenChange(vfs2/getdents.go)
diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go
index 297e920c4..3f7691eae 100644
--- a/pkg/sentry/syscalls/linux/sys_lseek.go
+++ b/pkg/sentry/syscalls/linux/sys_lseek.go
@@ -21,6 +21,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // Lseek implements linux syscall lseek(2).
 func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	fd := args[0].Int()
@@ -52,3 +54,5 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 	return uintptr(offset), nil, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
index ac934dc6f..9b4a5c3f1 100644
--- a/pkg/sentry/syscalls/linux/sys_mempolicy.go
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -162,10 +162,10 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 			if err != nil {
 				return 0, nil, err
 			}
-			policy = 0 // maxNodes == 1
+			policy = linux.MPOL_DEFAULT // maxNodes == 1
 		}
 		if mode != 0 {
-			if _, err := t.CopyOut(mode, policy); err != nil {
+			if _, err := policy.CopyOut(t, mode); err != nil {
 				return 0, nil, err
 			}
 		}
@@ -199,10 +199,10 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 		if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
 			return 0, nil, syserror.EINVAL
 		}
-		policy = 0 // maxNodes == 1
+		policy = linux.MPOL_DEFAULT // maxNodes == 1
 	}
 	if mode != 0 {
-		if _, err := t.CopyOut(mode, policy); err != nil {
+		if _, err := policy.CopyOut(t, mode); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -216,7 +216,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 
 // SetMempolicy implements the syscall set_mempolicy(2).
 func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	modeWithFlags := args[0].Int()
+	modeWithFlags := linux.NumaPolicy(args[0].Int())
 	nodemask := args[1].Pointer()
 	maxnode := args[2].Uint()
 
@@ -233,7 +233,7 @@ func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
 	length := args[1].Uint64()
-	mode := args[2].Int()
+	mode := linux.NumaPolicy(args[2].Int())
 	nodemask := args[3].Pointer()
 	maxnode := args[4].Uint()
 	flags := args[5].Uint()
@@ -258,9 +258,9 @@ func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	return 0, nil, err
 }
 
-func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags int32, nodemask usermem.Addr, maxnode uint32) (int32, uint64, error) {
-	flags := modeWithFlags & linux.MPOL_MODE_FLAGS
-	mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
+func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask usermem.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) {
+	flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS)
+	mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS)
 	if flags == linux.MPOL_MODE_FLAGS {
 		// Can't specify both mode flags simultaneously.
 		return 0, 0, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 9959f6e61..91694d374 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -35,6 +35,8 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
 	return uintptr(addr), nil, nil
 }
 
+// LINT.IfChange
+
 // Mmap implements linux syscall mmap(2).
 func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	prot := args[2].Int()
@@ -104,6 +106,8 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	return uintptr(rv), nil, err
 }
 
+// LINT.ThenChange(vfs2/mmap.go)
+
 // Munmap implements linux syscall munmap(2).
 func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 798344042..43c510930 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -24,6 +24,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // pipe2 implements the actual system call with flags.
 func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
 	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
@@ -45,10 +47,12 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
 	}
 
 	if _, err := t.CopyOut(addr, fds); err != nil {
-		// The files are not closed in this case, the exact semantics
-		// of this error case are not well defined, but they could have
-		// already been observed by user space.
-		return 0, syserror.EFAULT
+		for _, fd := range fds {
+			if file, _ := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
+		return 0, err
 	}
 	return 0, nil
 }
@@ -69,3 +73,5 @@ func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	n, err := pipe2(t, addr, flags)
 	return n, nil, err
 }
+
+// LINT.ThenChange(vfs2/pipe.go)
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 4f8762d7d..f0198141c 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -443,7 +443,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	}
 
 	if maskAddr != 0 {
-		mask, err := copyInSigSet(t, maskAddr, maskSize)
+		mask, err := CopyInSigSet(t, maskAddr, maskSize)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -525,7 +525,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 		}
 
 		if maskAddr != 0 {
-			mask, err := copyInSigSet(t, maskAddr, size)
+			mask, err := CopyInSigSet(t, maskAddr, size)
 			if err != nil {
 				return 0, nil, err
 			}
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 98db32d77..f92bf8096 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -135,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			}
 
 			// Set the underlying executable.
-			t.MemoryManager().SetExecutable(file.Dirent)
+			t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file))
 
 		case linux.PR_SET_MM_AUXV,
 			linux.PR_SET_MM_START_CODE,
@@ -160,8 +161,8 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
 			return 0, nil, syserror.EINVAL
 		}
-		// no_new_privs is assumed to always be set. See
-		// kernel.Task.updateCredsForExec.
+		// PR_SET_NO_NEW_PRIVS is assumed to always be set.
+		// See kernel.Task.updateCredsForExecLocked.
 		return 0, nil, nil
 
 	case linux.PR_GET_NO_NEW_PRIVS:
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 227692f06..071b4bacc 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// EventMaskRead contains events that can be triggered on reads.
 	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
@@ -94,8 +96,8 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
@@ -118,8 +120,8 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
@@ -388,3 +390,5 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i
 
 	return total, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index e08c333d6..d5d5b6959 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -197,7 +197,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	// saved set user IDs of the target process must match the real user ID of
 	// the caller and the real, effective, and saved set group IDs of the
 	// target process must match the real group ID of the caller."
-	if !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) {
+	if ot != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) {
 		cred, tcred := t.Credentials(), ot.Credentials()
 		if cred.RealKUID != tcred.RealKUID ||
 			cred.RealKUID != tcred.EffectiveKUID ||
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index 7e1747a0c..d2b0012ae 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -295,7 +295,7 @@ func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 	}
 	oldmask := t.SignalMask()
 	if setaddr != 0 {
-		mask, err := copyInSigSet(t, setaddr, sigsetsize)
+		mask, err := CopyInSigSet(t, setaddr, sigsetsize)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -355,7 +355,7 @@ func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
 	pending := t.PendingSignals()
-	_, err := t.CopyOut(addr, pending)
+	_, err := pending.CopyOut(t, addr)
 	return 0, nil, err
 }
 
@@ -366,7 +366,7 @@ func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 	timespec := args[2].Pointer()
 	sigsetsize := args[3].SizeT()
 
-	mask, err := copyInSigSet(t, sigset, sigsetsize)
+	mask, err := CopyInSigSet(t, sigset, sigsetsize)
 	if err != nil {
 		return 0, nil, err
 	}
@@ -392,7 +392,7 @@ func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 
 	if siginfo != 0 {
 		si.FixSignalCodeForUser()
-		if _, err := t.CopyOut(siginfo, si); err != nil {
+		if _, err := si.CopyOut(t, siginfo); err != nil {
 			return 0, nil, err
 		}
 	}
@@ -411,7 +411,7 @@ func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 	// same way), and that the code is in the allowed set. This same logic
 	// appears below in RtSigtgqueueinfo and should be kept in sync.
 	var info arch.SignalInfo
-	if _, err := t.CopyIn(infoAddr, &info); err != nil {
+	if _, err := info.CopyIn(t, infoAddr); err != nil {
 		return 0, nil, err
 	}
 	info.Signo = int32(sig)
@@ -455,7 +455,7 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker
 
 	// Copy in the info. See RtSigqueueinfo above.
 	var info arch.SignalInfo
-	if _, err := t.CopyIn(infoAddr, &info); err != nil {
+	if _, err := info.CopyIn(t, infoAddr); err != nil {
 		return 0, nil, err
 	}
 	info.Signo = int32(sig)
@@ -485,7 +485,7 @@ func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.
 
 	// Copy in the signal mask.
 	var mask linux.SignalSet
-	if _, err := t.CopyIn(sigset, &mask); err != nil {
+	if _, err := mask.CopyIn(t, sigset); err != nil {
 		return 0, nil, err
 	}
 	mask &^= kernel.UnblockableSignals
@@ -518,7 +518,7 @@ func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne
 // sharedSignalfd is shared between the two calls.
 func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
 	// Copy in the signal mask.
-	mask, err := copyInSigSet(t, sigset, sigsetsize)
+	mask, err := CopyInSigSet(t, sigset, sigsetsize)
 	if err != nil {
 		return 0, nil, err
 	}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 2919228d0..0760af77b 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -31,6 +31,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // minListenBacklog is the minimum reasonable backlog for listening sockets.
 const minListenBacklog = 8
 
@@ -244,7 +246,11 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 
 	// Copy the file descriptors out.
 	if _, err := t.CopyOut(socks, fds); err != nil {
-		// Note that we don't close files here; see pipe(2) also.
+		for _, fd := range fds {
+			if file, _ := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
 		return 0, nil, err
 	}
 
@@ -1128,3 +1134,5 @@ func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
 	return n, nil, err
 }
+
+// LINT.ThenChange(./vfs2/socket.go)
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index fd642834b..77c78889d 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -25,10 +25,14 @@ import (
 
 // doSplice implements a blocking splice operation.
 func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
-	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 {
+	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) {
 		return 0, syserror.EINVAL
 	}
 
+	if opts.Length > int64(kernel.MAX_RW_COUNT) {
+		opts.Length = int64(kernel.MAX_RW_COUNT)
+	}
+
 	var (
 		total int64
 		n     int64
@@ -76,6 +80,12 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 		}
 	}
 
+	if total > 0 {
+		// On Linux, inotify behavior is not very consistent with splice(2). We try
+		// our best to emulate Linux for very basic calls to splice, where for some
+		// reason, events are generated for output files, but not input files.
+		outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	}
 	return total, err
 }
 
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index c841abccb..46ebf27a2 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -23,6 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // Stat implements linux syscall stat(2).
 func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	addr := args[0].Pointer()
@@ -112,7 +114,9 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err
 	if err != nil {
 		return err
 	}
-	return copyOutStat(t, statAddr, d.Inode.StableAttr, uattr)
+	s := statFromAttrs(t, d.Inode.StableAttr, uattr)
+	_, err = s.CopyOut(t, statAddr)
+	return err
 }
 
 // fstat implements fstat for the given *fs.File.
@@ -121,7 +125,9 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
 	if err != nil {
 		return err
 	}
-	return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr)
+	s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
+	_, err = s.CopyOut(t, statAddr)
+	return err
 }
 
 // Statx implements linux syscall statx(2).
@@ -132,7 +138,10 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	mask := args[3].Uint()
 	statxAddr := args[4].Pointer()
 
-	if mask&linux.STATX__RESERVED > 0 {
+	if mask&linux.STATX__RESERVED != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 {
 		return 0, nil, syserror.EINVAL
 	}
 	if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
@@ -277,3 +286,5 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
 	_, err = t.CopyOut(addr, &statfs)
 	return err
 }
+
+// LINT.ThenChange(vfs2/stat.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
index 75a567bd4..0a04a6113 100644
--- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go
+++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
@@ -12,64 +12,34 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//+build amd64
+// +build amd64
 
 package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
-	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
-	// Dev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
-	// Ino (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
-	// Nlink (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links)
-	// Mode (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
-	// UID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-	// GID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
-	// Padding (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
-	// Rdev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
-	// Size (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
-	// Blksize (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize))
-	// Blocks (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
-	// ATime
-	atime := uattr.AccessTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
-	// MTime
-	mtime := uattr.ModificationTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
-	// CTime
-	ctime := uattr.StatusChangeTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
-	_, err := t.CopyOutBytes(dst, b)
-	return err
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uattr.Links,
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: sattr.BlockSize,
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
 }
+
+// LINT.ThenChange(vfs2/stat_amd64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
index 80c98d05c..5a3b1bfad 100644
--- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go
+++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
@@ -12,66 +12,34 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//+build arm64
+// +build arm64
 
 package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
-	b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
-	// Dev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
-	// Ino (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
-	// Mode (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
-	// Nlink (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links))
-	// UID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-	// GID (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
-	// Rdev (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
-	// Padding (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, 0)
-	// Size (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
-	// Blksize (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize))
-	// Padding (uint32)
-	b = binary.AppendUint32(b, usermem.ByteOrder, 0)
-	// Blocks (uint64)
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
-	// ATime
-	atime := uattr.AccessTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
-	// MTime
-	mtime := uattr.ModificationTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
-	// CTime
-	ctime := uattr.StatusChangeTime.Timespec()
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
-	b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
-	_, err := t.CopyOutBytes(dst, b)
-	return err
+// LINT.IfChange
+
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+	return linux.Stat{
+		Dev:     sattr.DeviceID,
+		Ino:     sattr.InodeID,
+		Nlink:   uint32(uattr.Links),
+		Mode:    sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+		Size:    uattr.Size,
+		Blksize: int32(sattr.BlockSize),
+		Blocks:  uattr.Usage / 512,
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	}
 }
+
+// LINT.ThenChange(vfs2/stat_arm64.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
index 3e55235bd..5ad465ae3 100644
--- a/pkg/sentry/syscalls/linux/sys_sync.go
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// LINT.IfChange
+
 // Sync implements linux system call sync(2).
 func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	t.MountNamespace().SyncAll(t)
@@ -135,3 +137,5 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
 
 	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
 }
+
+// LINT.ThenChange(vfs2/sync.go)
diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go
index a65b560c8..297de052a 100644
--- a/pkg/sentry/syscalls/linux/sys_sysinfo.go
+++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go
@@ -29,13 +29,18 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	mf.UpdateUsage()
 	_, totalUsage := usage.MemoryAccounting.Copy()
 	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
+	memFree := totalSize - totalUsage
+	if memFree > totalSize {
+		// Underflow.
+		memFree = 0
+	}
 
 	// Only a subset of the fields in sysinfo_t make sense to return.
 	si := linux.Sysinfo{
 		Procs:    uint16(len(t.PIDNamespace().Tasks())),
 		Uptime:   t.Kernel().MonotonicClock().Now().Seconds(),
 		TotalRAM: totalSize,
-		FreeRAM:  totalSize - totalUsage,
+		FreeRAM:  memFree,
 		Unit:     1,
 	}
 	_, err := t.CopyOut(addr, si)
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 0c9e2255d..00915fdde 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/loader"
@@ -119,7 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 	defer root.DecRef()
 
 	var wd *fs.Dirent
-	var executable *fs.File
+	var executable fsbridge.File
 	var closeOnExec bool
 	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
 		// Even if the pathname is absolute, we may still need the wd
@@ -136,7 +137,15 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		closeOnExec = fdFlags.CloseOnExec
 
 		if atEmptyPath && len(pathname) == 0 {
-			executable = f
+			// TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+			// not read. However, our backing filesystems may prevent us from reading
+			// the file without read permission. Additionally, a task with a
+			// non-readable executable has additional constraints on access via
+			// ptrace and procfs.
+			if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil {
+				return 0, nil, err
+			}
+			executable = fsbridge.NewFSFile(f)
 		} else {
 			wd = f.Dirent
 			wd.IncRef()
@@ -152,9 +161,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 	// Load the new TaskContext.
 	remainingTraversals := uint(linux.MaxSymlinkTraversals)
 	loadArgs := loader.LoadArgs{
-		Mounts:              t.MountNamespace(),
-		Root:                root,
-		WorkingDirectory:    wd,
+		Opener:              fsbridge.NewFSLookup(t.MountNamespace(), root, wd),
 		RemainingTraversals: &remainingTraversals,
 		ResolveFinal:        resolveFinal,
 		Filename:            pathname,
diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
index b3eb96a1c..b3eb96a1c 100644
--- a/pkg/sentry/syscalls/linux/sys_tls.go
+++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go
diff --git a/pkg/sentry/syscalls/linux/sys_tls_arm64.go b/pkg/sentry/syscalls/linux/sys_tls_arm64.go
new file mode 100644
index 000000000..fb08a356e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_tls_arm64.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//+build arm64
+
+package linux
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ArchPrctl is not defined for ARM64.
+func ArchPrctl(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, syserror.ENOSYS
+}
diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go
index a393e28c1..e9d702e8e 100644
--- a/pkg/sentry/syscalls/linux/sys_utsname.go
+++ b/pkg/sentry/syscalls/linux/sys_utsname.go
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 arm64
-
 package linux
 
 import (
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index aba892939..6ec0de96e 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -28,6 +28,8 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// LINT.IfChange
+
 const (
 	// EventMaskWrite contains events that can be triggered on writes.
 	//
@@ -85,8 +87,8 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
@@ -358,3 +360,5 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (
 
 	return total, err
 }
+
+// LINT.ThenChange(vfs2/read_write.go)
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 9d8140b8a..c24946160 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -25,6 +25,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // GetXattr implements linux syscall getxattr(2).
 func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	return getXattrFromPath(t, args, true)
@@ -205,7 +207,11 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si
 		return syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.SetXattr(t, d, name, value, flags)
+	if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil {
+		return err
+	}
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
 }
 
 func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
@@ -416,5 +422,11 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
 		return syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.RemoveXattr(t, d, name)
+	if err := d.Inode.RemoveXattr(t, d, name); err != nil {
+		return err
+	}
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
 }
+
+// LINT.ThenChange(vfs2/xattr.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index 6b8a00b6e..c0d005247 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -5,18 +5,64 @@ package(licenses = ["notice"])
 go_library(
     name = "vfs2",
     srcs = [
-        "linux64.go",
-        "linux64_override_amd64.go",
-        "linux64_override_arm64.go",
-        "sys_read.go",
+        "epoll.go",
+        "eventfd.go",
+        "execve.go",
+        "fd.go",
+        "filesystem.go",
+        "fscontext.go",
+        "getdents.go",
+        "inotify.go",
+        "ioctl.go",
+        "memfd.go",
+        "mmap.go",
+        "mount.go",
+        "path.go",
+        "pipe.go",
+        "poll.go",
+        "read_write.go",
+        "setstat.go",
+        "signal.go",
+        "socket.go",
+        "splice.go",
+        "stat.go",
+        "stat_amd64.go",
+        "stat_arm64.go",
+        "sync.go",
+        "timerfd.go",
+        "vfs2.go",
+        "xattr.go",
     ],
+    marshal = True,
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/bits",
+        "//pkg/fspath",
+        "//pkg/gohacks",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/eventfd",
+        "//pkg/sentry/fsimpl/pipefs",
+        "//pkg/sentry/fsimpl/signalfd",
+        "//pkg/sentry/fsimpl/timerfd",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/socket",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/syscalls",
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
new file mode 100644
index 000000000..34c90ae3e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -0,0 +1,228 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"math"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes()
+
+// EpollCreate1 implements Linux syscall epoll_create1(2).
+func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if flags&^linux.EPOLL_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCreate implements Linux syscall epoll_create(2).
+func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[0].Int()
+
+	// "Since Linux 2.6.8, the size argument is ignored, but must be greater
+	// than zero" - epoll_create(2)
+	if size <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file, err := t.Kernel().VFS().NewEpollInstanceFD()
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// EpollCtl implements Linux syscall epoll_ctl(2).
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	op := args[1].Int()
+	fd := args[2].Int()
+	eventAddr := args[3].Pointer()
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+	if epfile == file {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var event linux.EpollEvent
+	switch op {
+	case linux.EPOLL_CTL_ADD:
+		if _, err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, ep.AddInterest(file, fd, event)
+	case linux.EPOLL_CTL_DEL:
+		return 0, nil, ep.DeleteInterest(file, fd)
+	case linux.EPOLL_CTL_MOD:
+		if _, err := event.CopyIn(t, eventAddr); err != nil {
+			return 0, nil, err
+		}
+		return 0, nil, ep.ModifyInterest(file, fd, event)
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// EpollWait implements Linux syscall epoll_wait(2).
+func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := args[0].Int()
+	eventsAddr := args[1].Pointer()
+	maxEvents := int(args[2].Int())
+	timeout := int(args[3].Int())
+
+	var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
+	if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
+		return 0, nil, syserror.EINVAL
+	}
+
+	epfile := t.GetFileVFS2(epfd)
+	if epfile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer epfile.DecRef()
+	ep, ok := epfile.Impl().(*vfs.EpollInstance)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
+	// maxEvents), so that the buffer can be allocated on the stack.
+	var (
+		events       [16]linux.EpollEvent
+		total        int
+		ch           chan struct{}
+		haveDeadline bool
+		deadline     ktime.Time
+	)
+	for {
+		batchEvents := len(events)
+		if batchEvents > maxEvents {
+			batchEvents = maxEvents
+		}
+		n := ep.ReadEvents(events[:batchEvents])
+		maxEvents -= n
+		if n != 0 {
+			// Copy what we read out.
+			copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
+			copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
+			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
+			total += copiedEvents
+			if err != nil {
+				if total != 0 {
+					return uintptr(total), nil, nil
+				}
+				return 0, nil, err
+			}
+			// If we've filled the application's event buffer, we're done.
+			if maxEvents == 0 {
+				return uintptr(total), nil, nil
+			}
+			// Loop if we read a full batch, under the expectation that there
+			// may be more events to read.
+			if n == batchEvents {
+				continue
+			}
+		}
+		// We get here if n != batchEvents. If we read any number of events
+		// (just now, or in a previous iteration of this loop), or if timeout
+		// is 0 (such that epoll_wait should be non-blocking), return the
+		// events we've read so far to the application.
+		if total != 0 || timeout == 0 {
+			return uintptr(total), nil, nil
+		}
+		// In the first iteration of this loop, register with the epoll
+		// instance for readability events, but then immediately continue the
+		// loop since we need to retry ReadEvents() before blocking. In all
+		// subsequent iterations, block until events are available, the timeout
+		// expires, or an interrupt arrives.
+		if ch == nil {
+			var w waiter.Entry
+			w, ch = waiter.NewChannelEntry(nil)
+			epfile.EventRegister(&w, waiter.EventIn)
+			defer epfile.EventUnregister(&w)
+		} else {
+			// Set up the timer if a timeout was specified.
+			if timeout > 0 && !haveDeadline {
+				timeoutDur := time.Duration(timeout) * time.Millisecond
+				deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
+				haveDeadline = true
+			}
+			if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+				if err == syserror.ETIMEDOUT {
+					err = nil
+				}
+				// total must be 0 since otherwise we would have returned
+				// above.
+				return 0, nil, err
+			}
+		}
+	}
+}
+
+// EpollPwait implements Linux syscall epoll_pwait(2).
+func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	maskAddr := args[4].Pointer()
+	maskSize := uint(args[5].Uint())
+
+	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+		return 0, nil, err
+	}
+
+	return EpollWait(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
new file mode 100644
index 000000000..aff1a2070
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Eventfd2 implements linux syscall eventfd2(2).
+func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	initVal := uint64(args[0].Uint())
+	flags := uint(args[1].Uint())
+	allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC)
+
+	if flags & ^allOps != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	vfsObj := t.Kernel().VFS()
+	fileFlags := uint32(linux.O_RDWR)
+	if flags&linux.EFD_NONBLOCK != 0 {
+		fileFlags |= linux.O_NONBLOCK
+	}
+	semMode := flags&linux.EFD_SEMAPHORE != 0
+	eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer eventfd.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{
+		CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// Eventfd implements linux syscall eventfd(2).
+func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	args[1].Value = 0
+	return Eventfd2(t, args)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go
new file mode 100644
index 000000000..aef0078a8
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/execve.go
@@ -0,0 +1,137 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Execve implements linux syscall execve(2).
+func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathnameAddr := args[0].Pointer()
+	argvAddr := args[1].Pointer()
+	envvAddr := args[2].Pointer()
+	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
+}
+
+// Execveat implements linux syscall execveat(2).
+func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathnameAddr := args[1].Pointer()
+	argvAddr := args[2].Pointer()
+	envvAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
+}
+
+func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
+	if err != nil {
+		return 0, nil, err
+	}
+	var argv, envv []string
+	if argvAddr != 0 {
+		var err error
+		argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+	if envvAddr != 0 {
+		var err error
+		envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	var executable fsbridge.File
+	closeOnExec := false
+	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
+		// We must open the executable ourselves since dirfd is used as the
+		// starting point while resolving path, but the task working directory
+		// is used as the starting point while resolving interpreters (Linux:
+		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
+		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
+		// incapable of handling this correctly.
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd)
+		if dirfile == nil {
+			return 0, nil, syserror.EBADF
+		}
+		start := dirfile.VirtualDentry()
+		start.IncRef()
+		dirfile.DecRef()
+		closeOnExec = dirfileFlags.CloseOnExec
+		file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               path,
+			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		}, &vfs.OpenOptions{
+			Flags:    linux.O_RDONLY,
+			FileExec: true,
+		})
+		start.DecRef()
+		if err != nil {
+			return 0, nil, err
+		}
+		defer file.DecRef()
+		executable = fsbridge.NewVFSFile(file)
+	}
+
+	// Load the new TaskContext.
+	mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
+	defer mntns.DecRef()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	defer wd.DecRef()
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	loadArgs := loader.LoadArgs{
+		Opener:              fsbridge.NewVFSLookup(mntns, root, wd),
+		RemainingTraversals: &remainingTraversals,
+		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+		Filename:            pathname,
+		File:                executable,
+		CloseOnExec:         closeOnExec,
+		Argv:                argv,
+		Envv:                envv,
+		Features:            t.Arch().FeatureSet(),
+	}
+
+	tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
+	if se != nil {
+		return 0, nil, se.ToError()
+	}
+
+	ctrl, err := t.Execve(tc)
+	return 0, ctrl, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
new file mode 100644
index 000000000..ca0f7fd1e
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -0,0 +1,174 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Close implements Linux syscall close(2).
+func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	// Note that Remove provides a reference on the file that we may use to
+	// flush. It is still active until we drop the final reference below
+	// (and other reference-holding operations complete).
+	_, file := t.FDTable().Remove(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := file.OnClose(t)
+	return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file)
+}
+
+// Dup implements Linux syscall dup(2).
+func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{})
+	if err != nil {
+		return 0, nil, syserror.EMFILE
+	}
+	return uintptr(newFD), nil, nil
+}
+
+// Dup2 implements Linux syscall dup2(2).
+func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+
+	if oldfd == newfd {
+		// As long as oldfd is valid, dup2() does nothing and returns newfd.
+		file := t.GetFileVFS2(oldfd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		file.DecRef()
+		return uintptr(newfd), nil, nil
+	}
+
+	return dup3(t, oldfd, newfd, 0)
+}
+
+// Dup3 implements Linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldfd := args[0].Int()
+	newfd := args[1].Int()
+	flags := args[2].Uint()
+
+	if oldfd == newfd {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return dup3(t, oldfd, newfd, flags)
+}
+
+func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
+	if flags&^linux.O_CLOEXEC != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(oldfd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(newfd), nil, nil
+}
+
+// Fcntl implements linux syscall fcntl(2).
+func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	cmd := args[1].Int()
+
+	file, flags := t.FDTable().GetVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	switch cmd {
+	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
+		minfd := args[2].Int()
+		fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{
+			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
+		})
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(fd), nil, nil
+	case linux.F_GETFD:
+		return uintptr(flags.ToLinuxFDFlags()), nil, nil
+	case linux.F_SETFD:
+		flags := args[2].Uint()
+		t.FDTable().SetFlags(fd, kernel.FDFlags{
+			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
+		})
+		return 0, nil, nil
+	case linux.F_GETFL:
+		return uintptr(file.StatusFlags()), nil, nil
+	case linux.F_SETFL:
+		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+	case linux.F_SETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		n, err := pipefile.SetPipeSize(int64(args[2].Int()))
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(n), nil, nil
+	case linux.F_GETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		return uintptr(pipefile.PipeSize()), nil, nil
+	case linux.F_GET_SEALS:
+		val, err := tmpfs.GetSeals(file)
+		return uintptr(val), nil, err
+	case linux.F_ADD_SEALS:
+		if !file.IsWritable() {
+			return 0, nil, syserror.EPERM
+		}
+		err := tmpfs.AddSeals(file, args[2].Uint())
+		return 0, nil, err
+	default:
+		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
+		return 0, nil, syserror.EINVAL
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
new file mode 100644
index 000000000..46d3e189c
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -0,0 +1,326 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Link implements Linux syscall link(2).
+func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldpathAddr := args[0].Pointer()
+	newpathAddr := args[1].Pointer()
+	return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Linkat implements Linux syscall linkat(2).
+func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Int()
+	return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+	if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
+		return syserror.ENOENT
+	}
+
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
+}
+
+// Mkdir implements Linux syscall mkdir(2).
+func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Mkdirat implements Linux syscall mkdirat(2).
+func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	return 0, nil, mkdirat(t, dirfd, addr, mode)
+}
+
+func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
+		Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+}
+
+// Mknod implements Linux syscall mknod(2).
+func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	dev := args[2].Uint()
+	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev)
+}
+
+// Mknodat implements Linux syscall mknodat(2).
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+	dev := args[3].Uint()
+	return 0, nil, mknodat(t, dirfd, addr, mode, dev)
+}
+
+func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	major, minor := linux.DecodeDeviceID(dev)
+	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
+		Mode:     linux.FileMode(mode &^ t.FSContext().Umask()),
+		DevMajor: uint32(major),
+		DevMinor: minor,
+	})
+}
+
+// Open implements Linux syscall open(2).
+func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+	mode := args[2].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, flags, mode)
+}
+
+// Openat implements Linux syscall openat(2).
+func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	flags := args[2].Uint()
+	mode := args[3].ModeT()
+	return openat(t, dirfd, addr, flags, mode)
+}
+
+// Creat implements Linux syscall creat(2).
+func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode)
+}
+
+func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0))
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
+		Flags: flags | linux.O_LARGEFILE,
+		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	return uintptr(fd), nil, err
+}
+
+// Rename implements Linux syscall rename(2).
+func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	oldpathAddr := args[0].Pointer()
+	newpathAddr := args[1].Pointer()
+	return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
+}
+
+// Renameat implements Linux syscall renameat(2).
+func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */)
+}
+
+// Renameat2 implements Linux syscall renameat2(2).
+func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	olddirfd := args[0].Int()
+	oldpathAddr := args[1].Pointer()
+	newdirfd := args[2].Int()
+	newpathAddr := args[3].Pointer()
+	flags := args[4].Uint()
+	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
+}
+
+func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error {
+	oldpath, err := copyInPath(t, oldpathAddr)
+	if err != nil {
+		return err
+	}
+	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
+	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer oldtpop.Release()
+
+	newpath, err := copyInPath(t, newpathAddr)
+	if err != nil {
+		return err
+	}
+	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer newtpop.Release()
+
+	return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
+		Flags: flags,
+	})
+}
+
+// Rmdir implements Linux syscall rmdir(2).
+func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlink implements Linux syscall unlink(2).
+func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr)
+}
+
+func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
+}
+
+// Unlinkat implements Linux syscall unlinkat(2).
+func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if flags&^linux.AT_REMOVEDIR != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if flags&linux.AT_REMOVEDIR != 0 {
+		return 0, nil, rmdirat(t, dirfd, pathAddr)
+	}
+	return 0, nil, unlinkat(t, dirfd, pathAddr)
+}
+
+// Symlink implements Linux syscall symlink(2).
+func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr := args[0].Pointer()
+	linkpathAddr := args[1].Pointer()
+	return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr)
+}
+
+// Symlinkat implements Linux syscall symlinkat(2).
+func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	targetAddr := args[0].Pointer()
+	newdirfd := args[1].Int()
+	linkpathAddr := args[2].Pointer()
+	return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr)
+}
+
+func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error {
+	target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
+	if err != nil {
+		return err
+	}
+	linkpath, err := copyInPath(t, linkpathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+	return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
new file mode 100644
index 000000000..317409a18
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getcwd implements Linux syscall getcwd(2).
+func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	size := args[1].SizeT()
+
+	root := t.FSContext().RootDirectoryVFS2()
+	wd := t.FSContext().WorkingDirectoryVFS2()
+	s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
+	root.DecRef()
+	wd.DecRef()
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Note this is >= because we need a terminator.
+	if uint(len(s)) >= size {
+		return 0, nil, syserror.ERANGE
+	}
+
+	// Construct a byte slice containing a NUL terminator.
+	buf := t.CopyScratchBuffer(len(s) + 1)
+	copy(buf, s)
+	buf[len(buf)-1] = 0
+
+	// Write the pathname slice.
+	n, err := t.CopyOutBytes(addr, buf)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Chdir implements Linux syscall chdir(2).
+func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
+
+// Fchdir implements Linux syscall fchdir(2).
+func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetWorkingDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
+
+// Chroot implements Linux syscall chroot(2).
+func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+
+	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
+		return 0, nil, syserror.EPERM
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
+		CheckSearchable: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	t.FSContext().SetRootDirectoryVFS2(vd)
+	vd.DecRef()
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
new file mode 100644
index 000000000..c7c7bf7ce
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -0,0 +1,161 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Getdents implements Linux syscall getdents(2).
+func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, false /* isGetdents64 */)
+}
+
+// Getdents64 implements Linux syscall getdents64(2).
+func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getdents(t, args, true /* isGetdents64 */)
+}
+
+func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := int(args[2].Uint())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	cb := getGetdentsCallback(t, addr, size, isGetdents64)
+	err := file.IterDirents(t, cb)
+	n := size - cb.remaining
+	putGetdentsCallback(cb)
+	if n == 0 {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+type getdentsCallback struct {
+	t            *kernel.Task
+	addr         usermem.Addr
+	remaining    int
+	isGetdents64 bool
+}
+
+var getdentsCallbackPool = sync.Pool{
+	New: func() interface{} {
+		return &getdentsCallback{}
+	},
+}
+
+func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback {
+	cb := getdentsCallbackPool.Get().(*getdentsCallback)
+	*cb = getdentsCallback{
+		t:            t,
+		addr:         addr,
+		remaining:    size,
+		isGetdents64: isGetdents64,
+	}
+	return cb
+}
+
+func putGetdentsCallback(cb *getdentsCallback) {
+	cb.t = nil
+	getdentsCallbackPool.Put(cb)
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
+	var buf []byte
+	if cb.isGetdents64 {
+		// struct linux_dirent64 {
+		//     ino64_t        d_ino;    /* 64-bit inode number */
+		//     off64_t        d_off;    /* 64-bit offset to next structure */
+		//     unsigned short d_reclen; /* Size of this dirent */
+		//     unsigned char  d_type;   /* File type */
+		//     char           d_name[]; /* Filename (null-terminated) */
+		// };
+		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+		size = (size + 7) &^ 7 // round up to multiple of 8
+		if size > cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		buf[18] = dirent.Type
+		copy(buf[19:], dirent.Name)
+		// Zero out all remaining bytes in buf, including the NUL terminator
+		// after dirent.Name.
+		bufTail := buf[19+len(dirent.Name):]
+		for i := range bufTail {
+			bufTail[i] = 0
+		}
+	} else {
+		// struct linux_dirent {
+		//     unsigned long  d_ino;     /* Inode number */
+		//     unsigned long  d_off;     /* Offset to next linux_dirent */
+		//     unsigned short d_reclen;  /* Length of this linux_dirent */
+		//     char           d_name[];  /* Filename (null-terminated) */
+		//                       /* length is actually (d_reclen - 2 -
+		//                          offsetof(struct linux_dirent, d_name)) */
+		//     /*
+		//     char           pad;       // Zero padding byte
+		//     char           d_type;    // File type (only since Linux
+		//                               // 2.6.4); offset is (d_reclen - 1)
+		//     */
+		// };
+		if cb.t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
+		}
+		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
+		size = (size + 7) &^ 7 // round up to multiple of sizeof(long)
+		if size > cb.remaining {
+			return syserror.EINVAL
+		}
+		buf = cb.t.CopyScratchBuffer(size)
+		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
+		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
+		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
+		copy(buf[18:], dirent.Name)
+		// Zero out all remaining bytes in buf, including the NUL terminator
+		// after dirent.Name and the zero padding byte between the name and
+		// dirent type.
+		bufTail := buf[18+len(dirent.Name) : size-1]
+		for i := range bufTail {
+			bufTail[i] = 0
+		}
+		buf[size-1] = dirent.Type
+	}
+	n, err := cb.t.CopyOutBytes(cb.addr, buf)
+	if err != nil {
+		// Don't report partially-written dirents by advancing cb.addr or
+		// cb.remaining.
+		return err
+	}
+	cb.addr += usermem.Addr(n)
+	cb.remaining -= n
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
new file mode 100644
index 000000000..7d50b6a16
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -0,0 +1,134 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC
+
+// InotifyInit1 implements the inotify_init1() syscalls.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if flags&^allFlags != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
+	if err != nil {
+		return 0, nil, err
+	}
+	defer ino.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
+		CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+	})
+
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscalls.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	args[0].Value = 0
+	return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
+	f := t.GetFileVFS2(fd)
+	if f == nil {
+		// Invalid fd.
+		return nil, nil, syserror.EBADF
+	}
+
+	ino, ok := f.Impl().(*vfs.Inotify)
+	if !ok {
+		// Not an inotify fd.
+		f.DecRef()
+		return nil, nil, syserror.EINVAL
+	}
+
+	return ino, f, nil
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	mask := args[2].Uint()
+
+	// "EINVAL: The given event mask contains no valid events."
+	// -- inotify_add_watch(2)
+	if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+	//  -- inotify(7)
+	follow := followFinalSymlink
+	if mask&linux.IN_DONT_FOLLOW == 0 {
+		follow = nofollowFinalSymlink
+	}
+
+	ino, f, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer f.DecRef()
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	if mask&linux.IN_ONLYDIR != 0 {
+		path.Dir = true
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+	d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer d.DecRef()
+
+	fd = ino.AddWatch(d.Dentry(), mask)
+	return uintptr(fd), nil, err
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	wd := args[1].Int()
+
+	ino, f, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer f.DecRef()
+	return 0, nil, ino.RmWatch(wd)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
new file mode 100644
index 000000000..5a2418da9
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Ioctl implements Linux syscall ioctl(2).
+func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	ret, err := file.Ioctl(t, t.MemoryManager(), args)
+	return ret, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go
new file mode 100644
index 000000000..bbe248d17
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+	memfdPrefix     = "memfd:"
+	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix)
+	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
+)
+
+// MemfdCreate implements the linux syscall memfd_create(2).
+func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+
+	if flags&^memfdAllFlags != 0 {
+		// Unknown bits in flags.
+		return 0, nil, syserror.EINVAL
+	}
+
+	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
+	cloExec := flags&linux.MFD_CLOEXEC != 0
+
+	name, err := t.CopyInString(addr, memfdMaxNameLen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	shmMount := t.Kernel().ShmMount()
+	file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: cloExec,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
new file mode 100644
index 000000000..60a43f0a0
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -0,0 +1,92 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mmap implements Linux syscall mmap(2).
+func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	prot := args[2].Int()
+	flags := args[3].Int()
+	fd := args[4].Int()
+	fixed := flags&linux.MAP_FIXED != 0
+	private := flags&linux.MAP_PRIVATE != 0
+	shared := flags&linux.MAP_SHARED != 0
+	anon := flags&linux.MAP_ANONYMOUS != 0
+	map32bit := flags&linux.MAP_32BIT != 0
+
+	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
+	if private == shared {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := memmap.MMapOpts{
+		Length:   args[1].Uint64(),
+		Offset:   args[5].Uint64(),
+		Addr:     args[0].Pointer(),
+		Fixed:    fixed,
+		Unmap:    fixed,
+		Map32Bit: map32bit,
+		Private:  private,
+		Perms: usermem.AccessType{
+			Read:    linux.PROT_READ&prot != 0,
+			Write:   linux.PROT_WRITE&prot != 0,
+			Execute: linux.PROT_EXEC&prot != 0,
+		},
+		MaxPerms:  usermem.AnyAccess,
+		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
+		Precommit: linux.MAP_POPULATE&flags != 0,
+	}
+	if linux.MAP_LOCKED&flags != 0 {
+		opts.MLockMode = memmap.MLockEager
+	}
+	defer func() {
+		if opts.MappingIdentity != nil {
+			opts.MappingIdentity.DecRef()
+		}
+	}()
+
+	if !anon {
+		// Convert the passed FD to a file reference.
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		// mmap unconditionally requires that the FD is readable.
+		if !file.IsReadable() {
+			return 0, nil, syserror.EACCES
+		}
+		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
+		if shared && !file.IsWritable() {
+			opts.MaxPerms.Write = false
+		}
+
+		if err := file.ConfigureMMap(t, &opts); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	rv, err := t.MemoryManager().MMap(t, opts)
+	return uintptr(rv), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
new file mode 100644
index 000000000..adeaa39cc
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -0,0 +1,145 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2).
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sourceAddr := args[0].Pointer()
+	targetAddr := args[1].Pointer()
+	typeAddr := args[2].Pointer()
+	flags := args[3].Uint64()
+	dataAddr := args[4].Pointer()
+
+	// For null-terminated strings related to mount(2), Linux copies in at most
+	// a page worth of data. See fs/namespace.c:copy_mount_string().
+	fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+	source, err := t.CopyInString(sourceAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	targetPath, err := copyInPath(t, targetAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	data := ""
+	if dataAddr != 0 {
+		// In Linux, a full page is always copied in regardless of null
+		// character placement, and the address is passed to each file system.
+		// Most file systems always treat this data as a string, though, and so
+		// do all of the ones we implement.
+		data, err = t.CopyInString(dataAddr, usermem.PageSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// Ignore magic value that was required before Linux 2.4.
+	if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+		flags = flags &^ linux.MS_MGC_MSK
+	}
+
+	// Must have CAP_SYS_ADMIN in the current mount namespace's associated user
+	// namespace.
+	creds := t.Credentials()
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+		return 0, nil, syserror.EPERM
+	}
+
+	const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+		linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+		linux.MS_UNBINDABLE | linux.MS_MOVE
+
+	// Silently allow MS_NOSUID, since we don't implement set-id bits
+	// anyway.
+	const unsupportedFlags = linux.MS_NODEV |
+		linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+	// Linux just allows passing any flags to mount(2) - it won't fail when
+	// unknown or unsupported flags are passed. Since we don't implement
+	// everything, we fail explicitly on flags that are unimplemented.
+	if flags&(unsupportedOps|unsupportedFlags) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var opts vfs.MountOptions
+	if flags&linux.MS_NOATIME == linux.MS_NOATIME {
+		opts.Flags.NoATime = true
+	}
+	if flags&linux.MS_NOEXEC == linux.MS_NOEXEC {
+		opts.Flags.NoExec = true
+	}
+	if flags&linux.MS_RDONLY == linux.MS_RDONLY {
+		opts.ReadOnly = true
+	}
+	opts.GetFilesystemOptions.Data = data
+
+	target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer target.Release()
+
+	return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+}
+
+// Umount2 implements Linux syscall umount2(2).
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Int()
+
+	// Must have CAP_SYS_ADMIN in the mount namespace's associated user
+	// namespace.
+	//
+	// Currently, this is always the init task's user namespace.
+	creds := t.Credentials()
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+		return 0, nil, syserror.EPERM
+	}
+
+	const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+	if flags&unsupported != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	opts := vfs.UmountOptions{
+		Flags: uint32(flags),
+	}
+
+	return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go
new file mode 100644
index 000000000..97da6c647
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/path.go
@@ -0,0 +1,94 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) {
+	pathname, err := t.CopyInString(addr, linux.PATH_MAX)
+	if err != nil {
+		return fspath.Path{}, err
+	}
+	return fspath.Parse(pathname), nil
+}
+
+type taskPathOperation struct {
+	pop          vfs.PathOperation
+	haveStartRef bool
+}
+
+func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) {
+	root := t.FSContext().RootDirectoryVFS2()
+	start := root
+	haveStartRef := false
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			root.DecRef()
+			return taskPathOperation{}, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			haveStartRef = true
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				root.DecRef()
+				return taskPathOperation{}, syserror.EBADF
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			haveStartRef = true
+			dirfile.DecRef()
+		}
+	}
+	return taskPathOperation{
+		pop: vfs.PathOperation{
+			Root:               root,
+			Start:              start,
+			Path:               path,
+			FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+		},
+		haveStartRef: haveStartRef,
+	}, nil
+}
+
+func (tpop *taskPathOperation) Release() {
+	tpop.pop.Root.DecRef()
+	if tpop.haveStartRef {
+		tpop.pop.Start.DecRef()
+		tpop.haveStartRef = false
+	}
+}
+
+type shouldAllowEmptyPath bool
+
+const (
+	disallowEmptyPath shouldAllowEmptyPath = false
+	allowEmptyPath    shouldAllowEmptyPath = true
+)
+
+type shouldFollowFinalSymlink bool
+
+const (
+	nofollowFinalSymlink shouldFollowFinalSymlink = false
+	followFinalSymlink   shouldFollowFinalSymlink = true
+)
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
new file mode 100644
index 000000000..4a01e4209
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Pipe implements Linux syscall pipe(2).
+func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	return 0, nil, pipe2(t, addr, 0)
+}
+
+// Pipe2 implements Linux syscall pipe2(2).
+func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Int()
+	return 0, nil, pipe2(t, addr, flags)
+}
+
+func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
+	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
+		return syserror.EINVAL
+	}
+	r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK))
+	defer r.DecRef()
+	defer w.DecRef()
+
+	fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	if err != nil {
+		return err
+	}
+	if _, err := t.CopyOut(addr, fds); err != nil {
+		for _, fd := range fds {
+			if _, file := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
+		return err
+	}
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go
new file mode 100644
index 000000000..ff1b25d7b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/poll.go
@@ -0,0 +1,586 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileCap is the maximum allowable files for poll & select. This has no
+// equivalent in Linux; it exists in gVisor since allocation failure in Go is
+// unrecoverable.
+const fileCap = 1024 * 1024
+
+// Masks for "readable", "writable", and "exceptional" events as defined by
+// select(2).
+const (
+	// selectReadEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLIN_SET.
+	selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
+
+	// selectWriteEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLOUT_SET.
+	selectWriteEvents = linux.POLLOUT | linux.POLLERR
+
+	// selectExceptEvents is analogous to the Linux kernel's
+	// fs/select.c:POLLEX_SET.
+	selectExceptEvents = linux.POLLPRI
+)
+
+// pollState tracks the associated file description and waiter of a PollFD.
+type pollState struct {
+	file   *vfs.FileDescription
+	waiter waiter.Entry
+}
+
+// initReadiness gets the current ready mask for the file represented by the FD
+// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is
+// used to register with the file for event notifications, and a reference to
+// the file is stored in "state".
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
+	if pfd.FD < 0 {
+		pfd.REvents = 0
+		return
+	}
+
+	file := t.GetFileVFS2(pfd.FD)
+	if file == nil {
+		pfd.REvents = linux.POLLNVAL
+		return
+	}
+
+	if ch == nil {
+		defer file.DecRef()
+	} else {
+		state.file = file
+		state.waiter, _ = waiter.NewChannelEntry(ch)
+		file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events)))
+	}
+
+	r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events)))
+	pfd.REvents = int16(r.ToLinux()) & pfd.Events
+}
+
+// releaseState releases all the pollState in "state".
+func releaseState(state []pollState) {
+	for i := range state {
+		if state[i].file != nil {
+			state[i].file.EventUnregister(&state[i].waiter)
+			state[i].file.DecRef()
+		}
+	}
+}
+
+// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout"
+// when "timeout" is greater than zero.
+//
+// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or
+// positive if interrupted by a signal.
+func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
+	var ch chan struct{}
+	if timeout != 0 {
+		ch = make(chan struct{}, 1)
+	}
+
+	// Register for event notification in the files involved if we may
+	// block (timeout not zero). Once we find a file that has a non-zero
+	// result, we stop registering for events but still go through all files
+	// to get their ready masks.
+	state := make([]pollState, len(pfd))
+	defer releaseState(state)
+	n := uintptr(0)
+	for i := range pfd {
+		initReadiness(t, &pfd[i], &state[i], ch)
+		if pfd[i].REvents != 0 {
+			n++
+			ch = nil
+		}
+	}
+
+	if timeout == 0 {
+		return timeout, n, nil
+	}
+
+	haveTimeout := timeout >= 0
+
+	for n == 0 {
+		var err error
+		// Wait for a notification.
+		timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout)
+		if err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = nil
+			}
+			return timeout, 0, err
+		}
+
+		// We got notified, count how many files are ready. If none,
+		// then this was a spurious notification, and we just go back
+		// to sleep with the remaining timeout.
+		for i := range state {
+			if state[i].file == nil {
+				continue
+			}
+
+			r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
+			rl := int16(r.ToLinux()) & pfd[i].Events
+			if rl != 0 {
+				pfd[i].REvents = rl
+				n++
+			}
+		}
+	}
+
+	return timeout, n, nil
+}
+
+// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max.
+func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
+	if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
+		return nil, syserror.EINVAL
+	}
+
+	pfd := make([]linux.PollFD, nfds)
+	if nfds > 0 {
+		if _, err := t.CopyIn(addr, &pfd); err != nil {
+			return nil, err
+		}
+	}
+
+	return pfd, nil
+}
+
+func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
+	pfd, err := copyInPollFDs(t, addr, nfds)
+	if err != nil {
+		return timeout, 0, err
+	}
+
+	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
+	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
+	// polling, changing event masks here is an application-visible difference.
+	// (Linux also doesn't copy out event masks at all, only revents.)
+	for i := range pfd {
+		pfd[i].Events |= linux.POLLHUP | linux.POLLERR
+	}
+	remainingTimeout, n, err := pollBlock(t, pfd, timeout)
+	err = syserror.ConvertIntr(err, syserror.EINTR)
+
+	// The poll entries are copied out regardless of whether
+	// any are set or not. This aligns with the Linux behavior.
+	if nfds > 0 && err == nil {
+		if _, err := t.CopyOut(addr, pfd); err != nil {
+			return remainingTimeout, 0, err
+		}
+	}
+
+	return remainingTimeout, n, err
+}
+
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+	set := make([]byte, nBytes)
+
+	if addr != 0 {
+		if _, err := t.CopyIn(addr, &set); err != nil {
+			return nil, err
+		}
+		// If we only use part of the last byte, mask out the extraneous bits.
+		//
+		// N.B. This only works on little-endian architectures.
+		if nBitsInLastPartialByte != 0 {
+			set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+		}
+	}
+	return set, nil
+}
+
+func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
+	if nfds < 0 || nfds > fileCap {
+		return 0, syserror.EINVAL
+	}
+
+	// Calculate the size of the fd sets (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := nfds % 8
+
+	// Capture all the provided input vectors.
+	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
+	}
+
+	// Count how many FDs are actually being requested so that we can build
+	// a PollFD array.
+	fdCount := 0
+	for i := 0; i < nBytes; i++ {
+		v := r[i] | w[i] | e[i]
+		for v != 0 {
+			v &= (v - 1)
+			fdCount++
+		}
+	}
+
+	// Build the PollFD array.
+	pfd := make([]linux.PollFD, 0, fdCount)
+	var fd int32
+	for i := 0; i < nBytes; i++ {
+		rV, wV, eV := r[i], w[i], e[i]
+		v := rV | wV | eV
+		m := byte(1)
+		for j := 0; j < 8; j++ {
+			if (v & m) != 0 {
+				// Make sure the fd is valid and decrement the reference
+				// immediately to ensure we don't leak. Note, another thread
+				// might be about to close fd. This is racy, but that's
+				// OK. Linux is racy in the same way.
+				file := t.GetFileVFS2(fd)
+				if file == nil {
+					return 0, syserror.EBADF
+				}
+				file.DecRef()
+
+				var mask int16
+				if (rV & m) != 0 {
+					mask |= selectReadEvents
+				}
+
+				if (wV & m) != 0 {
+					mask |= selectWriteEvents
+				}
+
+				if (eV & m) != 0 {
+					mask |= selectExceptEvents
+				}
+
+				pfd = append(pfd, linux.PollFD{
+					FD:     fd,
+					Events: mask,
+				})
+			}
+
+			fd++
+			m <<= 1
+		}
+	}
+
+	// Do the syscall, then count the number of bits set.
+	if _, _, err = pollBlock(t, pfd, timeout); err != nil {
+		return 0, syserror.ConvertIntr(err, syserror.EINTR)
+	}
+
+	// r, w, and e are currently event mask bitsets; unset bits corresponding
+	// to events that *didn't* occur.
+	bitSetCount := uintptr(0)
+	for idx := range pfd {
+		events := pfd[idx].REvents
+		i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
+		m := byte(1) << j
+		if r[i]&m != 0 {
+			if (events & selectReadEvents) != 0 {
+				bitSetCount++
+			} else {
+				r[i] &^= m
+			}
+		}
+		if w[i]&m != 0 {
+			if (events & selectWriteEvents) != 0 {
+				bitSetCount++
+			} else {
+				w[i] &^= m
+			}
+		}
+		if e[i]&m != 0 {
+			if (events & selectExceptEvents) != 0 {
+				bitSetCount++
+			} else {
+				e[i] &^= m
+			}
+		}
+	}
+
+	// Copy updated vectors back.
+	if readFDs != 0 {
+		if _, err := t.CopyOut(readFDs, r); err != nil {
+			return 0, err
+		}
+	}
+
+	if writeFDs != 0 {
+		if _, err := t.CopyOut(writeFDs, w); err != nil {
+			return 0, err
+		}
+	}
+
+	if exceptFDs != 0 {
+		if _, err := t.CopyOut(exceptFDs, e); err != nil {
+			return 0, err
+		}
+	}
+
+	return bitSetCount, nil
+}
+
+// timeoutRemaining returns the amount of time remaining for the specified
+// timeout or 0 if it has elapsed.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
+	now := t.Kernel().MonotonicClock().Now()
+	remaining := timeout - now.Sub(startNs)
+	if remaining < 0 {
+		remaining = 0
+	}
+	return remaining
+}
+
+// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	remaining := timeoutRemaining(t, startNs, timeout)
+	tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
+	_, err := tsRemaining.CopyOut(t, timespecAddr)
+	return err
+}
+
+// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
+//
+// startNs must be from CLOCK_MONOTONIC.
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
+	if timeout <= 0 {
+		return nil
+	}
+	remaining := timeoutRemaining(t, startNs, timeout)
+	tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
+	_, err := tvRemaining.CopyOut(t, timevalAddr)
+	return err
+}
+
+// pollRestartBlock encapsulates the state required to restart poll(2) via
+// restart_syscall(2).
+//
+// +stateify savable
+type pollRestartBlock struct {
+	pfdAddr usermem.Addr
+	nfds    uint
+	timeout time.Duration
+}
+
+// Restart implements kernel.SyscallRestartBlock.Restart.
+func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
+	return poll(t, p.pfdAddr, p.nfds, p.timeout)
+}
+
+func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
+	remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	// On an interrupt poll(2) is restarted with the remaining timeout.
+	if err == syserror.EINTR {
+		t.SetSyscallRestartBlock(&pollRestartBlock{
+			pfdAddr: pfdAddr,
+			nfds:    nfds,
+			timeout: remainingTimeout,
+		})
+		return 0, kernel.ERESTART_RESTARTBLOCK
+	}
+	return n, err
+}
+
+// Poll implements linux syscall poll(2).
+func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timeout := time.Duration(args[2].Int()) * time.Millisecond
+	n, err := poll(t, pfdAddr, nfds, timeout)
+	return n, nil, err
+}
+
+// Ppoll implements linux syscall ppoll(2).
+func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pfdAddr := args[0].Pointer()
+	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
+	timespecAddr := args[2].Pointer()
+	maskAddr := args[3].Pointer()
+	maskSize := uint(args[4].Uint())
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
+		return 0, nil, err
+	}
+
+	_, n, err := doPoll(t, pfdAddr, nfds, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
+	// if interrupted by something other than a signal handled by the
+	// application (i.e. returns ERESTARTNOHAND). However, if
+	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
+	// wrong timeout, so the error should be left as EINTR.
+	//
+	// Note that this means that if err is nil but copyErr is not, copyErr is
+	// ignored. This is consistent with Linux.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Select implements linux syscall select(2).
+func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timevalAddr := args[4].Pointer()
+
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timevalAddr != 0 {
+		var timeval linux.Timeval
+		if _, err := timeval.CopyIn(t, timevalAddr); err != nil {
+			return 0, nil, err
+		}
+		if timeval.Sec < 0 || timeval.Usec < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+		timeout = time.Duration(timeval.ToNsecCapped())
+	}
+	startNs := t.Kernel().MonotonicClock().Now()
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// Pselect implements linux syscall pselect(2).
+func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	nfds := int(args[0].Int()) // select(2) uses an int.
+	readFDs := args[1].Pointer()
+	writeFDs := args[2].Pointer()
+	exceptFDs := args[3].Pointer()
+	timespecAddr := args[4].Pointer()
+	maskWithSizeAddr := args[5].Pointer()
+
+	timeout, err := copyTimespecInToDuration(t, timespecAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var startNs ktime.Time
+	if timeout > 0 {
+		startNs = t.Kernel().MonotonicClock().Now()
+	}
+
+	if maskWithSizeAddr != 0 {
+		if t.Arch().Width() != 8 {
+			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
+		}
+		var maskStruct sigSetWithSize
+		if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
+			return 0, nil, err
+		}
+		if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
+	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
+	// See comment in Ppoll.
+	if err == syserror.EINTR && copyErr == nil {
+		err = kernel.ERESTARTNOHAND
+	}
+	return n, nil, err
+}
+
+// +marshal
+type sigSetWithSize struct {
+	sigsetAddr   uint64
+	sizeofSigset uint64
+}
+
+// copyTimespecInToDuration copies a Timespec from the untrusted app range,
+// validates it and converts it to a Duration.
+//
+// If the Timespec is larger than what can be represented in a Duration, the
+// returned value is the maximum that Duration will allow.
+//
+// If timespecAddr is NULL, the returned value is negative.
+func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
+	// Use a negative Duration to indicate "no timeout".
+	timeout := time.Duration(-1)
+	if timespecAddr != 0 {
+		var timespec linux.Timespec
+		if _, err := timespec.CopyIn(t, timespecAddr); err != nil {
+			return 0, err
+		}
+		if !timespec.Valid() {
+			return 0, syserror.EINVAL
+		}
+		timeout = time.Duration(timespec.ToNsecCapped())
+	}
+	return timeout, nil
+}
+
+func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error {
+	if maskAddr == 0 {
+		return nil
+	}
+	if maskSize != linux.SignalSetSize {
+		return syserror.EINVAL
+	}
+	var mask linux.SignalSet
+	if _, err := mask.CopyIn(t, maskAddr); err != nil {
+		return err
+	}
+	mask &^= kernel.UnblockableSignals
+	oldmask := t.SignalMask()
+	t.SetSignalMask(mask)
+	t.SetSavedSignalMask(oldmask)
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
new file mode 100644
index 000000000..7f9debd4a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -0,0 +1,608 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	eventMaskRead  = waiter.EventIn | waiter.EventHUp | waiter.EventErr
+	eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
+)
+
+// Read implements Linux syscall read(2).
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+// Readv implements Linux syscall readv(2).
+func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := read(t, file, dst, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file)
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.Read(t, dst, opts)
+	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err = file.Read(t, dst, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
+	return total, err
+}
+
+// Pread64 implements Linux syscall pread64(2).
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file)
+}
+
+// Preadv implements Linux syscall preadv(2).
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pread(t, file, dst, offset, vfs.ReadOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file)
+}
+
+// Preadv2 implements Linux syscall preadv2(2).
+func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the glibc signature is
+	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the actual syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 6th argument (index 5).
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the destination of the read.
+	dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.ReadOptions{
+		Flags: uint32(flags),
+	}
+	var n int64
+	if offset == -1 {
+		n, err = read(t, file, dst, opts)
+	} else {
+		n, err = pread(t, file, dst, offset, opts)
+	}
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file)
+}
+
+func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	n, err := file.PRead(t, dst, offset, opts)
+	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskRead)
+
+	total := n
+	for {
+		// Shorten dst to reflect bytes previously read.
+		dst = dst.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err = file.PRead(t, dst, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
+	return total, err
+}
+
+// Write implements Linux syscall write(2).
+func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
+}
+
+// Writev implements Linux syscall writev(2).
+func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := write(t, file, src, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file)
+}
+
+func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.Write(t, src, opts)
+	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err = file.Write(t, src, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	}
+	return total, err
+}
+
+// Pwrite64 implements Linux syscall pwrite64(2).
+func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	size := args[2].SizeT()
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Check that the size is legitimate.
+	si := int(size)
+	if si < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file)
+}
+
+// Pwritev implements Linux syscall pwritev(2).
+func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
+	t.IOUsage().AccountReadSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
+}
+
+// Pwritev2 implements Linux syscall pwritev2(2).
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// While the glibc signature is
+	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
+	// the actual syscall
+	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
+	// splits the offset argument into a high/low value for compatibility with
+	// 32-bit architectures. The flags argument is the 6th argument (index 5).
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	iovcnt := int(args[2].Int())
+	offset := args[3].Int64()
+	flags := args[5].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Check that the offset is legitimate.
+	if offset < -1 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get the source of the write.
+	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.WriteOptions{
+		Flags: uint32(flags),
+	}
+	var n int64
+	if offset == -1 {
+		n, err = write(t, file, src, opts)
+	} else {
+		n, err = pwrite(t, file, src, offset, opts)
+	}
+	t.IOUsage().AccountWriteSyscall(n)
+	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file)
+}
+
+func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, err := file.PWrite(t, src, offset, opts)
+	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
+	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
+		return n, err
+	}
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&w, eventMaskWrite)
+
+	total := n
+	for {
+		// Shorten src to reflect bytes previously written.
+		src = src.DropFirst(int(n))
+
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err = file.PWrite(t, src, offset+total, opts)
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
+			if err == syserror.ETIMEDOUT {
+				err = syserror.ErrWouldBlock
+			}
+			break
+		}
+	}
+	file.EventUnregister(&w)
+
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
+	return total, err
+}
+
+func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) {
+	if file.StatusFlags()&linux.O_NONBLOCK != 0 {
+		return false, ktime.Time{}, false
+	}
+	// Sockets support read/write timeouts.
+	if s, ok := file.Impl().(socket.SocketVFS2); ok {
+		dl := s.RecvTimeout()
+		if dl < 0 {
+			return false, ktime.Time{}, false
+		}
+		if dl > 0 {
+			return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true
+		}
+	}
+	return true, ktime.Time{}, false
+}
+
+// Lseek implements Linux syscall lseek(2).
+func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	whence := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	newoff, err := file.Seek(t, offset, whence)
+	return uintptr(newoff), nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go
new file mode 100644
index 000000000..09ecfed26
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go
@@ -0,0 +1,428 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+// Chmod implements Linux syscall chmod(2).
+func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	mode := args[1].ModeT()
+	return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode)
+}
+
+// Fchmodat implements Linux syscall fchmodat(2).
+func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	mode := args[2].ModeT()
+	return 0, nil, fchmodat(t, dirfd, pathAddr, mode)
+}
+
+func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_MODE,
+			Mode: uint16(mode & chmodMask),
+		},
+	})
+}
+
+// Fchmod implements Linux syscall fchmod(2).
+func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	mode := args[1].ModeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_MODE,
+			Mode: uint16(mode & chmodMask),
+		},
+	})
+}
+
+// Chown implements Linux syscall chown(2).
+func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	owner := args[1].Int()
+	group := args[2].Int()
+	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */)
+}
+
+// Lchown implements Linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	owner := args[1].Int()
+	group := args[2].Int()
+	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Fchownat implements Linux syscall fchownat(2).
+func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	owner := args[2].Int()
+	group := args[3].Int()
+	flags := args[4].Int()
+	return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags)
+}
+
+func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return err
+	}
+
+	return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
+}
+
+func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
+	userns := t.UserNamespace()
+	if owner != -1 {
+		kuid := userns.MapToKUID(auth.UID(owner))
+		if !kuid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_UID
+		opts.Stat.UID = uint32(kuid)
+	}
+	if group != -1 {
+		kgid := userns.MapToKGID(auth.GID(group))
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_GID
+		opts.Stat.GID = uint32(kgid)
+	}
+	return nil
+}
+
+// Fchown implements Linux syscall fchown(2).
+func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	owner := args[1].Int()
+	group := args[2].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, file.SetStat(t, opts)
+}
+
+// Truncate implements Linux syscall truncate(2).
+func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: uint64(length),
+		},
+	})
+	return 0, nil, handleSetSizeError(t, err)
+}
+
+// Ftruncate implements Linux syscall ftruncate(2).
+func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	length := args[1].Int64()
+
+	if length < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	err := file.SetStat(t, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: uint64(length),
+		},
+	})
+	return 0, nil, handleSetSizeError(t, err)
+}
+
+// Utime implements Linux syscall utime(2).
+func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	opts := vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
+		},
+	}
+	if timesAddr == 0 {
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+	} else {
+		var times linux.Utime
+		if _, err := times.CopyIn(t, timesAddr); err != nil {
+			return 0, nil, err
+		}
+		opts.Stat.Atime.Sec = times.Actime
+		opts.Stat.Mtime.Sec = times.Modtime
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Utimes implements Linux syscall utimes(2).
+func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	timesAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
+}
+
+// Futimesat implements Linux syscall futimesat(2).
+func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+
+	// "If filename is NULL and dfd refers to an open file, then operate on the
+	// file. Otherwise look up filename, possibly using dfd as a starting
+	// point." - fs/utimes.c
+	var path fspath.Path
+	shouldAllowEmptyPath := allowEmptyPath
+	if dirfd == linux.AT_FDCWD || pathAddr != 0 {
+		var err error
+		path, err = copyInPath(t, pathAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		shouldAllowEmptyPath = disallowEmptyPath
+	}
+
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, followFinalSymlink, &opts)
+}
+
+func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+	if timesAddr == 0 {
+		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+		return nil
+	}
+	var times [2]linux.Timeval
+	if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		return err
+	}
+	if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 {
+		return syserror.EINVAL
+	}
+	opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+	opts.Stat.Atime = linux.StatxTimestamp{
+		Sec:  times[0].Sec,
+		Nsec: uint32(times[0].Usec * 1000),
+	}
+	opts.Stat.Mtime = linux.StatxTimestamp{
+		Sec:  times[1].Sec,
+		Nsec: uint32(times[1].Usec * 1000),
+	}
+	return nil
+}
+
+// Utimensat implements Linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	// Linux requires that the UTIME_OMIT check occur before checking path or
+	// flags.
+	var opts vfs.SetStatOptions
+	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
+		return 0, nil, err
+	}
+	if opts.Stat.Mask == 0 {
+		return 0, nil, nil
+	}
+
+	if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "If filename is NULL and dfd refers to an open file, then operate on the
+	// file. Otherwise look up filename, possibly using dfd as a starting
+	// point." - fs/utimes.c
+	var path fspath.Path
+	shouldAllowEmptyPath := allowEmptyPath
+	if dirfd == linux.AT_FDCWD || pathAddr != 0 {
+		var err error
+		path, err = copyInPath(t, pathAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		shouldAllowEmptyPath = disallowEmptyPath
+	}
+
+	return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
+}
+
+func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
+	if timesAddr == 0 {
+		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
+		opts.Stat.Atime.Nsec = linux.UTIME_NOW
+		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
+		return nil
+	}
+	var times [2]linux.Timespec
+	if _, err := t.CopyIn(timesAddr, &times); err != nil {
+		return err
+	}
+	if times[0].Nsec != linux.UTIME_OMIT {
+		if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_ATIME
+		opts.Stat.Atime = linux.StatxTimestamp{
+			Sec:  times[0].Sec,
+			Nsec: uint32(times[0].Nsec),
+		}
+	}
+	if times[1].Nsec != linux.UTIME_OMIT {
+		if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) {
+			return syserror.EINVAL
+		}
+		opts.Stat.Mask |= linux.STATX_MTIME
+		opts.Stat.Mtime = linux.StatxTimestamp{
+			Sec:  times[1].Sec,
+			Nsec: uint32(times[1].Nsec),
+		}
+	}
+	return nil
+}
+
+func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.SetStat() instead of
+				// VirtualFilesystem.SetStatAt(), since the former may be able
+				// to use opened file state to expedite the SetStat.
+				err := dirfile.SetStat(t, *opts)
+				dirfile.DecRef()
+				return err
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+	return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
+	}, opts)
+}
+
+func handleSetSizeError(t *kernel.Task, err error) error {
+	if err == syserror.ErrExceedsFileSizeLimit {
+		// Convert error to EFBIG and send a SIGXFSZ per setrlimit(2).
+		t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
+		return syserror.EFBIG
+	}
+	return err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go
new file mode 100644
index 000000000..623992f6f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/signal.go
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// sharedSignalfd is shared between the two calls.
+func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
+	// Copy in the signal mask.
+	mask, err := slinux.CopyInSigSet(t, sigset, sigsetsize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Always check for valid flags, even if not creating.
+	if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Is this a change to an existing signalfd?
+	//
+	// The spec indicates that this should adjust the mask.
+	if fd != -1 {
+		file := t.GetFileVFS2(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		// Is this a signalfd?
+		if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok {
+			sfd.SetMask(mask)
+			return 0, nil, nil
+		}
+
+		// Not a signalfd.
+		return 0, nil, syserror.EINVAL
+	}
+
+	fileFlags := uint32(linux.O_RDWR)
+	if flags&linux.SFD_NONBLOCK != 0 {
+		fileFlags |= linux.O_NONBLOCK
+	}
+
+	// Create a new file.
+	vfsObj := t.Kernel().VFS()
+	file, err := signalfd.New(vfsObj, t, mask, fileFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+
+	// Create a new descriptor.
+	fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.SFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Done.
+	return uintptr(fd), nil, nil
+}
+
+// Signalfd implements the linux syscall signalfd(2).
+func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	sigset := args[1].Pointer()
+	sigsetsize := args[2].SizeT()
+	return sharedSignalfd(t, fd, sigset, sigsetsize, 0)
+}
+
+// Signalfd4 implements the linux syscall signalfd4(2).
+func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	sigset := args[1].Pointer()
+	sigsetsize := args[2].SizeT()
+	flags := args[3].Int()
+	return sharedSignalfd(t, fd, sigset, sigsetsize, flags)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
new file mode 100644
index 000000000..10b668477
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -0,0 +1,1139 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// minListenBacklog is the minimum reasonable backlog for listening sockets.
+const minListenBacklog = 8
+
+// maxListenBacklog is the maximum allowed backlog for listening sockets.
+const maxListenBacklog = 1024
+
+// maxAddrLen is the maximum socket address length we're willing to accept.
+const maxAddrLen = 200
+
+// maxOptLen is the maximum sockopt parameter length we're willing to accept.
+const maxOptLen = 1024 * 8
+
+// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
+// willing to accept. Note that this limit is smaller than Linux, which allows
+// buffers upto INT_MAX.
+const maxControlLen = 10 * 1024 * 1024
+
+// nameLenOffset is the offset from the start of the MessageHeader64 struct to
+// the NameLen field.
+const nameLenOffset = 8
+
+// controlLenOffset is the offset form the start of the MessageHeader64 struct
+// to the ControlLen field.
+const controlLenOffset = 40
+
+// flagsOffset is the offset form the start of the MessageHeader64 struct
+// to the Flags field.
+const flagsOffset = 48
+
+const sizeOfInt32 = 4
+
+// messageHeader64Len is the length of a MessageHeader64 struct.
+var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))
+
+// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
+var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))
+
+// baseRecvFlags are the flags that are accepted across recvmsg(2),
+// recvmmsg(2), and recvfrom(2).
+const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC
+
+// MessageHeader64 is the 64-bit representation of the msghdr struct used in
+// the recvmsg and sendmsg syscalls.
+type MessageHeader64 struct {
+	// Name is the optional pointer to a network address buffer.
+	Name uint64
+
+	// NameLen is the length of the buffer pointed to by Name.
+	NameLen uint32
+	_       uint32
+
+	// Iov is a pointer to an array of io vectors that describe the memory
+	// locations involved in the io operation.
+	Iov uint64
+
+	// IovLen is the length of the array pointed to by Iov.
+	IovLen uint64
+
+	// Control is the optional pointer to ancillary control data.
+	Control uint64
+
+	// ControlLen is the length of the data pointed to by Control.
+	ControlLen uint64
+
+	// Flags on the sent/received message.
+	Flags int32
+	_     int32
+}
+
+// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
+// the recvmmsg and sendmmsg syscalls.
+type multipleMessageHeader64 struct {
+	msgHdr MessageHeader64
+	msgLen uint32
+	_      int32
+}
+
+// CopyInMessageHeader64 copies a message header from user to kernel memory.
+func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
+	b := t.CopyScratchBuffer(52)
+	if _, err := t.CopyInBytes(addr, b); err != nil {
+		return err
+	}
+
+	msg.Name = usermem.ByteOrder.Uint64(b[0:])
+	msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
+	msg.Iov = usermem.ByteOrder.Uint64(b[16:])
+	msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
+	msg.Control = usermem.ByteOrder.Uint64(b[32:])
+	msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
+	msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))
+
+	return nil
+}
+
+// CaptureAddress allocates memory for and copies a socket address structure
+// from the untrusted address space range.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+	if addrlen > maxAddrLen {
+		return nil, syserror.EINVAL
+	}
+
+	addrBuf := make([]byte, addrlen)
+	if _, err := t.CopyInBytes(addr, addrBuf); err != nil {
+		return nil, err
+	}
+
+	return addrBuf, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the unstrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+	// Get the buffer length.
+	var bufLen uint32
+	if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+		return err
+	}
+
+	if int32(bufLen) < 0 {
+		return syserror.EINVAL
+	}
+
+	// Write the length unconditionally.
+	if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+		return err
+	}
+
+	if addr == nil {
+		return nil
+	}
+
+	if bufLen > addrLen {
+		bufLen = addrLen
+	}
+
+	// Copy as much of the address as will fit in the buffer.
+	encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+	if bufLen > uint32(len(encodedAddr)) {
+		bufLen = uint32(len(encodedAddr))
+	}
+	_, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)])
+	return err
+}
+
+// Socket implements the linux syscall socket(2).
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+
+	// Check and initialize the flags.
+	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create the new socket.
+	s, e := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+	defer s.DecRef()
+
+	if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil {
+		return 0, nil, err
+	}
+
+	fd, err := t.NewFDFromVFS2(0, s, kernel.FDFlags{
+		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+	addr := args[3].Pointer()
+
+	// Check and initialize the flags.
+	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Create the socket pair.
+	s1, s2, e := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol)
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+	// Adding to the FD table will cause an extra reference to be acquired.
+	defer s1.DecRef()
+	defer s2.DecRef()
+
+	nonblocking := uint32(stype & linux.SOCK_NONBLOCK)
+	if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
+		return 0, nil, err
+	}
+	if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
+		return 0, nil, err
+	}
+
+	// Create the FDs for the sockets.
+	flags := kernel.FDFlags{
+		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+	}
+	fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{s1, s2}, flags)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if _, err := t.CopyOut(addr, fds); err != nil {
+		for _, fd := range fds {
+			if _, file := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
+		return 0, nil, err
+	}
+
+	return 0, nil, nil
+}
+
+// Connect implements the linux syscall connect(2).
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Capture address and call syscall implementation.
+	a, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
+	return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+}
+
+// accept is the implementation of the accept syscall. It is called by accept
+// and accept4 syscall handlers.
+func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+	// Check that no unsupported flags are passed in.
+	if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	// Call the syscall implementation for this socket, then copy the
+	// output address if one is specified.
+	blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
+
+	peerRequested := addrLen != 0
+	nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	if peerRequested {
+		// NOTE(magi): Linux does not give you an error if it can't
+		// write the data back out so neither do we.
+		if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syserror.EINVAL {
+			return 0, err
+		}
+	}
+	return uintptr(nfd), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+	flags := int(args[3].Int())
+
+	n, err := accept(t, fd, addr, addrlen, flags)
+	return n, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	n, err := accept(t, fd, addr, addrlen, 0)
+	return n, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Capture address and call syscall implementation.
+	a, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, s.Bind(t, a).ToError()
+}
+
+// Listen implements the linux syscall listen(2).
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	backlog := args[1].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Per Linux, the backlog is silently capped to reasonable values.
+	if backlog <= 0 {
+		backlog = minListenBacklog
+	}
+	if backlog > maxListenBacklog {
+		backlog = maxListenBacklog
+	}
+
+	return 0, nil, s.Listen(t, int(backlog)).ToError()
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	how := args[1].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Validate how, then call syscall implementation.
+	switch how {
+	case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, s.Shutdown(t, int(how)).ToError()
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLenAddr := args[4].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Read the length. Reject negative values.
+	optLen := int32(0)
+	if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+		return 0, nil, err
+	}
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Call syscall implementation then copy both value and value len out.
+	v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+
+	vLen := int32(binary.Size(v))
+	if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+		return 0, nil, err
+	}
+
+	if v != nil {
+		if _, err := t.CopyOut(optValAddr, v); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	return 0, nil, nil
+}
+
+// getSockOpt tries to handle common socket options, or dispatches to a specific
+// socket implementation.
+func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
+	if level == linux.SOL_SOCKET {
+		switch name {
+		case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
+			if len < sizeOfInt32 {
+				return nil, syserr.ErrInvalidArgument
+			}
+		}
+
+		switch name {
+		case linux.SO_TYPE:
+			_, skType, _ := s.Type()
+			return int32(skType), nil
+		case linux.SO_DOMAIN:
+			family, _, _ := s.Type()
+			return int32(family), nil
+		case linux.SO_PROTOCOL:
+			_, _, protocol := s.Type()
+			return int32(protocol), nil
+		}
+	}
+
+	return s.GetSockOpt(t, level, name, optValAddr, len)
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLen := args[4].Int()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if optLen > maxOptLen {
+		return 0, nil, syserror.EINVAL
+	}
+	buf := t.CopyScratchBuffer(int(optLen))
+	if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+		return 0, nil, err
+	}
+
+	// Call syscall implementation.
+	if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Get the socket name and copy it to the caller.
+	v, vl, err := s.GetSockName(t)
+	if err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Get the socket peer name and copy it to the caller.
+	v, vl, err := s.GetPeerName(t)
+	if err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.RecvTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline)
+	return n, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+	toPtr := args[4].Pointer()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if toPtr != 0 {
+		var ts linux.Timespec
+		if _, err := ts.CopyIn(t, toPtr); err != nil {
+			return 0, nil, err
+		}
+		if !ts.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
+		haveDeadline = true
+	}
+
+	if !haveDeadline {
+		if dl := s.RecvTimeout(); dl > 0 {
+			deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+			haveDeadline = true
+		} else if dl < 0 {
+			flags |= linux.MSG_DONTWAIT
+		}
+	}
+
+	var count uint32
+	var err error
+	for i := uint64(0); i < uint64(vlen); i++ {
+		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		var n uintptr
+		if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil {
+			break
+		}
+
+		// Copy the received length to the caller.
+		lp, ok := mp.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+			break
+		}
+		count++
+	}
+
+	if count == 0 {
+		return 0, nil, err
+	}
+	return uintptr(count), nil, nil
+}
+
+func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+	// Capture the message header and io vectors.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syserror.EMSGSIZE
+	}
+	dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// FIXME(b/63594852): Pretend we have an empty error queue.
+	if flags&linux.MSG_ERRQUEUE != 0 {
+		return 0, syserror.EAGAIN
+	}
+
+	// Fast path when no control message nor name buffers are provided.
+	if msg.ControlLen == 0 && msg.NameLen == 0 {
+		n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+		if err != nil {
+			return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+		}
+		if !cms.Unix.Empty() {
+			mflags |= linux.MSG_CTRUNC
+			cms.Release()
+		}
+
+		if int(msg.Flags) != mflags {
+			// Copy out the flags to the caller.
+			if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+				return 0, err
+			}
+		}
+
+		return uintptr(n), nil
+	}
+
+	if msg.ControlLen > maxControlLen {
+		return 0, syserror.ENOBUFS
+	}
+	n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	defer cms.Release()
+
+	controlData := make([]byte, 0, msg.ControlLen)
+	controlData = control.PackControlMessages(t, cms, controlData)
+
+	if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
+		creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
+		controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
+	}
+
+	if cms.Unix.Rights != nil {
+		controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
+	}
+
+	// Copy the address to the caller.
+	if msg.NameLen != 0 {
+		if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy the control data to the caller.
+	if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+		return 0, err
+	}
+	if len(controlData) > 0 {
+		if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy out the flags to the caller.
+	if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil {
+		return 0, err
+	}
+
+	return uintptr(n), nil
+}
+
+// recvFrom is the implementation of the recvfrom syscall. It is called by
+// recvfrom and recv syscall handlers.
+func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+	if int(bufLen) < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.RecvTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
+	cm.Release()
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+
+	// Copy the address to the caller.
+	if nameLenPtr != 0 {
+		if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+			return 0, err
+		}
+	}
+
+	return uintptr(n), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLenPtr := args[5].Pointer()
+
+	n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
+	return n, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := sendSingleMsg(t, s, file, msgPtr, flags)
+	return n, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, nil, syserror.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var count uint32
+	var err error
+	for i := uint64(0); i < uint64(vlen); i++ {
+		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		var n uintptr
+		if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
+			break
+		}
+
+		// Copy the received length to the caller.
+		lp, ok := mp.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syserror.EFAULT
+		}
+		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+			break
+		}
+		count++
+	}
+
+	if count == 0 {
+		return 0, nil, err
+	}
+	return uintptr(count), nil, nil
+}
+
+func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+	// Capture the message header.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	var controlData []byte
+	if msg.ControlLen > 0 {
+		// Put an upper bound to prevent large allocations.
+		if msg.ControlLen > maxControlLen {
+			return 0, syserror.ENOBUFS
+		}
+		controlData = make([]byte, msg.ControlLen)
+		if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	if msg.NameLen != 0 {
+		var err error
+		to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	// Read data then call the sendmsg implementation.
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syserror.EMSGSIZE
+	}
+	src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	controlMessages, err := control.Parse(t, s, controlData)
+	if err != nil {
+		return 0, err
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.SendTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
+	err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+	if err != nil {
+		controlMessages.Release()
+	}
+	return uintptr(n), err
+}
+
+// sendTo is the implementation of the sendto syscall. It is called by sendto
+// and send syscall handlers.
+func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+	bl := int(bufLen)
+	if bl < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.Impl().(socket.SocketVFS2)
+	if !ok {
+		return 0, syserror.ENOTSOCK
+	}
+
+	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	var err error
+	if namePtr != 0 {
+		to, err = CaptureAddress(t, namePtr, nameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if dl := s.SendTimeout(); dl > 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	} else if dl < 0 {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
+	return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+}
+
+// SendTo implements the linux syscall sendto(2).
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLen := args[5].Uint()
+
+	n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
+	return n, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
new file mode 100644
index 000000000..945a364a7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -0,0 +1,291 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Splice implements Linux syscall splice(2).
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	inOffsetPtr := args[1].Pointer()
+	outFD := args[2].Int()
+	outOffsetPtr := args[3].Pointer()
+	count := int64(args[4].SizeT())
+	flags := args[5].Int()
+
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get file descriptions.
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	// Check that both files support the required directionality.
+	if !inFile.IsReadable() || !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// At least one file description must represent a pipe.
+	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	if !inIsPipe && !outIsPipe {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy in offsets.
+	inOffset := int64(-1)
+	if inOffsetPtr != 0 {
+		if inIsPipe {
+			return 0, nil, syserror.ESPIPE
+		}
+		if inFile.Options().DenyPRead {
+			return 0, nil, syserror.EINVAL
+		}
+		if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+			return 0, nil, err
+		}
+		if inOffset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+	outOffset := int64(-1)
+	if outOffsetPtr != 0 {
+		if outIsPipe {
+			return 0, nil, syserror.ESPIPE
+		}
+		if outFile.Options().DenyPWrite {
+			return 0, nil, syserror.EINVAL
+		}
+		if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+			return 0, nil, err
+		}
+		if outOffset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+
+	// Move data.
+	var (
+		n     int64
+		err   error
+		inCh  chan struct{}
+		outCh chan struct{}
+	)
+	for {
+		// If both input and output are pipes, delegate to the pipe
+		// implementation. Otherwise, exactly one end is a pipe, which we
+		// ensure is consistently ordered after the non-pipe FD's locks by
+		// passing the pipe FD as usermem.IO to the non-pipe end.
+		switch {
+		case inIsPipe && outIsPipe:
+			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
+		case inIsPipe:
+			if outOffset != -1 {
+				n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
+				outOffset += n
+			} else {
+				n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
+			}
+		case outIsPipe:
+			if inOffset != -1 {
+				n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
+				inOffset += n
+			} else {
+				n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
+			}
+		}
+		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+			break
+		}
+
+		// Note that the blocking behavior here is a bit different than the
+		// normal pattern. Because we need to have both data to read and data
+		// to write simultaneously, we actually explicitly block on both of
+		// these cases in turn before returning to the splice operation.
+		if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+			if inCh == nil {
+				inCh = make(chan struct{}, 1)
+				inW, _ := waiter.NewChannelEntry(inCh)
+				inFile.EventRegister(&inW, eventMaskRead)
+				defer inFile.EventUnregister(&inW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(inCh); err != nil {
+				break
+			}
+		}
+		if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+			if outCh == nil {
+				outCh = make(chan struct{}, 1)
+				outW, _ := waiter.NewChannelEntry(outCh)
+				outFile.EventRegister(&outW, eventMaskWrite)
+				defer outFile.EventUnregister(&outW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(outCh); err != nil {
+				break
+			}
+		}
+	}
+
+	// Copy updated offsets out.
+	if inOffsetPtr != 0 {
+		if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+			return 0, nil, err
+		}
+	}
+	if outOffsetPtr != 0 {
+		if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if n == 0 {
+		return 0, nil, err
+	}
+
+	// On Linux, inotify behavior is not very consistent with splice(2). We try
+	// our best to emulate Linux for very basic calls to splice, where for some
+	// reason, events are generated for output files, but not input files.
+	outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	return uintptr(n), nil, nil
+}
+
+// Tee implements Linux syscall tee(2).
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	outFD := args[1].Int()
+	count := int64(args[2].SizeT())
+	flags := args[3].Int()
+
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get file descriptions.
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	// Check that both files support the required directionality.
+	if !inFile.IsReadable() || !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// Both file descriptions must represent pipes.
+	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	if !inIsPipe || !outIsPipe {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy data.
+	var (
+		inCh  chan struct{}
+		outCh chan struct{}
+	)
+	for {
+		n, err := pipe.Tee(t, outPipeFD, inPipeFD, count)
+		if n != 0 {
+			return uintptr(n), nil, nil
+		}
+		if err != syserror.ErrWouldBlock || nonBlock {
+			return 0, nil, err
+		}
+
+		// Note that the blocking behavior here is a bit different than the
+		// normal pattern. Because we need to have both data to read and data
+		// to write simultaneously, we actually explicitly block on both of
+		// these cases in turn before returning to the tee operation.
+		if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+			if inCh == nil {
+				inCh = make(chan struct{}, 1)
+				inW, _ := waiter.NewChannelEntry(inCh)
+				inFile.EventRegister(&inW, eventMaskRead)
+				defer inFile.EventUnregister(&inW)
+				continue // Need to refresh readiness.
+			}
+			if err := t.Block(inCh); err != nil {
+				return 0, nil, err
+			}
+		}
+		if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+			if outCh == nil {
+				outCh = make(chan struct{}, 1)
+				outW, _ := waiter.NewChannelEntry(outCh)
+				outFile.EventRegister(&outW, eventMaskWrite)
+				defer outFile.EventUnregister(&outW)
+				continue // Need to refresh readiness.
+			}
+			if err := t.Block(outCh); err != nil {
+				return 0, nil, err
+			}
+		}
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
new file mode 100644
index 000000000..bb1d5cac4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -0,0 +1,388 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Stat implements Linux syscall stat(2).
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */)
+}
+
+// Lstat implements Linux syscall lstat(2).
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW)
+}
+
+// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
+func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	statAddr := args[2].Pointer()
+	flags := args[3].Int()
+	return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags)
+}
+
+func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for fstatat(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return err
+				}
+				var stat linux.Stat
+				convertStatxToUserStat(t, &statx, &stat)
+				_, err = stat.CopyOut(t, statAddr)
+				return err
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	_, err = stat.CopyOut(t, statAddr)
+	return err
+}
+
+func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
+	return linux.Timespec{
+		Sec:  sxts.Sec,
+		Nsec: int64(sxts.Nsec),
+	}
+}
+
+// Fstat implements Linux syscall fstat(2).
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	statAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	statx, err := file.Stat(t, vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	var stat linux.Stat
+	convertStatxToUserStat(t, &statx, &stat)
+	_, err = stat.CopyOut(t, statAddr)
+	return 0, nil, err
+}
+
+// Statx implements Linux syscall statx(2).
+func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	flags := args[2].Int()
+	mask := args[3].Uint()
+	statxAddr := args[4].Pointer()
+
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	// Make sure that only one sync type option is set.
+	syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE)
+	if syncType != 0 && !bits.IsPowerOfTwo32(syncType) {
+		return 0, nil, syserror.EINVAL
+	}
+	if mask&linux.STATX__RESERVED != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	opts := vfs.StatOptions{
+		Mask: mask,
+		Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE),
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	root := t.FSContext().RootDirectoryVFS2()
+	defer root.DecRef()
+	start := root
+	if !path.Absolute {
+		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
+			return 0, nil, syserror.ENOENT
+		}
+		if dirfd == linux.AT_FDCWD {
+			start = t.FSContext().WorkingDirectoryVFS2()
+			defer start.DecRef()
+		} else {
+			dirfile := t.GetFileVFS2(dirfd)
+			if dirfile == nil {
+				return 0, nil, syserror.EBADF
+			}
+			if !path.HasComponents() {
+				// Use FileDescription.Stat() instead of
+				// VirtualFilesystem.StatAt() for statx(fd, ""), since the
+				// former may be able to use opened file state to expedite the
+				// Stat.
+				statx, err := dirfile.Stat(t, opts)
+				dirfile.DecRef()
+				if err != nil {
+					return 0, nil, err
+				}
+				userifyStatx(t, &statx)
+				_, err = statx.CopyOut(t, statxAddr)
+				return 0, nil, err
+			}
+			start = dirfile.VirtualDentry()
+			start.IncRef()
+			defer start.DecRef()
+			dirfile.DecRef()
+		}
+	}
+
+	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
+		Root:               root,
+		Start:              start,
+		Path:               path,
+		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
+	}, &opts)
+	if err != nil {
+		return 0, nil, err
+	}
+	userifyStatx(t, &statx)
+	_, err = statx.CopyOut(t, statxAddr)
+	return 0, nil, err
+}
+
+func userifyStatx(t *kernel.Task, statx *linux.Statx) {
+	userns := t.UserNamespace()
+	statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow())
+	statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow())
+}
+
+// Readlink implements Linux syscall readlink(2).
+func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+	size := args[2].SizeT()
+	return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size)
+}
+
+// Access implements Linux syscall access(2).
+func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	mode := args[1].ModeT()
+
+	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode)
+}
+
+// Faccessat implements Linux syscall faccessat(2).
+//
+// Note that the faccessat() system call does not take a flags argument:
+// "The raw faccessat() system call takes only the first three arguments. The
+// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
+// the glibc wrapper function for faccessat().  If either of these flags is
+// specified, then the wrapper function employs fstatat(2) to determine access
+// permissions." - faccessat(2)
+func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	addr := args[1].Pointer()
+	mode := args[2].ModeT()
+
+	return 0, nil, accessAt(t, dirfd, addr, mode)
+}
+
+func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
+	const rOK = 4
+	const wOK = 2
+	const xOK = 1
+
+	// Sanity check the mode.
+	if mode&^(rOK|wOK|xOK) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	// access(2) and faccessat(2) check permissions using real
+	// UID/GID, not effective UID/GID.
+	//
+	// "access() needs to use the real uid/gid, not the effective
+	// uid/gid. We do this by temporarily clearing all FS-related
+	// capabilities and switching the fsuid/fsgid around to the
+	// real ones." -fs/open.c:faccessat
+	creds := t.Credentials().Fork()
+	creds.EffectiveKUID = creds.RealKUID
+	creds.EffectiveKGID = creds.RealKGID
+	if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
+		creds.EffectiveCaps = creds.PermittedCaps
+	} else {
+		creds.EffectiveCaps = 0
+	}
+
+	return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop)
+}
+
+// Readlinkat implements Linux syscall mknodat(2).
+func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirfd := args[0].Int()
+	pathAddr := args[1].Pointer()
+	bufAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	return readlinkat(t, dirfd, pathAddr, bufAddr, size)
+}
+
+func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
+	if int(size) <= 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
+	// call operates on the symbolic link referred to by dirfd ..." -
+	// readlinkat(2)
+	tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if len(target) > int(size) {
+		target = target[:size]
+	}
+	n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target))
+	if n == 0 {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Statfs implements Linux syscall statfs(2).
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	bufAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+	_, err = statfs.CopyOut(t, bufAddr)
+	return 0, nil, err
+}
+
+// Fstatfs implements Linux syscall fstatfs(2).
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	bufAddr := args[1].Pointer()
+
+	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
+	if err != nil {
+		return 0, nil, err
+	}
+	_, err = statfs.CopyOut(t, bufAddr)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
new file mode 100644
index 000000000..2da538fc6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint64(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int64(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
new file mode 100644
index 000000000..88b9c7627
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go
@@ -0,0 +1,46 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// This takes both input and output as pointer arguments to avoid copying large
+// structs.
+func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
+	// Linux just copies fields from struct kstat without regard to struct
+	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
+	userns := t.UserNamespace()
+	*stat = linux.Stat{
+		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
+		Ino:     statx.Ino,
+		Nlink:   uint32(statx.Nlink),
+		Mode:    uint32(statx.Mode),
+		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
+		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
+		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
+		Size:    int64(statx.Size),
+		Blksize: int32(statx.Blksize),
+		Blocks:  int64(statx.Blocks),
+		ATime:   timespecFromStatxTimestamp(statx.Atime),
+		MTime:   timespecFromStatxTimestamp(statx.Mtime),
+		CTime:   timespecFromStatxTimestamp(statx.Ctime),
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go
new file mode 100644
index 000000000..365250b0b
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sync.go
@@ -0,0 +1,87 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements Linux syscall sync(2).
+func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t)
+}
+
+// Syncfs implements Linux syscall syncfs(2).
+func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.SyncFS(t)
+}
+
+// Fsync implements Linux syscall fsync(2).
+func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, file.Sync(t)
+}
+
+// Fdatasync implements Linux syscall fdatasync(2).
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata.
+	return Fsync(t, args)
+}
+
+// SyncFileRange implements Linux syscall sync_file_range(2).
+func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	offset := args[1].Int64()
+	nbytes := args[2].Int64()
+	flags := args[3].Uint()
+
+	if offset < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if nbytes < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	// TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of
+	// [offset, offset+nbytes).
+	return 0, nil, file.Sync(t)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
deleted file mode 100644
index 7667524c7..000000000
--- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs2
-
-import (
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-const (
-	// EventMaskRead contains events that can be triggered on reads.
-	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
-)
-
-// Read implements linux syscall read(2).  Note that we try to get a buffer that
-// is exactly the size requested because some applications like qemu expect
-// they can do large reads all at once.  Bug for bug.  Same for other read
-// calls below.
-func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
-	fd := args[0].Int()
-	addr := args[1].Pointer()
-	size := args[2].SizeT()
-
-	file := t.GetFileVFS2(fd)
-	if file == nil {
-		return 0, nil, syserror.EBADF
-	}
-	defer file.DecRef()
-
-	// Check that the size is legitimate.
-	si := int(size)
-	if si < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-
-	// Get the destination of the read.
-	dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
-	if err != nil {
-		return 0, nil, err
-	}
-
-	n, err := read(t, file, dst, vfs.ReadOptions{})
-	t.IOUsage().AccountReadSyscall(n)
-	return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
-}
-
-func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	n, err := file.Read(t, dst, opts)
-	if err != syserror.ErrWouldBlock {
-		return n, err
-	}
-
-	// Register for notifications.
-	w, ch := waiter.NewChannelEntry(nil)
-	file.EventRegister(&w, EventMaskRead)
-
-	total := n
-	for {
-		// Shorten dst to reflect bytes previously read.
-		dst = dst.DropFirst(int(n))
-
-		// Issue the request and break out if it completes with anything other than
-		// "would block".
-		n, err := file.Read(t, dst, opts)
-		total += n
-		if err != syserror.ErrWouldBlock {
-			break
-		}
-		if err := t.Block(ch); err != nil {
-			break
-		}
-	}
-	file.EventUnregister(&w)
-
-	return total, err
-}
diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
new file mode 100644
index 000000000..5ac79bc09
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TimerfdCreate implements Linux syscall timerfd_create(2).
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	clockID := args[0].Int()
+	flags := args[1].Int()
+
+	if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Timerfds aren't writable per se (their implementation of Write just
+	// returns EINVAL), but they are "opened for writing", which is necessary
+	// to actually reach said implementation of Write.
+	fileFlags := uint32(linux.O_RDWR)
+	if flags&linux.TFD_NONBLOCK != 0 {
+		fileFlags |= linux.O_NONBLOCK
+	}
+
+	var clock ktime.Clock
+	switch clockID {
+	case linux.CLOCK_REALTIME:
+		clock = t.Kernel().RealtimeClock()
+	case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME:
+		clock = t.Kernel().MonotonicClock()
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+	vfsObj := t.Kernel().VFS()
+	file, err := timerfd.New(vfsObj, clock, fileFlags)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer file.DecRef()
+	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+		CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(fd), nil, nil
+}
+
+// TimerfdSettime implements Linux syscall timerfd_settime(2).
+func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	flags := args[1].Int()
+	newValAddr := args[2].Pointer()
+	oldValAddr := args[3].Pointer()
+
+	if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var newVal linux.Itimerspec
+	if _, err := t.CopyIn(newValAddr, &newVal); err != nil {
+		return 0, nil, err
+	}
+	newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock())
+	if err != nil {
+		return 0, nil, err
+	}
+	tm, oldS := tfd.SetTime(newS)
+	if oldValAddr != 0 {
+		oldVal := ktime.ItimerspecFromSetting(tm, oldS)
+		if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil {
+			return 0, nil, err
+		}
+	}
+	return 0, nil, nil
+}
+
+// TimerfdGettime implements Linux syscall timerfd_gettime(2).
+func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	curValAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	tm, s := tfd.GetTime()
+	curVal := ktime.ItimerspecFromSetting(tm, s)
+	_, err := t.CopyOut(curValAddr, &curVal)
+	return 0, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
new file mode 100644
index 000000000..7b6e7571a
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -0,0 +1,172 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package vfs2 provides syscall implementations that use VFS2.
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/syscalls"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+// Override syscall table to add syscalls implementations from this package.
+func Override() {
+	// Override AMD64.
+	s := linux.AMD64
+	s.Table[0] = syscalls.Supported("read", Read)
+	s.Table[1] = syscalls.Supported("write", Write)
+	s.Table[2] = syscalls.Supported("open", Open)
+	s.Table[3] = syscalls.Supported("close", Close)
+	s.Table[4] = syscalls.Supported("stat", Stat)
+	s.Table[5] = syscalls.Supported("fstat", Fstat)
+	s.Table[6] = syscalls.Supported("lstat", Lstat)
+	s.Table[7] = syscalls.Supported("poll", Poll)
+	s.Table[8] = syscalls.Supported("lseek", Lseek)
+	s.Table[9] = syscalls.Supported("mmap", Mmap)
+	s.Table[16] = syscalls.Supported("ioctl", Ioctl)
+	s.Table[17] = syscalls.Supported("pread64", Pread64)
+	s.Table[18] = syscalls.Supported("pwrite64", Pwrite64)
+	s.Table[19] = syscalls.Supported("readv", Readv)
+	s.Table[20] = syscalls.Supported("writev", Writev)
+	s.Table[21] = syscalls.Supported("access", Access)
+	s.Table[22] = syscalls.Supported("pipe", Pipe)
+	s.Table[23] = syscalls.Supported("select", Select)
+	s.Table[32] = syscalls.Supported("dup", Dup)
+	s.Table[33] = syscalls.Supported("dup2", Dup2)
+	delete(s.Table, 40) // sendfile
+	s.Table[41] = syscalls.Supported("socket", Socket)
+	s.Table[42] = syscalls.Supported("connect", Connect)
+	s.Table[43] = syscalls.Supported("accept", Accept)
+	s.Table[44] = syscalls.Supported("sendto", SendTo)
+	s.Table[45] = syscalls.Supported("recvfrom", RecvFrom)
+	s.Table[46] = syscalls.Supported("sendmsg", SendMsg)
+	s.Table[47] = syscalls.Supported("recvmsg", RecvMsg)
+	s.Table[48] = syscalls.Supported("shutdown", Shutdown)
+	s.Table[49] = syscalls.Supported("bind", Bind)
+	s.Table[50] = syscalls.Supported("listen", Listen)
+	s.Table[51] = syscalls.Supported("getsockname", GetSockName)
+	s.Table[52] = syscalls.Supported("getpeername", GetPeerName)
+	s.Table[53] = syscalls.Supported("socketpair", SocketPair)
+	s.Table[54] = syscalls.Supported("setsockopt", SetSockOpt)
+	s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt)
+	s.Table[59] = syscalls.Supported("execve", Execve)
+	s.Table[72] = syscalls.Supported("fcntl", Fcntl)
+	delete(s.Table, 73) // flock
+	s.Table[74] = syscalls.Supported("fsync", Fsync)
+	s.Table[75] = syscalls.Supported("fdatasync", Fdatasync)
+	s.Table[76] = syscalls.Supported("truncate", Truncate)
+	s.Table[77] = syscalls.Supported("ftruncate", Ftruncate)
+	s.Table[78] = syscalls.Supported("getdents", Getdents)
+	s.Table[79] = syscalls.Supported("getcwd", Getcwd)
+	s.Table[80] = syscalls.Supported("chdir", Chdir)
+	s.Table[81] = syscalls.Supported("fchdir", Fchdir)
+	s.Table[82] = syscalls.Supported("rename", Rename)
+	s.Table[83] = syscalls.Supported("mkdir", Mkdir)
+	s.Table[84] = syscalls.Supported("rmdir", Rmdir)
+	s.Table[85] = syscalls.Supported("creat", Creat)
+	s.Table[86] = syscalls.Supported("link", Link)
+	s.Table[87] = syscalls.Supported("unlink", Unlink)
+	s.Table[88] = syscalls.Supported("symlink", Symlink)
+	s.Table[89] = syscalls.Supported("readlink", Readlink)
+	s.Table[90] = syscalls.Supported("chmod", Chmod)
+	s.Table[91] = syscalls.Supported("fchmod", Fchmod)
+	s.Table[92] = syscalls.Supported("chown", Chown)
+	s.Table[93] = syscalls.Supported("fchown", Fchown)
+	s.Table[94] = syscalls.Supported("lchown", Lchown)
+	s.Table[132] = syscalls.Supported("utime", Utime)
+	s.Table[133] = syscalls.Supported("mknod", Mknod)
+	s.Table[137] = syscalls.Supported("statfs", Statfs)
+	s.Table[138] = syscalls.Supported("fstatfs", Fstatfs)
+	s.Table[161] = syscalls.Supported("chroot", Chroot)
+	s.Table[162] = syscalls.Supported("sync", Sync)
+	s.Table[165] = syscalls.Supported("mount", Mount)
+	s.Table[166] = syscalls.Supported("umount2", Umount2)
+	delete(s.Table, 187) // readahead
+	s.Table[188] = syscalls.Supported("setxattr", Setxattr)
+	s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
+	s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
+	s.Table[191] = syscalls.Supported("getxattr", Getxattr)
+	s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
+	s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
+	s.Table[194] = syscalls.Supported("listxattr", Listxattr)
+	s.Table[195] = syscalls.Supported("llistxattr", Llistxattr)
+	s.Table[196] = syscalls.Supported("flistxattr", Flistxattr)
+	s.Table[197] = syscalls.Supported("removexattr", Removexattr)
+	s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
+	s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
+	delete(s.Table, 206) // io_setup
+	delete(s.Table, 207) // io_destroy
+	delete(s.Table, 208) // io_getevents
+	delete(s.Table, 209) // io_submit
+	delete(s.Table, 210) // io_cancel
+	s.Table[213] = syscalls.Supported("epoll_create", EpollCreate)
+	s.Table[217] = syscalls.Supported("getdents64", Getdents64)
+	delete(s.Table, 221) // fdavise64
+	s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
+	s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
+	s.Table[235] = syscalls.Supported("utimes", Utimes)
+	s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
+	s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+	s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
+	s.Table[257] = syscalls.Supported("openat", Openat)
+	s.Table[258] = syscalls.Supported("mkdirat", Mkdirat)
+	s.Table[259] = syscalls.Supported("mknodat", Mknodat)
+	s.Table[260] = syscalls.Supported("fchownat", Fchownat)
+	s.Table[261] = syscalls.Supported("futimesat", Futimesat)
+	s.Table[262] = syscalls.Supported("newfstatat", Newfstatat)
+	s.Table[263] = syscalls.Supported("unlinkat", Unlinkat)
+	s.Table[264] = syscalls.Supported("renameat", Renameat)
+	s.Table[265] = syscalls.Supported("linkat", Linkat)
+	s.Table[266] = syscalls.Supported("symlinkat", Symlinkat)
+	s.Table[267] = syscalls.Supported("readlinkat", Readlinkat)
+	s.Table[268] = syscalls.Supported("fchmodat", Fchmodat)
+	s.Table[269] = syscalls.Supported("faccessat", Faccessat)
+	s.Table[270] = syscalls.Supported("pselect", Pselect)
+	s.Table[271] = syscalls.Supported("ppoll", Ppoll)
+	s.Table[275] = syscalls.Supported("splice", Splice)
+	s.Table[276] = syscalls.Supported("tee", Tee)
+	s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
+	s.Table[280] = syscalls.Supported("utimensat", Utimensat)
+	s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
+	s.Table[282] = syscalls.Supported("signalfd", Signalfd)
+	s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate)
+	s.Table[284] = syscalls.Supported("eventfd", Eventfd)
+	delete(s.Table, 285) // fallocate
+	s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime)
+	s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
+	s.Table[288] = syscalls.Supported("accept4", Accept4)
+	s.Table[289] = syscalls.Supported("signalfd4", Signalfd4)
+	s.Table[290] = syscalls.Supported("eventfd2", Eventfd2)
+	s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
+	s.Table[292] = syscalls.Supported("dup3", Dup3)
+	s.Table[293] = syscalls.Supported("pipe2", Pipe2)
+	s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
+	s.Table[295] = syscalls.Supported("preadv", Preadv)
+	s.Table[296] = syscalls.Supported("pwritev", Pwritev)
+	s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg)
+	s.Table[306] = syscalls.Supported("syncfs", Syncfs)
+	s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg)
+	s.Table[316] = syscalls.Supported("renameat2", Renameat2)
+	s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate)
+	s.Table[322] = syscalls.Supported("execveat", Execveat)
+	s.Table[327] = syscalls.Supported("preadv2", Preadv2)
+	s.Table[328] = syscalls.Supported("pwritev2", Pwritev2)
+	s.Table[332] = syscalls.Supported("statx", Statx)
+	s.Init()
+
+	// Override ARM64.
+	s = linux.ARM64
+	s.Table[63] = syscalls.Supported("read", Read)
+	s.Init()
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
new file mode 100644
index 000000000..af455d5c1
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -0,0 +1,356 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"bytes"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Listxattr implements Linux syscall listxattr(2).
+func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, followFinalSymlink)
+}
+
+// Llistxattr implements Linux syscall llistxattr(2).
+func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listxattr(t, args, nofollowFinalSymlink)
+}
+
+func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	listAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrNameList(t, listAddr, size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Flistxattr implements Linux syscall flistxattr(2).
+func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	listAddr := args[1].Pointer()
+	size := args[2].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	names, err := file.Listxattr(t, uint64(size))
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrNameList(t, listAddr, size, names)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Getxattr implements Linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, followFinalSymlink)
+}
+
+// Lgetxattr implements Linux syscall lgetxattr(2).
+func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getxattr(t, args, nofollowFinalSymlink)
+}
+
+func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+		Name: name,
+		Size: uint64(size),
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Fgetxattr implements Linux syscall fgetxattr(2).
+func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
+	if err != nil {
+		return 0, nil, err
+	}
+	n, err := copyOutXattrValue(t, valueAddr, size, value)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+// Setxattr implements Linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, followFinalSymlink)
+}
+
+// Lsetxattr implements Linux syscall lsetxattr(2).
+func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, setxattr(t, args, nofollowFinalSymlink)
+}
+
+func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Int()
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+	value, err := copyInXattrValue(t, valueAddr, size)
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	})
+}
+
+// Fsetxattr implements Linux syscall fsetxattr(2).
+func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Int()
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+	value, err := copyInXattrValue(t, valueAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
+		Name:  name,
+		Value: value,
+		Flags: uint32(flags),
+	})
+}
+
+// Removexattr implements Linux syscall removexattr(2).
+func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, followFinalSymlink)
+}
+
+// Lremovexattr implements Linux syscall lremovexattr(2).
+func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, removexattr(t, args, nofollowFinalSymlink)
+}
+
+func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+
+	path, err := copyInPath(t, pathAddr)
+	if err != nil {
+		return err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
+	if err != nil {
+		return err
+	}
+	defer tpop.Release()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name)
+}
+
+// Fremovexattr implements Linux syscall fremovexattr(2).
+func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, file.Removexattr(t, name)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	if err != nil {
+		if err == syserror.ENAMETOOLONG {
+			return "", syserror.ERANGE
+		}
+		return "", err
+	}
+	if len(name) == 0 {
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) {
+	if size > linux.XATTR_LIST_MAX {
+		size = linux.XATTR_LIST_MAX
+	}
+	var buf bytes.Buffer
+	for _, name := range names {
+		buf.WriteString(name)
+		buf.WriteByte(0)
+	}
+	if size == 0 {
+		// Return the size that would be required to accomodate the list.
+		return buf.Len(), nil
+	}
+	if buf.Len() > int(size) {
+		if size >= linux.XATTR_LIST_MAX {
+			return 0, syserror.E2BIG
+		}
+		return 0, syserror.ERANGE
+	}
+	return t.CopyOutBytes(listAddr, buf.Bytes())
+}
+
+func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		return "", syserror.E2BIG
+	}
+	buf := make([]byte, size)
+	if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
+		return "", err
+	}
+	return gohacks.StringFromImmutableBytes(buf), nil
+}
+
+func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) {
+	if size > linux.XATTR_SIZE_MAX {
+		size = linux.XATTR_SIZE_MAX
+	}
+	if size == 0 {
+		// Return the size that would be required to accomodate the value.
+		return len(value), nil
+	}
+	if len(value) > int(size) {
+		if size >= linux.XATTR_SIZE_MAX {
+			return 0, syserror.E2BIG
+		}
+		return 0, syserror.ERANGE
+	}
+	return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value))
+}
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index 4320ad17f..ab1d140d2 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -252,18 +252,23 @@ func (m *MemoryLocked) Copy() (MemoryStats, uint64) {
 	return ms, m.totalLocked()
 }
 
-// MinimumTotalMemoryBytes is the minimum reported total system memory.
-//
-// This can be configured through options provided to the Sentry at start.
-// This number is purely synthetic. This is only set before the application
-// starts executing, and must not be modified.
-var MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB
+// These options control how much total memory the is reported to the application.
+// They may only be set before the application starts executing, and must not
+// be modified.
+var (
+	// MinimumTotalMemoryBytes is the minimum reported total system memory.
+	MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB
+
+	// MaximumTotalMemoryBytes is the maximum reported total system memory.
+	// The 0 value indicates no maximum.
+	MaximumTotalMemoryBytes uint64
+)
 
 // TotalMemory returns the "total usable memory" available.
 //
 // This number doesn't really have a true value so it's based on the following
-// inputs and further bounded to be above some minimum guaranteed value (2GB),
-// additionally ensuring that total memory reported is always less than used.
+// inputs and further bounded to be above the MinumumTotalMemoryBytes and below
+// MaximumTotalMemoryBytes.
 //
 // memSize should be the platform.Memory size reported by platform.Memory.TotalSize()
 // used is the total memory reported by MemoryLocked.Total()
@@ -279,5 +284,8 @@ func TotalMemory(memSize, used uint64) uint64 {
 			memSize = uint64(1) << (uint(msb) + 1)
 		}
 	}
+	if MaximumTotalMemoryBytes > 0 && memSize > MaximumTotalMemoryBytes {
+		memSize = MaximumTotalMemoryBytes
+	}
 	return memSize
 }
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 14b39eb9d..774cc66cc 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -15,6 +15,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "event_list",
+    out = "event_list.go",
+    package = "vfs",
+    prefix = "event",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Event",
+        "Linker": "*Event",
+    },
+)
+
 go_library(
     name = "vfs",
     srcs = [
@@ -25,11 +37,13 @@ go_library(
         "device.go",
         "epoll.go",
         "epoll_interest_list.go",
+        "event_list.go",
         "file_description.go",
         "file_description_impl_util.go",
         "filesystem.go",
         "filesystem_impl_util.go",
         "filesystem_type.go",
+        "inotify.go",
         "mount.go",
         "mount_unsafe.go",
         "options.go",
@@ -41,16 +55,28 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/binary",
         "//pkg/context",
+        "//pkg/fd",
+        "//pkg/fdnotifier",
         "//pkg/fspath",
+        "//pkg/gohacks",
+        "//pkg/log",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 9aa133bcb..66f3105bd 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
 -   Mount: Each referenced Mount holds a reference on its parent, which is the
     mount containing its mount point.
 
--   VirtualFilesystem: A reference is held on each Mount that has not been
-    umounted.
+-   VirtualFilesystem: A reference is held on each Mount that has been connected
+    to a mount point, but not yet umounted.
 
 MountNamespace and FileDescription references are held by users of VFS. The
 expectation is that each `kernel.Task` holds a reference on its corresponding
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 2db25be49..b7c6b60b8 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -41,7 +42,27 @@ func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry {
 	}
 }
 
-const anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+const (
+	anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+
+	// Mode, UID, and GID for a generic anonfs file.
+	anonFileMode = 0600 // no type is correct
+	anonFileUID  = auth.RootKUID
+	anonFileGID  = auth.RootKGID
+)
+
+// anonFilesystemType implements FilesystemType.
+type anonFilesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) {
+	panic("cannot instaniate an anon filesystem")
+}
+
+// Name implemenents FilesystemType.Name.
+func (anonFilesystemType) Name() string {
+	return "none"
+}
 
 // anonFilesystem is the implementation of FilesystemImpl that backs
 // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -69,6 +90,14 @@ func (fs *anonFilesystem) Sync(ctx context.Context) error {
 	return nil
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error {
+	if !rp.Done() {
+		return syserror.ENOTDIR
+	}
+	return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID)
+}
+
 // GetDentryAt implements FilesystemImpl.GetDentryAt.
 func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
 	if !rp.Done() {
@@ -167,13 +196,13 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St
 		Mask:     linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
 		Blksize:  anonfsBlockSize,
 		Nlink:    1,
-		UID:      uint32(auth.RootKUID),
-		GID:      uint32(auth.RootKGID),
-		Mode:     0600, // no type is correct
+		UID:      uint32(anonFileUID),
+		GID:      uint32(anonFileGID),
+		Mode:     anonFileMode,
 		Ino:      1,
 		Size:     0,
 		Blocks:   0,
-		DevMajor: 0,
+		DevMajor: linux.UNNAMED_MAJOR,
 		DevMinor: fs.devMinor,
 	}, nil
 }
@@ -205,8 +234,19 @@ func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 	return syserror.EPERM
 }
 
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	if !rp.Final() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil {
+		return nil, err
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
 // ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
 	if !rp.Done() {
 		return nil, syserror.ENOTDIR
 	}
@@ -214,7 +254,7 @@ func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([
 }
 
 // GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
 	if !rp.Done() {
 		return "", syserror.ENOTDIR
 	}
@@ -257,3 +297,15 @@ func (d *anonDentry) TryIncRef() bool {
 func (d *anonDentry) DecRef() {
 	// no-op
 }
+
+// InotifyWithParent implements DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {}
+
+// Watches implements DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) Watches() *Watches {
+	return nil
+}
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index d97362b9a..c9e724fef 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -29,9 +29,10 @@ const (
 	CtxRoot
 )
 
-// MountNamespaceFromContext returns the MountNamespace used by ctx. It does
-// not take a reference on the returned MountNamespace. If ctx is not
-// associated with a MountNamespace, MountNamespaceFromContext returns nil.
+// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is
+// not associated with a MountNamespace, MountNamespaceFromContext returns nil.
+//
+// A reference is taken on the returned MountNamespace.
 func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
 	if v := ctx.Value(CtxMountNamespace); v != nil {
 		return v.(*MountNamespace)
@@ -48,3 +49,27 @@ func RootFromContext(ctx context.Context) VirtualDentry {
 	}
 	return VirtualDentry{}
 }
+
+type rootContext struct {
+	context.Context
+	root VirtualDentry
+}
+
+// WithRoot returns a copy of ctx with the given root.
+func WithRoot(ctx context.Context, root VirtualDentry) context.Context {
+	return &rootContext{
+		Context: ctx,
+		root:    root,
+	}
+}
+
+// Value implements Context.Value.
+func (rc rootContext) Value(key interface{}) interface{} {
+	switch key {
+	case CtxRoot:
+		rc.root.IncRef()
+		return rc.root
+	default:
+		return rc.Context.Value(key)
+	}
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 486a76475..24af13eb1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -15,34 +15,17 @@
 package vfs
 
 import (
-	"fmt"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Dentry represents a node in a Filesystem tree which may represent a file.
+// Dentry represents a node in a Filesystem tree at which a file exists.
 //
 // Dentries are reference-counted. Unless otherwise specified, all Dentry
 // methods require that a reference is held.
 //
-// A Dentry transitions through up to 3 different states through its lifetime:
-//
-// - Dentries are initially "independent". Independent Dentries have no parent,
-// and consequently no name.
-//
-// - Dentry.InsertChild() causes an independent Dentry to become a "child" of
-// another Dentry. A child node has a parent node, and a name in that parent,
-// both of which are mutable by DentryMoveChild(). Each child Dentry's name is
-// unique within its parent.
-//
-// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A
-// disowned Dentry can still refer to its former parent and its former name in
-// said parent, but the disowned Dentry is no longer reachable from its parent,
-// and a new Dentry with the same name may become a child of the parent. (This
-// is analogous to a struct dentry being "unhashed" in Linux.)
-//
 // Dentry is loosely analogous to Linux's struct dentry, but:
 //
 // - VFS does not associate Dentries with inodes. gVisor interacts primarily
@@ -57,9 +40,6 @@ import (
 // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
 // support inodes may store appropriate state in implementations of DentryImpl.
 //
-// - VFS does not provide synchronization for mutable Dentry fields, other than
-// mount-related ones.
-//
 // - VFS does not require that Dentries are instantiated for all paths accessed
 // through VFS, only those that are tracked beyond the scope of a single
 // Filesystem operation. This includes file descriptions, mount points, mount
@@ -67,41 +47,34 @@ import (
 // of Dentries for operations on mutable remote filesystems that can't actually
 // cache any state in the Dentry.
 //
+// - VFS does not track filesystem structure (i.e. relationships between
+// Dentries), since both the relevant state and synchronization are
+// filesystem-specific.
+//
 // - For the reasons above, VFS is not directly responsible for managing Dentry
 // lifetime. Dentry reference counts only indicate the extent to which VFS
 // requires Dentries to exist; Filesystems may elect to cache or discard
 // Dentries with zero references.
+//
+// +stateify savable
 type Dentry struct {
-	// parent is this Dentry's parent in this Filesystem. If this Dentry is
-	// independent, parent is nil.
-	parent *Dentry
+	// mu synchronizes deletion/invalidation and mounting over this Dentry.
+	mu sync.Mutex `state:"nosave"`
 
-	// name is this Dentry's name in parent.
-	name string
-
-	flags uint32
+	// dead is true if the file represented by this Dentry has been deleted (by
+	// CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by
+	// InvalidateDentry). dead is protected by mu.
+	dead bool
 
 	// mounts is the number of Mounts for which this Dentry is Mount.point.
 	// mounts is accessed using atomic memory operations.
 	mounts uint32
 
-	// children are child Dentries.
-	children map[string]*Dentry
-
-	// mu synchronizes disowning and mounting over this Dentry.
-	mu sync.Mutex
-
 	// impl is the DentryImpl associated with this Dentry. impl is immutable.
 	// This should be the last field in Dentry.
 	impl DentryImpl
 }
 
-const (
-	// dflagsDisownedMask is set in Dentry.flags if the Dentry has been
-	// disowned.
-	dflagsDisownedMask = 1 << iota
-)
-
 // Init must be called before first use of d.
 func (d *Dentry) Init(impl DentryImpl) {
 	d.impl = impl
@@ -130,20 +103,22 @@ type DentryImpl interface {
 
 	// DecRef decrements the Dentry's reference count.
 	DecRef()
-}
-
-// IsDisowned returns true if d is disowned.
-func (d *Dentry) IsDisowned() bool {
-	return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0
-}
-
-// Preconditions: !d.IsDisowned().
-func (d *Dentry) setDisowned() {
-	atomic.AddUint32(&d.flags, dflagsDisownedMask)
-}
 
-func (d *Dentry) isMounted() bool {
-	return atomic.LoadUint32(&d.mounts) != 0
+	// InotifyWithParent notifies all watches on the targets represented by this
+	// dentry and its parent. The parent's watches are notified first, followed
+	// by this dentry's.
+	//
+	// InotifyWithParent automatically adds the IN_ISDIR flag for dentries
+	// representing directories.
+	//
+	// Note that the events may not actually propagate up to the user, depending
+	// on the event masks.
+	InotifyWithParent(events uint32, cookie uint32, et EventType)
+
+	// Watches returns the set of inotify watches for the file corresponding to
+	// the Dentry. Dentries that are hard links to the same underlying file
+	// share the same watches.
+	Watches() *Watches
 }
 
 // IncRef increments d's reference count.
@@ -162,104 +137,37 @@ func (d *Dentry) DecRef() {
 	d.impl.DecRef()
 }
 
-// These functions are exported so that filesystem implementations can use
-// them. The vfs package, and users of VFS, should not call these functions.
-// Unless otherwise specified, these methods require that there are no
-// concurrent mutators of d.
-
-// Name returns d's name in its parent in its owning Filesystem. If d is
-// independent, Name returns an empty string.
-func (d *Dentry) Name() string {
-	return d.name
-}
-
-// Parent returns d's parent in its owning Filesystem. It does not take a
-// reference on the returned Dentry. If d is independent, Parent returns nil.
-func (d *Dentry) Parent() *Dentry {
-	return d.parent
+// IsDead returns true if d has been deleted or invalidated by its owning
+// filesystem.
+func (d *Dentry) IsDead() bool {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return d.dead
 }
 
-// ParentOrSelf is equivalent to Parent, but returns d if d is independent.
-func (d *Dentry) ParentOrSelf() *Dentry {
-	if d.parent == nil {
-		return d
-	}
-	return d.parent
+func (d *Dentry) isMounted() bool {
+	return atomic.LoadUint32(&d.mounts) != 0
 }
 
-// Child returns d's child with the given name in its owning Filesystem. It
-// does not take a reference on the returned Dentry. If no such child exists,
-// Child returns nil.
-func (d *Dentry) Child(name string) *Dentry {
-	return d.children[name]
+// InotifyWithParent notifies all watches on the inodes for this dentry and
+// its parent of events.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {
+	d.impl.InotifyWithParent(events, cookie, et)
 }
 
-// HasChildren returns true if d has any children.
-func (d *Dentry) HasChildren() bool {
-	return len(d.children) != 0
+// Watches returns the set of inotify watches associated with d.
+func (d *Dentry) Watches() *Watches {
+	return d.impl.Watches()
 }
 
-// Children returns a map containing all of d's children.
-func (d *Dentry) Children() map[string]*Dentry {
-	if !d.HasChildren() {
-		return nil
-	}
-	m := make(map[string]*Dentry)
-	for name, child := range d.children {
-		m[name] = child
-	}
-	return m
-}
-
-// InsertChild makes child a child of d with the given name.
-//
-// InsertChild is a mutator of d and child.
-//
-// Preconditions: child must be an independent Dentry. d and child must be from
-// the same Filesystem. d must not already have a child with the given name.
-func (d *Dentry) InsertChild(child *Dentry, name string) {
-	if checkInvariants {
-		if _, ok := d.children[name]; ok {
-			panic(fmt.Sprintf("parent already contains a child named %q", name))
-		}
-		if child.parent != nil || child.name != "" {
-			panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name))
-		}
-	}
-	if d.children == nil {
-		d.children = make(map[string]*Dentry)
-	}
-	d.children[name] = child
-	child.parent = d
-	child.name = name
-}
-
-// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either
-// d2's parent or an ancestor of d2's parent.
-func (d *Dentry) IsAncestorOf(d2 *Dentry) bool {
-	for d2.parent != nil {
-		if d2.parent == d {
-			return true
-		}
-		d2 = d2.parent
-	}
-	return false
-}
+// The following functions are exported so that filesystem implementations can
+// use them. The vfs package, and users of VFS, should not call these
+// functions.
 
 // PrepareDeleteDentry must be called before attempting to delete the file
 // represented by d. If PrepareDeleteDentry succeeds, the caller must call
 // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
-//
-// Preconditions: d is a child Dentry.
 func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error {
-	if checkInvariants {
-		if d.parent == nil {
-			panic("d is independent")
-		}
-		if d.IsDisowned() {
-			panic("d is already disowned")
-		}
-	}
 	vfs.mountMu.Lock()
 	if mntns.mountpoints[d] != 0 {
 		vfs.mountMu.Unlock()
@@ -278,42 +186,27 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
 	d.mu.Unlock()
 }
 
-// CommitDeleteDentry must be called after the file represented by d is
-// deleted, and causes d to become disowned.
-//
-// CommitDeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: PrepareDeleteDentry was previously called on d.
+// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion
+// succeeds.
 func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
-	if d.parent != nil {
-		delete(d.parent.children, d.name)
-	}
-	d.setDisowned()
+	d.dead = true
 	d.mu.Unlock()
 	if d.isMounted() {
-		vfs.forgetDisownedMountpoint(d)
+		vfs.forgetDeadMountpoint(d)
 	}
 }
 
-// ForceDeleteDentry causes d to become disowned. It should only be used in
-// cases where VFS has no ability to stop the deletion (e.g. d represents the
-// local state of a file on a remote filesystem on which the file has already
-// been deleted).
-//
-// ForceDeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: d is a child Dentry.
-func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
-	if checkInvariants {
-		if d.parent == nil {
-			panic("d is independent")
-		}
-		if d.IsDisowned() {
-			panic("d is already disowned")
-		}
-	}
+// InvalidateDentry is called when d ceases to represent the file it formerly
+// did for reasons outside of VFS' control (e.g. d represents the local state
+// of a file on a remote filesystem on which the file has already been
+// deleted).
+func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) {
 	d.mu.Lock()
-	vfs.CommitDeleteDentry(d)
+	d.dead = true
+	d.mu.Unlock()
+	if d.isMounted() {
+		vfs.forgetDeadMountpoint(d)
+	}
 }
 
 // PrepareRenameDentry must be called before attempting to rename the file
@@ -322,25 +215,9 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
 // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
 // CommitRenameExchangeDentry depending on the rename's outcome.
 //
-// Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem. from != to.
+// Preconditions: If to is not nil, it must be a child Dentry from the same
+// Filesystem. from != to.
 func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
-	if checkInvariants {
-		if from.parent == nil {
-			panic("from is independent")
-		}
-		if from.IsDisowned() {
-			panic("from is already disowned")
-		}
-		if to != nil {
-			if to.parent == nil {
-				panic("to is independent")
-			}
-			if to.IsDisowned() {
-				panic("to is already disowned")
-			}
-		}
-	}
 	vfs.mountMu.Lock()
 	if mntns.mountpoints[from] != 0 {
 		vfs.mountMu.Unlock()
@@ -374,24 +251,14 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
 // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file
 // that was replaced by from.
 //
-// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and
-// to.Parent().
-//
 // Preconditions: PrepareRenameDentry was previously called on from and to.
-// newParent.Child(newName) == to.
-func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) {
-	if newParent.children == nil {
-		newParent.children = make(map[string]*Dentry)
-	}
-	newParent.children[newName] = from
-	from.parent = newParent
-	from.name = newName
+func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) {
 	from.mu.Unlock()
 	if to != nil {
-		to.setDisowned()
+		to.dead = true
 		to.mu.Unlock()
 		if to.isMounted() {
-			vfs.forgetDisownedMountpoint(to)
+			vfs.forgetDeadMountpoint(to)
 		}
 	}
 }
@@ -399,25 +266,18 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry,
 // CommitRenameExchangeDentry must be called after the files represented by
 // from and to are exchanged by rename(RENAME_EXCHANGE).
 //
-// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and
-// to.Parent().
-//
 // Preconditions: PrepareRenameDentry was previously called on from and to.
 func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
-	from.parent, to.parent = to.parent, from.parent
-	from.name, to.name = to.name, from.name
-	from.parent.children[from.name] = from
-	to.parent.children[to.name] = to
 	from.mu.Unlock()
 	to.mu.Unlock()
 }
 
-// forgetDisownedMountpoint is called when a mount point is deleted to umount
-// all mounts using it in all other mount namespaces.
+// forgetDeadMountpoint is called when a mount point is deleted or invalidated
+// to umount all mounts using it in all other mount namespaces.
 //
-// forgetDisownedMountpoint is analogous to Linux's
+// forgetDeadMountpoint is analogous to Linux's
 // fs/namespace.c:__detach_mounts().
-func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) {
+func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
 	var (
 		vdsToDecRef    []VirtualDentry
 		mountsToDecRef []*Mount
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 3af2aa58d..1e9dffc8f 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -56,6 +56,7 @@ type Device interface {
 	Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
 }
 
+// +stateify savable
 type registeredDevice struct {
 	dev  Device
 	opts RegisterDeviceOptions
@@ -63,6 +64,8 @@ type registeredDevice struct {
 
 // RegisterDeviceOptions contains options to
 // VirtualFilesystem.RegisterDevice().
+//
+// +stateify savable
 type RegisterDeviceOptions struct {
 	// GroupName is the name shown for this device registration in
 	// /proc/devices. If GroupName is empty, this registration will not be
@@ -100,7 +103,7 @@ func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mo
 }
 
 // GetAnonBlockDevMinor allocates and returns an unused minor device number for
-// an "anonymous" block device with major number 0.
+// an "anonymous" block device with major number UNNAMED_MAJOR.
 func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) {
 	vfs.anonBlockDevMinorMu.Lock()
 	defer vfs.anonBlockDevMinorMu.Unlock()
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 7c83f9a5a..8297f964b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -85,8 +85,8 @@ type epollInterest struct {
 	ready bool
 	epollInterestEntry
 
-	// userData is the epoll_data_t associated with this epollInterest.
-	// userData is protected by epoll.mu.
+	// userData is the struct epoll_event::data associated with this
+	// epollInterest. userData is protected by epoll.mu.
 	userData [2]int32
 }
 
@@ -99,6 +99,8 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
 		interest: make(map[epollInterestKey]*epollInterest),
 	}
 	if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
 		UseDentryMetadata: true,
 	}); err != nil {
 		return nil, err
@@ -157,7 +159,7 @@ func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (
 // AddInterest implements the semantics of EPOLL_CTL_ADD.
 //
 // Preconditions: A reference must be held on file.
-func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
 	// Check for cyclic polling if necessary.
 	subep, _ := file.impl.(*EpollInstance)
 	if subep != nil {
@@ -183,13 +185,14 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint
 	}
 
 	// Register interest in file.
-	mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
 	epi := &epollInterest{
 		epoll:    ep,
 		key:      key,
 		mask:     mask,
-		userData: userData,
+		userData: event.Data,
 	}
+	epi.waiter.Callback = epi
 	ep.interest[key] = epi
 	wmask := waiter.EventMaskFromLinux(mask)
 	file.EventRegister(&epi.waiter, wmask)
@@ -202,6 +205,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint
 	// Add epi to file.epolls so that it is removed when the last
 	// FileDescription reference is dropped.
 	file.epollMu.Lock()
+	if file.epolls == nil {
+		file.epolls = make(map[*epollInterest]struct{})
+	}
 	file.epolls[epi] = struct{}{}
 	file.epollMu.Unlock()
 
@@ -236,7 +242,7 @@ func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursi
 // ModifyInterest implements the semantics of EPOLL_CTL_MOD.
 //
 // Preconditions: A reference must be held on file.
-func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
 	ep.interestMu.Lock()
 	defer ep.interestMu.Unlock()
 
@@ -250,13 +256,13 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask u
 	}
 
 	// Update epi for the next call to ep.ReadEvents().
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
 	ep.mu.Lock()
 	epi.mask = mask
-	epi.userData = userData
+	epi.userData = event.Data
 	ep.mu.Unlock()
 
 	// Re-register with the new mask.
-	mask |= linux.EPOLLERR | linux.EPOLLRDHUP
 	file.EventUnregister(&epi.waiter)
 	wmask := waiter.EventMaskFromLinux(mask)
 	file.EventRegister(&epi.waiter, wmask)
@@ -363,8 +369,7 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
 		// Report ievents.
 		events[i] = linux.EpollEvent{
 			Events: ievents.ToLinux(),
-			Fd:     epi.userData[0],
-			Data:   epi.userData[1],
+			Data:   epi.userData,
 		}
 		i++
 		if i == len(events) {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 5bac660c7..bb294563d 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -84,6 +84,13 @@ type FileDescriptionOptions struct {
 	// usually only the case if O_DIRECT would actually have an effect.
 	AllowDirectIO bool
 
+	// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
+	DenyPRead bool
+
+	// If DenyPWrite is true, calls to FileDescription.PWrite() return
+	// ESPIPE.
+	DenyPWrite bool
+
 	// If UseDentryMetadata is true, calls to FileDescription methods that
 	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
 	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
@@ -100,10 +107,10 @@ type FileDescriptionOptions struct {
 }
 
 // Init must be called before first use of fd. If it succeeds, it takes
-// references on mnt and d. statusFlags is the initial file description status
-// flags, which is usually the full set of flags passed to open(2).
-func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
-	writable := MayWriteFileWithOpenFlags(statusFlags)
+// references on mnt and d. flags is the initial file description flags, which
+// is usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
+	writable := MayWriteFileWithOpenFlags(flags)
 	if writable {
 		if err := mnt.CheckBeginWrite(); err != nil {
 			return err
@@ -111,14 +118,18 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
 	}
 
 	fd.refs = 1
-	fd.statusFlags = statusFlags | linux.O_LARGEFILE
+
+	// Remove "file creation flags" to mirror the behavior from file.f_flags in
+	// fs/open.c:do_dentry_open
+	fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC)
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
 	}
-	fd.vd.IncRef()
+	mnt.IncRef()
+	d.IncRef()
 	fd.opts = *opts
-	fd.readable = MayReadFileWithOpenFlags(statusFlags)
+	fd.readable = MayReadFileWithOpenFlags(flags)
 	fd.writable = writable
 	fd.impl = impl
 	return nil
@@ -175,6 +186,12 @@ func (fd *FileDescription) DecRef() {
 	}
 }
 
+// Refs returns the current number of references. The returned count
+// is inherently racy and is unsafe to use without external synchronization.
+func (fd *FileDescription) Refs() int64 {
+	return atomic.LoadInt64(&fd.refs)
+}
+
 // Mount returns the mount on which fd was opened. It does not take a reference
 // on the returned Mount.
 func (fd *FileDescription) Mount() *Mount {
@@ -193,6 +210,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry {
 	return fd.vd
 }
 
+// Options returns the options passed to fd.Init().
+func (fd *FileDescription) Options() FileDescriptionOptions {
+	return fd.opts
+}
+
 // StatusFlags returns file description status flags, as for fcntl(F_GETFL).
 func (fd *FileDescription) StatusFlags() uint32 {
 	return atomic.LoadUint32(&fd.statusFlags)
@@ -286,7 +308,8 @@ type FileDescriptionImpl interface {
 	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
 
 	// SetStat updates metadata for the file represented by the
-	// FileDescription.
+	// FileDescription. Implementations are responsible for checking if the
+	// operation can be performed (see vfs.CheckSetStat() for common checks).
 	SetStat(ctx context.Context, opts SetStatOptions) error
 
 	// StatFS returns metadata for the filesystem containing the file
@@ -305,6 +328,7 @@ type FileDescriptionImpl interface {
 	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
 	//
 	// Preconditions: The FileDescription was opened for reading.
+	// FileDescriptionOptions.DenyPRead == false.
 	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
 
 	// Read is similar to PRead, but does not specify an offset.
@@ -336,6 +360,7 @@ type FileDescriptionImpl interface {
 	// EOPNOTSUPP.
 	//
 	// Preconditions: The FileDescription was opened for writing.
+	// FileDescriptionOptions.DenyPWrite == false.
 	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
 
 	// Write is similar to PWrite, but does not specify an offset, which is
@@ -381,11 +406,11 @@ type FileDescriptionImpl interface {
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
 
 	// Listxattr returns all extended attribute names for the file.
-	Listxattr(ctx context.Context) ([]string, error)
+	Listxattr(ctx context.Context, size uint64) ([]string, error)
 
 	// Getxattr returns the value associated with the given extended attribute
 	// for the file.
-	Getxattr(ctx context.Context, name string) (string, error)
+	Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
 
 	// Setxattr changes the value associated with the given extended attribute
 	// for the file.
@@ -435,11 +460,11 @@ type Dirent struct {
 
 // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
 type IterDirentsCallback interface {
-	// Handle handles the given iterated Dirent. It returns true if iteration
-	// should continue, and false if FileDescriptionImpl.IterDirents should
-	// terminate now and restart with the same Dirent the next time it is
-	// called.
-	Handle(dirent Dirent) bool
+	// Handle handles the given iterated Dirent. If Handle returns a non-nil
+	// error, FileDescriptionImpl.IterDirents must stop iteration and return
+	// the error; the next call to FileDescriptionImpl.IterDirents should
+	// restart with the same Dirent.
+	Handle(dirent Dirent) error
 }
 
 // OnClose is called when a file descriptor representing the FileDescription is
@@ -514,6 +539,9 @@ func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
 // offset, and returns the number of bytes read. PRead is permitted to return
 // partial reads with a nil error.
 func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	if fd.opts.DenyPRead {
+		return 0, syserror.ESPIPE
+	}
 	if !fd.readable {
 		return 0, syserror.EBADF
 	}
@@ -532,6 +560,9 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt
 // offset, and returns the number of bytes written. PWrite is permitted to
 // return partial writes with a nil error.
 func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	if fd.opts.DenyPWrite {
+		return 0, syserror.ESPIPE
+	}
 	if !fd.writable {
 		return 0, syserror.EBADF
 	}
@@ -576,18 +607,23 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 
 // Listxattr returns all extended attribute names for the file represented by
 // fd.
-func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+//
+// If the size of the list (including a NUL terminating byte after every entry)
+// would exceed size, ERANGE may be returned. Note that implementations
+// are free to ignore size entirely and return without error). In all cases,
+// if size is 0, the list should be returned without error, regardless of size.
+func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
 		vfsObj.putResolvingPath(rp)
 		return names, err
 	}
-	names, err := fd.impl.Listxattr(ctx)
+	names, err := fd.impl.Listxattr(ctx, size)
 	if err == syserror.ENOTSUP {
 		// Linux doesn't actually return ENOTSUP in this case; instead,
 		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -600,34 +636,39 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
 
 // Getxattr returns the value associated with the given extended attribute for
 // the file represented by fd.
-func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+//
+// If the size of the return value exceeds opts.Size, ERANGE may be returned
+// (note that implementations are free to ignore opts.Size entirely and return
+// without error). In all cases, if opts.Size is 0, the value should be
+// returned without error, regardless of size.
+func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(rp)
 		return val, err
 	}
-	return fd.impl.Getxattr(ctx, name)
+	return fd.impl.Getxattr(ctx, *opts)
 }
 
 // Setxattr changes the value associated with the given extended attribute for
 // the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(rp)
 		return err
 	}
-	return fd.impl.Setxattr(ctx, opts)
+	return fd.impl.Setxattr(ctx, *opts)
 }
 
 // Removexattr removes the given extended attribute from the file represented
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index c2a52ec1b..f4c111926 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -33,8 +33,8 @@ import (
 // implementations to adapt:
 //   - Have a local fileDescription struct (containing FileDescription) which
 //     embeds FileDescriptionDefaultImpl and overrides the default methods
-//     which are common to all fd implementations for that for that filesystem
-//     like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
+//     which are common to all fd implementations for that filesystem like
+//     StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
 //   - This should be embedded in all file description implementations as the
 //     first field by value.
 //   - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl.
@@ -130,14 +130,14 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 
 // Listxattr implements FileDescriptionImpl.Listxattr analogously to
 // inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
 	// This isn't exactly accurate; see FileDescription.Listxattr.
 	return nil, syserror.ENOTSUP
 }
 
 // Getxattr implements FileDescriptionImpl.Getxattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
 	return "", syserror.ENOTSUP
 }
 
@@ -339,6 +339,11 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src
 	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
+	limit, err := CheckLimit(ctx, offset, src.NumBytes())
+	if err != nil {
+		return 0, err
+	}
+	src = src.TakeFirst64(limit)
 
 	writable, ok := fd.data.(WritableDynamicBytesSource)
 	if !ok {
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 8fa26418e..3a75d4d62 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -107,7 +107,10 @@ func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
 func TestGenCountFD(t *testing.T) {
 	ctx := contexttest.Context(t)
 
-	vfsObj := New() // vfs.New()
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
 	defer fd.DecRef()
 
@@ -162,7 +165,10 @@ func TestGenCountFD(t *testing.T) {
 func TestWritable(t *testing.T) {
 	ctx := contexttest.Context(t)
 
-	vfsObj := New() // vfs.New()
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
 	fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
 	defer fd.DecRef()
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index a06a6caf3..1edd584c9 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -20,6 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 )
 
 // A Filesystem is a tree of nodes represented by Dentries, which forms part of
@@ -29,6 +31,8 @@ import (
 // Filesystem methods require that a reference is held.
 //
 // Filesystem is analogous to Linux's struct super_block.
+//
+// +stateify savable
 type Filesystem struct {
 	// refs is the reference count. refs is accessed using atomic memory
 	// operations.
@@ -38,21 +42,30 @@ type Filesystem struct {
 	// immutable.
 	vfs *VirtualFilesystem
 
+	// fsType is the FilesystemType of this Filesystem.
+	fsType FilesystemType
+
 	// impl is the FilesystemImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in Dentry.
 	impl FilesystemImpl
 }
 
 // Init must be called before first use of fs.
-func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
+func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
 	fs.refs = 1
 	fs.vfs = vfsObj
+	fs.fsType = fsType
 	fs.impl = impl
 	vfsObj.filesystemsMu.Lock()
 	vfsObj.filesystems[fs] = struct{}{}
 	vfsObj.filesystemsMu.Unlock()
 }
 
+// FilesystemType returns the FilesystemType for this Filesystem.
+func (fs *Filesystem) FilesystemType() FilesystemType {
+	return fs.fsType
+}
+
 // VirtualFilesystem returns the containing VirtualFilesystem.
 func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem {
 	return fs.vfs
@@ -142,6 +155,9 @@ type FilesystemImpl interface {
 	// file data to be written to the underlying [filesystem]", as by syncfs(2).
 	Sync(ctx context.Context) error
 
+	// AccessAt checks whether a user with creds can access the file at rp.
+	AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error
+
 	// GetDentryAt returns a Dentry representing the file at rp. A reference is
 	// taken on the returned Dentry.
 	//
@@ -330,7 +346,10 @@ type FilesystemImpl interface {
 	// ENOTEMPTY.
 	//
 	// Preconditions: !rp.Done(). For the final path component in rp,
-	// !rp.ShouldFollowSymlink(). oldName is not "." or "..".
+	// !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
+	// previous call to
+	// oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
+	// not "." or "..".
 	//
 	// Postconditions: If RenameAt returns an error returned by
 	// ResolvingPath.Resolve*(), then !rp.Done().
@@ -360,7 +379,9 @@ type FilesystemImpl interface {
 	// ResolvingPath.Resolve*(), then !rp.Done().
 	RmdirAt(ctx context.Context, rp *ResolvingPath) error
 
-	// SetStatAt updates metadata for the file at the given path.
+	// SetStatAt updates metadata for the file at the given path. Implementations
+	// are responsible for checking if the operation can be performed
+	// (see vfs.CheckSetStat() for common checks).
 	//
 	// Errors:
 	//
@@ -422,9 +443,14 @@ type FilesystemImpl interface {
 	// Errors:
 	//
 	// - If extended attributes are not supported by the filesystem,
-	// ListxattrAt returns nil. (See FileDescription.Listxattr for an
-	// explanation.)
-	ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+	// ListxattrAt returns ENOTSUP.
+	//
+	// - If the size of the list (including a NUL terminating byte after every
+	// entry) would exceed size, ERANGE may be returned. Note that
+	// implementations are free to ignore size entirely and return without
+	// error). In all cases, if size is 0, the list should be returned without
+	// error, regardless of size.
+	ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
 
 	// GetxattrAt returns the value associated with the given extended
 	// attribute for the file at rp.
@@ -433,7 +459,15 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem, GetxattrAt
 	// returns ENOTSUP.
-	GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+	//
+	// - If an extended attribute named opts.Name does not exist, ENODATA is
+	// returned.
+	//
+	// - If the size of the return value exceeds opts.Size, ERANGE may be
+	// returned (note that implementations are free to ignore opts.Size entirely
+	// and return without error). In all cases, if opts.Size is 0, the value
+	// should be returned without error, regardless of size.
+	GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
 
 	// SetxattrAt changes the value associated with the given extended
 	// attribute for the file at rp.
@@ -442,6 +476,10 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem, SetxattrAt
 	// returns ENOTSUP.
+	//
+	// - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
+	// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
+	// ENODATA is returned.
 	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
 
 	// RemovexattrAt removes the given extended attribute from the file at rp.
@@ -450,8 +488,21 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem,
 	// RemovexattrAt returns ENOTSUP.
+	//
+	// - If name does not exist, ENODATA is returned.
 	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
 
+	// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
+	//
+	// Errors:
+	//
+	// - If the file does not have write permissions, then BoundEndpointAt
+	// returns EACCES.
+	//
+	// - If a non-socket file exists at rp, then BoundEndpointAt returns
+	// ECONNREFUSED.
+	BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error)
+
 	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
 	//
 	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
@@ -474,7 +525,7 @@ type FilesystemImpl interface {
 	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
 	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
 
-	// TODO: inotify_add_watch(); bind()
+	// TODO(gvisor.dev/issue/1479): inotify_add_watch()
 }
 
 // PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 7315a588e..465e610e0 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,8 +16,6 @@ package vfs
 
 import (
 	"strings"
-
-	"gvisor.dev/gvisor/pkg/fspath"
 )
 
 // GenericParseMountOptions parses a comma-separated list of options of the
@@ -43,27 +41,3 @@ func GenericParseMountOptions(str string) map[string]string {
 	}
 	return m
 }
-
-// GenericPrependPath may be used by implementations of
-// FilesystemImpl.PrependPath() for which a single statically-determined lock
-// or set of locks is sufficient to ensure its preconditions (as opposed to
-// e.g. per-Dentry locks).
-//
-// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for
-// vd.Dentry() and all of its ancestors.
-func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error {
-	mnt, d := vd.mount, vd.dentry
-	for {
-		if mnt == vfsroot.mount && d == vfsroot.dentry {
-			return PrependPathAtVFSRootError{}
-		}
-		if d == mnt.root {
-			return nil
-		}
-		if d.parent == nil {
-			return PrependPathAtNonMountRootError{}
-		}
-		b.PrependComponent(d.name)
-		d = d.parent
-	}
-}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c58b70728..f2298f7f6 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -30,6 +30,9 @@ type FilesystemType interface {
 	// along with its mount root. A reference is taken on the returned
 	// Filesystem and Dentry.
 	GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error)
+
+	// Name returns the name of this FilesystemType.
+	Name() string
 }
 
 // GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
@@ -44,6 +47,7 @@ type GetFilesystemOptions struct {
 	InternalData interface{}
 }
 
+// +stateify savable
 type registeredFilesystemType struct {
 	fsType FilesystemType
 	opts   RegisterFilesystemTypeOptions
diff --git a/pkg/sentry/vfs/genericfstree/BUILD b/pkg/sentry/vfs/genericfstree/BUILD
new file mode 100644
index 000000000..d8fd92677
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/BUILD
@@ -0,0 +1,16 @@
+load("//tools/go_generics:defs.bzl", "go_template")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+go_template(
+    name = "generic_fstree",
+    srcs = [
+        "genericfstree.go",
+    ],
+    types = [
+        "Dentry",
+    ],
+)
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
new file mode 100644
index 000000000..8882fa84a
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -0,0 +1,81 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package genericfstree provides tools for implementing vfs.FilesystemImpls
+// where a single statically-determined lock or set of locks is sufficient to
+// ensure that a Dentry's name and parent are contextually immutable.
+//
+// Clients using this package must use the go_template_instance rule in
+// tools/go_generics/defs.bzl to create an instantiation of this template
+// package, providing types to use in place of Dentry.
+package genericfstree
+
+import (
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Dentry is a required type parameter that is a struct with the given fields.
+type Dentry struct {
+	// vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
+	vfsd vfs.Dentry
+
+	// parent is the parent of this Dentry in the filesystem's tree. If this
+	// Dentry is a filesystem root, parent is nil.
+	parent *Dentry
+
+	// name is the name of this Dentry in its parent. If this Dentry is a
+	// filesystem root, name is unspecified.
+	name string
+}
+
+// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
+// either d2's parent or an ancestor of d2's parent.
+func IsAncestorDentry(d, d2 *Dentry) bool {
+	for d2 != nil { // Stop at root, where d2.parent == nil.
+		if d2.parent == d {
+			return true
+		}
+		if d2.parent == d2 {
+			return false
+		}
+		d2 = d2.parent
+	}
+	return false
+}
+
+// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
+func ParentOrSelf(d *Dentry) *Dentry {
+	if d.parent != nil {
+		return d.parent
+	}
+	return d
+}
+
+// PrependPath is a generic implementation of FilesystemImpl.PrependPath().
+func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error {
+	for {
+		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+			return vfs.PrependPathAtVFSRootError{}
+		}
+		if &d.vfsd == mnt.Root() {
+			return nil
+		}
+		if d.parent == nil {
+			return vfs.PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
new file mode 100644
index 000000000..05a3051a4
--- /dev/null
+++ b/pkg/sentry/vfs/inotify.go
@@ -0,0 +1,697 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
+// must be a power 2 for rounding below.
+const inotifyEventBaseSize = 16
+
+// EventType defines different kinds of inotfiy events.
+//
+// The way events are labelled appears somewhat arbitrary, but they must match
+// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+type EventType uint8
+
+// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
+// FSNOTIFY_EVENT_INODE in Linux.
+const (
+	PathEvent  EventType = iota
+	InodeEvent EventType = iota
+)
+
+// Inotify represents an inotify instance created by inotify_init(2) or
+// inotify_init1(2). Inotify implements FileDescriptionImpl.
+//
+// Lock ordering:
+//   Inotify.mu -> Watches.mu -> Inotify.evMu
+//
+// +stateify savable
+type Inotify struct {
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+	DentryMetadataFileDescriptionImpl
+
+	// Unique identifier for this inotify instance. We don't just reuse the
+	// inotify fd because fds can be duped. These should not be exposed to the
+	// user, since we may aggressively reuse an id on S/R.
+	id uint64
+
+	// queue is used to notify interested parties when the inotify instance
+	// becomes readable or writable.
+	queue waiter.Queue `state:"nosave"`
+
+	// evMu *only* protects the events list. We need a separate lock while
+	// queuing events: using mu may violate lock ordering, since at that point
+	// the calling goroutine may already hold Watches.mu.
+	evMu sync.Mutex `state:"nosave"`
+
+	// A list of pending events for this inotify instance. Protected by evMu.
+	events eventList
+
+	// A scratch buffer, used to serialize inotify events. Allocate this
+	// ahead of time for the sake of performance. Protected by evMu.
+	scratch []byte
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// nextWatchMinusOne is used to allocate watch descriptors on this Inotify
+	// instance. Note that Linux starts numbering watch descriptors from 1.
+	nextWatchMinusOne int32
+
+	// Map from watch descriptors to watch objects.
+	watches map[int32]*Watch
+}
+
+var _ FileDescriptionImpl = (*Inotify)(nil)
+
+// NewInotifyFD constructs a new Inotify instance.
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
+	// O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
+	flags &^= linux.O_CLOEXEC
+	if flags&^linux.O_NONBLOCK != 0 {
+		return nil, syserror.EINVAL
+	}
+
+	id := uniqueid.GlobalFromContext(ctx)
+	vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
+	defer vd.DecRef()
+	fd := &Inotify{
+		id:      id,
+		scratch: make([]byte, inotifyEventBaseSize),
+		watches: make(map[int32]*Watch),
+	}
+	if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release. Release removes all
+// watches and frees all resources for an inotify instance.
+func (i *Inotify) Release() {
+	// We need to hold i.mu to avoid a race with concurrent calls to
+	// Inotify.handleDeletion from Watches. There's no risk of Watches
+	// accessing this Inotify after the destructor ends, because we remove all
+	// references to it below.
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	for _, w := range i.watches {
+		// Remove references to the watch from the watches set on the target. We
+		// don't need to worry about the references from i.watches, since this
+		// file description is about to be destroyed.
+		w.set.Remove(i.id)
+	}
+}
+
+// EventRegister implements waiter.Waitable.
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	i.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.
+func (i *Inotify) EventUnregister(e *waiter.Entry) {
+	i.queue.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// Readiness indicates whether there are pending events for an inotify instance.
+func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
+	ready := waiter.EventMask(0)
+
+	i.evMu.Lock()
+	defer i.evMu.Unlock()
+
+	if !i.events.Empty() {
+		ready |= waiter.EventIn
+	}
+
+	return mask & ready
+}
+
+// PRead implements FileDescriptionImpl.
+func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// PWrite implements FileDescriptionImpl.
+func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	if dst.NumBytes() < inotifyEventBaseSize {
+		return 0, syserror.EINVAL
+	}
+
+	i.evMu.Lock()
+	defer i.evMu.Unlock()
+
+	if i.events.Empty() {
+		// Nothing to read yet, tell caller to block.
+		return 0, syserror.ErrWouldBlock
+	}
+
+	var writeLen int64
+	for it := i.events.Front(); it != nil; {
+		// Advance `it` before the element is removed from the list, or else
+		// it.Next() will always be nil.
+		event := it
+		it = it.Next()
+
+		// Does the buffer have enough remaining space to hold the event we're
+		// about to write out?
+		if dst.NumBytes() < int64(event.sizeOf()) {
+			if writeLen > 0 {
+				// Buffer wasn't big enough for all pending events, but we did
+				// write some events out.
+				return writeLen, nil
+			}
+			return 0, syserror.EINVAL
+		}
+
+		// Linux always dequeues an available event as long as there's enough
+		// buffer space to copy it out, even if the copy below fails. Emulate
+		// this behaviour.
+		i.events.Remove(event)
+
+		// Buffer has enough space, copy event to the read buffer.
+		n, err := event.CopyTo(ctx, i.scratch, dst)
+		if err != nil {
+			return 0, err
+		}
+
+		writeLen += n
+		dst = dst.DropFirst64(n)
+	}
+	return writeLen, nil
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch args[1].Int() {
+	case linux.FIONREAD:
+		i.evMu.Lock()
+		defer i.evMu.Unlock()
+		var n uint32
+		for e := i.events.Front(); e != nil; e = e.Next() {
+			n += uint32(e.sizeOf())
+		}
+		var buf [4]byte
+		usermem.ByteOrder.PutUint32(buf[:], n)
+		_, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+func (i *Inotify) queueEvent(ev *Event) {
+	i.evMu.Lock()
+
+	// Check if we should coalesce the event we're about to queue with the last
+	// one currently in the queue. Events are coalesced if they are identical.
+	if last := i.events.Back(); last != nil {
+		if ev.equals(last) {
+			// "Coalesce" the two events by simply not queuing the new one. We
+			// don't need to raise a waiter.EventIn notification because no new
+			// data is available for reading.
+			i.evMu.Unlock()
+			return
+		}
+	}
+
+	i.events.PushBack(ev)
+
+	// Release mutex before notifying waiters because we don't control what they
+	// can do.
+	i.evMu.Unlock()
+
+	i.queue.Notify(waiter.EventIn)
+}
+
+// newWatchLocked creates and adds a new watch to target.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch {
+	targetWatches := target.Watches()
+	w := &Watch{
+		owner: i,
+		wd:    i.nextWatchIDLocked(),
+		set:   targetWatches,
+		mask:  mask,
+	}
+
+	// Hold the watch in this inotify instance as well as the watch set on the
+	// target.
+	i.watches[w.wd] = w
+	targetWatches.Add(w)
+	return w
+}
+
+// newWatchIDLocked allocates and returns a new watch descriptor.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) nextWatchIDLocked() int32 {
+	i.nextWatchMinusOne++
+	return i.nextWatchMinusOne
+}
+
+// handleDeletion handles the deletion of the target of watch w. It removes w
+// from i.watches and a watch removal event is generated.
+func (i *Inotify) handleDeletion(w *Watch) {
+	i.mu.Lock()
+	_, found := i.watches[w.wd]
+	delete(i.watches, w.wd)
+	i.mu.Unlock()
+
+	if found {
+		i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0))
+	}
+}
+
+// AddWatch constructs a new inotify watch and adds it to the target. It
+// returns the watch descriptor returned by inotify_add_watch(2).
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
+	// Note: Locking this inotify instance protects the result returned by
+	// Lookup() below. With the lock held, we know for sure the lookup result
+	// won't become stale because it's impossible for *this* instance to
+	// add/remove watches on target.
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	// Does the target already have a watch from this inotify instance?
+	if existing := target.Watches().Lookup(i.id); existing != nil {
+		newmask := mask
+		if mask&linux.IN_MASK_ADD != 0 {
+			// "Add (OR) events to watch mask for this pathname if it already
+			// exists (instead of replacing mask)." -- inotify(7)
+			newmask |= atomic.LoadUint32(&existing.mask)
+		}
+		atomic.StoreUint32(&existing.mask, newmask)
+		return existing.wd
+	}
+
+	// No existing watch, create a new watch.
+	w := i.newWatchLocked(target, mask)
+	return w.wd
+}
+
+// RmWatch looks up an inotify watch for the given 'wd' and configures the
+// target to stop sending events to this inotify instance.
+func (i *Inotify) RmWatch(wd int32) error {
+	i.mu.Lock()
+
+	// Find the watch we were asked to removed.
+	w, ok := i.watches[wd]
+	if !ok {
+		i.mu.Unlock()
+		return syserror.EINVAL
+	}
+
+	// Remove the watch from this instance.
+	delete(i.watches, wd)
+
+	// Remove the watch from the watch target.
+	w.set.Remove(w.OwnerID())
+	i.mu.Unlock()
+
+	// Generate the event for the removal.
+	i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
+
+	return nil
+}
+
+// Watches is the collection of all inotify watches on a single file.
+//
+// +stateify savable
+type Watches struct {
+	// mu protects the fields below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// ws is the map of active watches in this collection, keyed by the inotify
+	// instance id of the owner.
+	ws map[uint64]*Watch
+}
+
+// Lookup returns the watch owned by an inotify instance with the given id.
+// Returns nil if no such watch exists.
+//
+// Precondition: the inotify instance with the given id must be locked to
+// prevent the returned watch from being concurrently modified or replaced in
+// Inotify.watches.
+func (w *Watches) Lookup(id uint64) *Watch {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return w.ws[id]
+}
+
+// Add adds watch into this set of watches.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Add(watch *Watch) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	owner := watch.OwnerID()
+	// Sanity check, we should never have two watches for one owner on the
+	// same target.
+	if _, exists := w.ws[owner]; exists {
+		panic(fmt.Sprintf("Watch collision with ID %+v", owner))
+	}
+	if w.ws == nil {
+		w.ws = make(map[uint64]*Watch)
+	}
+	w.ws[owner] = watch
+}
+
+// Remove removes a watch with the given id from this set of watches and
+// releases it. The caller is responsible for generating any watch removal
+// event, as appropriate. The provided id must match an existing watch in this
+// collection.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Remove(id uint64) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	if w.ws == nil {
+		// This watch set is being destroyed. The thread executing the
+		// destructor is already in the process of deleting all our watches. We
+		// got here with no references on the target because we raced with the
+		// destructor notifying all the watch owners of destruction. See the
+		// comment in Watches.HandleDeletion for why this race exists.
+		return
+	}
+
+	if _, ok := w.ws[id]; !ok {
+		// While there's technically no problem with silently ignoring a missing
+		// watch, this is almost certainly a bug.
+		panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id))
+	}
+	delete(w.ws, id)
+}
+
+// Notify queues a new event with all watches in this set.
+func (w *Watches) Notify(name string, events, cookie uint32, et EventType) {
+	w.NotifyWithExclusions(name, events, cookie, et, false)
+}
+
+// NotifyWithExclusions queues a new event with watches in this set. Watches
+// with IN_EXCL_UNLINK are skipped if the event is coming from a child that
+// has been unlinked.
+func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) {
+	// N.B. We don't defer the unlocks because Notify is in the hot path of
+	// all IO operations, and the defer costs too much for small IO
+	// operations.
+	w.mu.RLock()
+	for _, watch := range w.ws {
+		if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent {
+			continue
+		}
+		watch.Notify(name, events, cookie)
+	}
+	w.mu.RUnlock()
+}
+
+// HandleDeletion is called when the watch target is destroyed to emit
+// the appropriate events.
+func (w *Watches) HandleDeletion() {
+	w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent)
+
+	// TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied
+	// by value. Ideally, we wouldn't have this circular locking so we can just
+	// notify of IN_DELETE_SELF in the same loop below.
+	//
+	// We can't hold w.mu while calling watch.handleDeletion to preserve lock
+	// ordering w.r.t to the owner inotify instances. Instead, atomically move
+	// the watches map into a local variable so we can iterate over it safely.
+	//
+	// Because of this however, it is possible for the watches' owners to reach
+	// this inode while the inode has no refs. This is still safe because the
+	// owners can only reach the inode until this function finishes calling
+	// watch.handleDeletion below and the inode is guaranteed to exist in the
+	// meantime. But we still have to be very careful not to rely on inode state
+	// that may have been already destroyed.
+	var ws map[uint64]*Watch
+	w.mu.Lock()
+	ws = w.ws
+	w.ws = nil
+	w.mu.Unlock()
+
+	for _, watch := range ws {
+		// TODO(gvisor.dev/issue/1479): consider refactoring this.
+		watch.handleDeletion()
+	}
+}
+
+// Watch represent a particular inotify watch created by inotify_add_watch.
+//
+// +stateify savable
+type Watch struct {
+	// Inotify instance which owns this watch.
+	owner *Inotify
+
+	// Descriptor for this watch. This is unique across an inotify instance.
+	wd int32
+
+	// set is the watch set containing this watch. It belongs to the target file
+	// of this watch.
+	set *Watches
+
+	// Events being monitored via this watch. Must be accessed with atomic
+	// memory operations.
+	mask uint32
+}
+
+// OwnerID returns the id of the inotify instance that owns this watch.
+func (w *Watch) OwnerID() uint64 {
+	return w.owner.id
+}
+
+// ExcludeUnlinkedChildren indicates whether the watched object should continue
+// to be notified of events of its children after they have been unlinked, e.g.
+// for an open file descriptor.
+//
+// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK.
+// We can do this by keeping track of the set of unlinked children in Watches
+// to skip notification.
+func (w *Watch) ExcludeUnlinkedChildren() bool {
+	return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
+}
+
+// Notify queues a new event on this watch.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) {
+	mask := atomic.LoadUint32(&w.mask)
+	if mask&events == 0 {
+		// We weren't watching for this event.
+		return
+	}
+
+	// Event mask should include bits matched from the watch plus all control
+	// event bits.
+	unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS
+	effectiveMask := unmaskableBits | mask
+	matchedEvents := effectiveMask & events
+	w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
+}
+
+// handleDeletion handles the deletion of w's target.
+func (w *Watch) handleDeletion() {
+	w.owner.handleDeletion(w)
+}
+
+// Event represents a struct inotify_event from linux.
+//
+// +stateify savable
+type Event struct {
+	eventEntry
+
+	wd     int32
+	mask   uint32
+	cookie uint32
+
+	// len is computed based on the name field is set automatically by
+	// Event.setName. It should be 0 when no name is set; otherwise it is the
+	// length of the name slice.
+	len uint32
+
+	// The name field has special padding requirements and should only be set by
+	// calling Event.setName.
+	name []byte
+}
+
+func newEvent(wd int32, name string, events, cookie uint32) *Event {
+	e := &Event{
+		wd:     wd,
+		mask:   events,
+		cookie: cookie,
+	}
+	if name != "" {
+		e.setName(name)
+	}
+	return e
+}
+
+// paddedBytes converts a go string to a null-terminated c-string, padded with
+// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes
+// in the 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte {
+	if l < uint32(len(s)+1) {
+		panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
+	}
+	b := make([]byte, l)
+	copy(b, s)
+
+	// b was zero-value initialized during make(), so the rest of the slice is
+	// already filled with null bytes.
+
+	return b
+}
+
+// setName sets the optional name for this event.
+func (e *Event) setName(name string) {
+	// We need to pad the name such that the entire event length ends up a
+	// multiple of inotifyEventBaseSize.
+	unpaddedLen := len(name) + 1
+	// Round up to nearest multiple of inotifyEventBaseSize.
+	e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1))
+	// Make sure we haven't overflowed and wrapped around when rounding.
+	if unpaddedLen > int(e.len) {
+		panic("Overflow when rounding inotify event size, the 'name' field was too big.")
+	}
+	e.name = paddedBytes(name, e.len)
+}
+
+func (e *Event) sizeOf() int {
+	s := inotifyEventBaseSize + int(e.len)
+	if s < inotifyEventBaseSize {
+		panic("overflow")
+	}
+	return s
+}
+
+// CopyTo serializes this event to dst. buf is used as a scratch buffer to
+// construct the output. We use a buffer allocated ahead of time for
+// performance. buf must be at least inotifyEventBaseSize bytes.
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
+	usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
+	usermem.ByteOrder.PutUint32(buf[4:], e.mask)
+	usermem.ByteOrder.PutUint32(buf[8:], e.cookie)
+	usermem.ByteOrder.PutUint32(buf[12:], e.len)
+
+	writeLen := 0
+
+	n, err := dst.CopyOut(ctx, buf)
+	if err != nil {
+		return 0, err
+	}
+	writeLen += n
+	dst = dst.DropFirst(n)
+
+	if e.len > 0 {
+		n, err = dst.CopyOut(ctx, e.name)
+		if err != nil {
+			return 0, err
+		}
+		writeLen += n
+	}
+
+	// Santiy check.
+	if writeLen != e.sizeOf() {
+		panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
+	}
+
+	return int64(writeLen), nil
+}
+
+func (e *Event) equals(other *Event) bool {
+	return e.wd == other.wd &&
+		e.mask == other.mask &&
+		e.cookie == other.cookie &&
+		e.len == other.len &&
+		bytes.Equal(e.name, other.name)
+}
+
+// InotifyEventFromStatMask generates the appropriate events for an operation
+// that set the stats specified in mask.
+func InotifyEventFromStatMask(mask uint32) uint32 {
+	var ev uint32
+	if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
+		ev |= linux.IN_ATTRIB
+	}
+	if mask&linux.STATX_SIZE != 0 {
+		ev |= linux.IN_MODIFY
+	}
+
+	if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
+		// Both times indicates a utime(s) call.
+		ev |= linux.IN_ATTRIB
+	} else if mask&linux.STATX_ATIME != 0 {
+		ev |= linux.IN_ACCESS
+	} else if mask&linux.STATX_MTIME != 0 {
+		mask |= linux.IN_MODIFY
+	}
+	return ev
+}
+
+// InotifyRemoveChild sends the appriopriate notifications to the watch sets of
+// the child being removed and its parent.
+func InotifyRemoveChild(self, parent *Watches, name string) {
+	self.Notify("", linux.IN_ATTRIB, 0, InodeEvent)
+	parent.Notify(name, linux.IN_DELETE, 0, InodeEvent)
+	// TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK.
+}
+
+// InotifyRename sends the appriopriate notifications to the watch sets of the
+// file being renamed and its old/new parents.
+func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
+	var dirEv uint32
+	if isDir {
+		dirEv = linux.IN_ISDIR
+	}
+	cookie := uniqueid.InotifyCookie(ctx)
+	oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent)
+	newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent)
+	// Somewhat surprisingly, self move events do not have a cookie.
+	renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent)
+}
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD
new file mode 100644
index 000000000..d8c4d27b9
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "memxattr",
+    srcs = ["xattr.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
new file mode 100644
index 000000000..cc1e7d764
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memxattr provides a default, in-memory extended attribute
+// implementation.
+package memxattr
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SimpleExtendedAttributes implements extended attributes using a map of
+// names to values.
+//
+// +stateify savable
+type SimpleExtendedAttributes struct {
+	// mu protects the below fields.
+	mu     sync.RWMutex `state:"nosave"`
+	xattrs map[string]string
+}
+
+// Getxattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+	x.mu.RLock()
+	value, ok := x.xattrs[opts.Name]
+	x.mu.RUnlock()
+	if !ok {
+		return "", syserror.ENODATA
+	}
+	// Check that the size of the buffer provided in getxattr(2) is large enough
+	// to contain the value.
+	if opts.Size != 0 && uint64(len(value)) > opts.Size {
+		return "", syserror.ERANGE
+	}
+	return value, nil
+}
+
+// Setxattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+	x.mu.Lock()
+	defer x.mu.Unlock()
+	if x.xattrs == nil {
+		if opts.Flags&linux.XATTR_REPLACE != 0 {
+			return syserror.ENODATA
+		}
+		x.xattrs = make(map[string]string)
+	}
+
+	_, ok := x.xattrs[opts.Name]
+	if ok && opts.Flags&linux.XATTR_CREATE != 0 {
+		return syserror.EEXIST
+	}
+	if !ok && opts.Flags&linux.XATTR_REPLACE != 0 {
+		return syserror.ENODATA
+	}
+
+	x.xattrs[opts.Name] = opts.Value
+	return nil
+}
+
+// Listxattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+	// Keep track of the size of the buffer needed in listxattr(2) for the list.
+	listSize := 0
+	x.mu.RLock()
+	names := make([]string, 0, len(x.xattrs))
+	for n := range x.xattrs {
+		names = append(names, n)
+		// Add one byte per null terminator.
+		listSize += len(n) + 1
+	}
+	x.mu.RUnlock()
+	if size != 0 && uint64(listSize) > size {
+		return nil, syserror.ERANGE
+	}
+	return names, nil
+}
+
+// Removexattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+	x.mu.Lock()
+	defer x.mu.Unlock()
+	if _, ok := x.xattrs[name]; !ok {
+		return syserror.ENODATA
+	}
+	delete(x.xattrs, name)
+	return nil
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1fbb420f9..32f901bd8 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -15,7 +15,11 @@
 package vfs
 
 import (
+	"bytes"
+	"fmt"
 	"math"
+	"sort"
+	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -38,14 +42,23 @@ import (
 //
 // Mount is analogous to Linux's struct mount. (gVisor does not distinguish
 // between struct mount and struct vfsmount.)
+//
+// +stateify savable
 type Mount struct {
-	// vfs, fs, and root are immutable. References are held on fs and root.
+	// vfs, fs, root are immutable. References are held on fs and root.
 	//
 	// Invariant: root belongs to fs.
 	vfs  *VirtualFilesystem
 	fs   *Filesystem
 	root *Dentry
 
+	// ID is the immutable mount ID.
+	ID uint64
+
+	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+	// for MS_RDONLY which is tracked in "writers". Immutable.
+	Flags MountFlags
+
 	// key is protected by VirtualFilesystem.mountMu and
 	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
 	// key.parent and key.point if they are not nil.
@@ -79,13 +92,43 @@ type Mount struct {
 	writers int64
 }
 
-// A MountNamespace is a collection of Mounts.
-//
+func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
+	mnt := &Mount{
+		ID:    atomic.AddUint64(&vfs.lastMountID, 1),
+		Flags: opts.Flags,
+		vfs:   vfs,
+		fs:    fs,
+		root:  root,
+		ns:    mntns,
+		refs:  1,
+	}
+	if opts.ReadOnly {
+		mnt.setReadOnlyLocked(true)
+	}
+	return mnt
+}
+
+// Options returns a copy of the MountOptions currently applicable to mnt.
+func (mnt *Mount) Options() MountOptions {
+	mnt.vfs.mountMu.Lock()
+	defer mnt.vfs.mountMu.Unlock()
+	return MountOptions{
+		Flags:    mnt.Flags,
+		ReadOnly: mnt.readOnly(),
+	}
+}
+
+// A MountNamespace is a collection of Mounts.//
 // MountNamespaces are reference-counted. Unless otherwise specified, all
 // MountNamespace methods require that a reference is held.
 //
 // MountNamespace is analogous to Linux's struct mnt_namespace.
+//
+// +stateify savable
 type MountNamespace struct {
+	// Owner is the usernamespace that owns this mount namespace.
+	Owner *auth.UserNamespace
+
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
 
@@ -114,6 +157,7 @@ type MountNamespace struct {
 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
+		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
 		return nil, syserror.ENODEV
 	}
 	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -121,49 +165,63 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 		return nil, err
 	}
 	mntns := &MountNamespace{
+		Owner:       creds.UserNamespace,
 		refs:        1,
 		mountpoints: make(map[*Dentry]uint32),
 	}
-	mntns.root = &Mount{
-		vfs:  vfs,
-		fs:   fs,
-		root: root,
-		ns:   mntns,
-		refs: 1,
-	}
+	mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
 	return mntns, nil
 }
 
-// MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// NewDisconnectedMount returns a Mount representing fs with the given root
+// (which may be nil). The new Mount is not associated with any MountNamespace
+// and is not connected to any other Mounts. References are taken on fs and
+// root.
+func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
+	fs.IncRef()
+	if root != nil {
+		root.IncRef()
+	}
+	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
+}
+
+// MountDisconnected creates a Filesystem configured by the given arguments,
+// then returns a Mount representing it. The new Mount is not associated with
+// any MountNamespace and is not connected to any other Mounts.
+func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
-		return syserror.ENODEV
+		return nil, syserror.ENODEV
 	}
 	if !opts.InternalMount && !rft.opts.AllowUserMount {
-		return syserror.ENODEV
+		return nil, syserror.ENODEV
 	}
 	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
-		return err
+		return nil, err
 	}
+	defer root.DecRef()
+	defer fs.DecRef()
+	return vfs.NewDisconnectedMount(fs, root, opts)
+}
+
+// ConnectMountAt connects mnt at the path represented by target.
+//
+// Preconditions: mnt must be disconnected.
+func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
 	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
 	// lock ordering.
 	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
 	if err != nil {
-		root.DecRef()
-		fs.DecRef()
 		return err
 	}
 	vfs.mountMu.Lock()
 	vd.dentry.mu.Lock()
 	for {
-		if vd.dentry.IsDisowned() {
+		if vd.dentry.dead {
 			vd.dentry.mu.Unlock()
 			vfs.mountMu.Unlock()
 			vd.DecRef()
-			root.DecRef()
-			fs.DecRef()
 			return syserror.ENOENT
 		}
 		// vd might have been mounted over between vfs.GetDentryAt() and
@@ -192,17 +250,10 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 		}
 		vd.dentry.mu.Lock()
 	}
-	// TODO: Linux requires that either both the mount point and the mount root
-	// are directories, or neither are, and returns ENOTDIR if this is not the
-	// case.
+	// TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
+	// point and the mount root are directories, or neither are, and returns
+	// ENOTDIR if this is not the case.
 	mntns := vd.mount.ns
-	mnt := &Mount{
-		vfs:  vfs,
-		fs:   fs,
-		root: root,
-		ns:   mntns,
-		refs: 1,
-	}
 	vfs.mounts.seq.BeginWrite()
 	vfs.connectLocked(mnt, vd, mntns)
 	vfs.mounts.seq.EndWrite()
@@ -211,6 +262,19 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 	return nil
 }
 
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
+	if err != nil {
+		return err
+	}
+	defer mnt.DecRef()
+	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
+		return err
+	}
+	return nil
+}
+
 // UmountAt removes the Mount at the given path.
 func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
 	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
@@ -218,6 +282,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 	}
 
 	// MNT_FORCE is currently unimplemented except for the permission check.
+	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
+	// namespace, and not in the owner user namespace for the target mount. See
+	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
 	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
 		return syserror.EPERM
 	}
@@ -231,14 +298,17 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 		return syserror.EINVAL
 	}
 	vfs.mountMu.Lock()
-	if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
-		vfs.mountMu.Unlock()
-		return syserror.EINVAL
+	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
+		defer mntns.DecRef()
+		if mntns != vd.mount.ns {
+			vfs.mountMu.Unlock()
+			return syserror.EINVAL
+		}
 	}
 
-	// TODO(jamieliu): Linux special-cases umount of the caller's root, which
-	// we don't implement yet (we'll just fail it since the caller holds a
-	// reference on it).
+	// TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
+	// root, which we don't implement yet (we'll just fail it since the caller
+	// holds a reference on it).
 
 	vfs.mounts.seq.BeginWrite()
 	if opts.Flags&linux.MNT_DETACH == 0 {
@@ -330,14 +400,22 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu
 // references held by vd.
 //
 // Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. d.mu must be locked. mnt.parent() == nil.
+// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt
+// must not already be connected.
 func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+	if checkInvariants {
+		if mnt.parent() != nil {
+			panic("VFS.connectLocked called on connected mount")
+		}
+	}
+	mnt.IncRef() // dropped by callers of umountRecursiveLocked
 	mnt.storeKey(vd)
 	if vd.mount.children == nil {
 		vd.mount.children = make(map[*Mount]struct{})
 	}
 	vd.mount.children[mnt] = struct{}{}
 	atomic.AddUint32(&vd.dentry.mounts, 1)
+	mnt.ns = mntns
 	mntns.mountpoints[vd.dentry]++
 	vfs.mounts.insertSeqed(mnt)
 	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
@@ -355,6 +433,11 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
 // writer critical section. mnt.parent() != nil.
 func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
 	vd := mnt.loadKey()
+	if checkInvariants {
+		if vd.mount != nil {
+			panic("VFS.disconnectLocked called on disconnected mount")
+		}
+	}
 	mnt.storeKey(VirtualDentry{})
 	delete(vd.mount.children, mnt)
 	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
@@ -605,12 +688,34 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
 	return nil
 }
 
+func (mnt *Mount) readOnly() bool {
+	return atomic.LoadInt64(&mnt.writers) < 0
+}
+
 // Filesystem returns the mounted Filesystem. It does not take a reference on
 // the returned Filesystem.
 func (mnt *Mount) Filesystem() *Filesystem {
 	return mnt.fs
 }
 
+// submountsLocked returns this Mount and all Mounts that are descendents of
+// it.
+//
+// Precondition: mnt.vfs.mountMu must be held.
+func (mnt *Mount) submountsLocked() []*Mount {
+	mounts := []*Mount{mnt}
+	for m := range mnt.children {
+		mounts = append(mounts, m.submountsLocked()...)
+	}
+	return mounts
+}
+
+// Root returns the mount's root. It does not take a reference on the returned
+// Dentry.
+func (mnt *Mount) Root() *Dentry {
+	return mnt.root
+}
+
 // Root returns mntns' root. A reference is taken on the returned
 // VirtualDentry.
 func (mntns *MountNamespace) Root() VirtualDentry {
@@ -621,3 +726,178 @@ func (mntns *MountNamespace) Root() VirtualDentry {
 	vd.IncRef()
 	return vd
 }
+
+// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	rootMnt := taskRootDir.mount
+	mounts := rootMnt.submountsLocked()
+	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+	for _, mnt := range mounts {
+		// Get the path to this mount relative to task root.
+		mntRootVD := VirtualDentry{
+			mount:  mnt,
+			dentry: mnt.root,
+		}
+		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+		if err != nil {
+			// For some reason we didn't get a path. Log a warning
+			// and run with empty path.
+			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			path = ""
+		}
+		if path == "" {
+			// Either an error occurred, or path is not reachable
+			// from root.
+			break
+		}
+
+		opts := "rw"
+		if mnt.readOnly() {
+			opts = "ro"
+		}
+		if mnt.Flags.NoATime {
+			opts = ",noatime"
+		}
+		if mnt.Flags.NoExec {
+			opts += ",noexec"
+		}
+
+		// Format:
+		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+		//
+		// The "needs dump" and "fsck order" flags are always 0, which
+		// is allowed.
+		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
+	}
+}
+
+// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
+// buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	rootMnt := taskRootDir.mount
+	mounts := rootMnt.submountsLocked()
+	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+	for _, mnt := range mounts {
+		// Get the path to this mount relative to task root.
+		mntRootVD := VirtualDentry{
+			mount:  mnt,
+			dentry: mnt.root,
+		}
+		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+		if err != nil {
+			// For some reason we didn't get a path. Log a warning
+			// and run with empty path.
+			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			path = ""
+		}
+		if path == "" {
+			// Either an error occurred, or path is not reachable
+			// from root.
+			break
+		}
+		// Stat the mount root to get the major/minor device numbers.
+		pop := &PathOperation{
+			Root:  mntRootVD,
+			Start: mntRootVD,
+		}
+		statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+		if err != nil {
+			// Well that's not good. Ignore this mount.
+			break
+		}
+
+		// Format:
+		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
+
+		// (1) Mount ID.
+		fmt.Fprintf(buf, "%d ", mnt.ID)
+
+		// (2)  Parent ID (or this ID if there is no parent).
+		pID := mnt.ID
+		if p := mnt.parent(); p != nil {
+			pID = p.ID
+		}
+		fmt.Fprintf(buf, "%d ", pID)
+
+		// (3) Major:Minor device ID. We don't have a superblock, so we
+		// just use the root inode device number.
+		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
+
+		// (4) Root: the pathname of the directory in the filesystem
+		// which forms the root of this mount.
+		//
+		// NOTE(b/78135857): This will always be "/" until we implement
+		// bind mounts.
+		fmt.Fprintf(buf, "/ ")
+
+		// (5) Mount point (relative to process root).
+		fmt.Fprintf(buf, "%s ", manglePath(path))
+
+		// (6) Mount options.
+		opts := "rw"
+		if mnt.readOnly() {
+			opts = "ro"
+		}
+		if mnt.Flags.NoATime {
+			opts = ",noatime"
+		}
+		if mnt.Flags.NoExec {
+			opts += ",noexec"
+		}
+		fmt.Fprintf(buf, "%s ", opts)
+
+		// (7) Optional fields: zero or more fields of the form "tag[:value]".
+		// (8) Separator: the end of the optional fields is marked by a single hyphen.
+		fmt.Fprintf(buf, "- ")
+
+		// (9) Filesystem type.
+		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
+
+		// (10) Mount source: filesystem-specific information or "none".
+		fmt.Fprintf(buf, "none ")
+
+		// (11) Superblock options, and final newline.
+		fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
+	}
+}
+
+// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
+// See Linux fs/seq_file.c:mangle_path.
+func manglePath(p string) string {
+	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
+	return r.Replace(p)
+}
+
+// superBlockOpts returns the super block options string for the the mount at
+// the given path.
+func superBlockOpts(mountPath string, mnt *Mount) string {
+	// gVisor doesn't (yet) have a concept of super block options, so we
+	// use the ro/rw bit from the mount flag.
+	opts := "rw"
+	if mnt.readOnly() {
+		opts = "ro"
+	}
+
+	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
+	// the cgroup name in the options. For now we just read that from the
+	// path.
+	//
+	// TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
+	// should get this value from the cgroup itself, and not rely on the
+	// path.
+	if mnt.fs.FilesystemType().Name() == "cgroup" {
+		splitPath := strings.Split(mountPath, "/")
+		cgroupType := splitPath[len(splitPath)-1]
+		opts += "," + cgroupType
+	}
+	return opts
+}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3b933468d..3335e4057 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -55,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) {
 	}
 }
 
-// TODO: concurrent lookup/insertion/removal
+// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal.
 
 // must be powers of 2
 var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index bd90d36c4..70f850ca4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -26,6 +26,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
+	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -64,6 +65,8 @@ func (mnt *Mount) storeKey(vd VirtualDentry) {
 // (provided mutation is sufficiently uncommon).
 //
 // mountTable.Init() must be called on new mountTables before use.
+//
+// +stateify savable
 type mountTable struct {
 	// mountTable is implemented as a seqcount-protected hash table that
 	// resolves collisions with linear probing, featuring Robin Hood insertion
@@ -75,8 +78,8 @@ type mountTable struct {
 	// intrinsics and inline assembly, limiting the performance of this
 	// approach.)
 
-	seq  sync.SeqCount
-	seed uint32 // for hashing keys
+	seq  sync.SeqCount `state:"nosave"`
+	seed uint32        // for hashing keys
 
 	// size holds both length (number of elements) and capacity (number of
 	// slots): capacity is stored as its base-2 log (referred to as order) in
@@ -89,7 +92,7 @@ type mountTable struct {
 	// length and cap in separate uint32s) for ~free.
 	size uint64
 
-	slots unsafe.Pointer // []mountSlot; never nil after Init
+	slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
 }
 
 type mountSlot struct {
@@ -158,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer {
 // Lookup may be called even if there are concurrent mutators of mt.
 func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
 	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
-	hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+	hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
 
 loop:
 	for {
@@ -359,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
 
 //go:linkname rand32 runtime.fastrand
 func rand32() uint32
-
-// This is copy/pasted from runtime.noescape(), and is needed because arguments
-// apparently escape from all functions defined by linkname.
-//
-//go:nosplit
-func noescape(p unsafe.Pointer) unsafe.Pointer {
-	x := uintptr(p)
-	return unsafe.Pointer(x ^ 0)
-}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index fdf8be157..f223aeda8 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -16,6 +16,7 @@ package vfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 )
 
 // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
@@ -32,6 +33,25 @@ type GetDentryOptions struct {
 type MkdirOptions struct {
 	// Mode is the file mode bits for the created directory.
 	Mode linux.FileMode
+
+	// If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create
+	// the given directory in memory only (as opposed to persistent storage).
+	// The created directory should be able to support the creation of
+	// subdirectories with ForSyntheticMountpoint == true. It does not need to
+	// support the creation of subdirectories with ForSyntheticMountpoint ==
+	// false, or files of other types.
+	//
+	// FilesystemImpls are permitted to ignore the ForSyntheticMountpoint
+	// option.
+	//
+	// The ForSyntheticMountpoint option exists because, unlike mount(2), the
+	// OCI Runtime Specification permits the specification of mount points that
+	// do not exist, under the expectation that container runtimes will create
+	// them. (More accurately, the OCI Runtime Specification completely fails
+	// to document this feature, but it's implemented by runc.)
+	// ForSyntheticMountpoint allows such mount points to be created even when
+	// the underlying persistent filesystem is immutable.
+	ForSyntheticMountpoint bool
 }
 
 // MknodOptions contains options to VirtualFilesystem.MknodAt() and
@@ -44,10 +64,31 @@ type MknodOptions struct {
 	// DevMinor are the major and minor device numbers for the created device.
 	DevMajor uint32
 	DevMinor uint32
+
+	// Endpoint is the endpoint to bind to the created file, if a socket file is
+	// being created for bind(2) on a Unix domain socket.
+	Endpoint transport.BoundEndpoint
+}
+
+// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+type MountFlags struct {
+	// NoExec is equivalent to MS_NOEXEC.
+	NoExec bool
+
+	// NoATime is equivalent to MS_NOATIME and indicates that the
+	// filesystem should not update access time in-place.
+	NoATime bool
 }
 
 // MountOptions contains options to VirtualFilesystem.MountAt().
 type MountOptions struct {
+	// Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+	Flags MountFlags
+
+	// ReadOnly is equivalent to MS_RDONLY.
+	ReadOnly bool
+
 	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
 	GetFilesystemOptions GetFilesystemOptions
 
@@ -61,7 +102,7 @@ type MountOptions struct {
 type OpenOptions struct {
 	// Flags contains access mode and flags as specified for open(2).
 	//
-	// FilesystemImpls is reponsible for implementing the following flags:
+	// FilesystemImpls are responsible for implementing the following flags:
 	// O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
 	// O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
 	// O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
@@ -75,7 +116,8 @@ type OpenOptions struct {
 
 	// FileExec is set when the file is being opened to be executed.
 	// VirtualFilesystem.OpenAt() checks that the caller has execute permissions
-	// on the file, and that the file is a regular file.
+	// on the file, that the file is a regular file, and that the mount doesn't
+	// have MS_NOEXEC set.
 	FileExec bool
 }
 
@@ -113,6 +155,38 @@ type SetStatOptions struct {
 	Stat linux.Statx
 }
 
+// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
+// and FilesystemImpl.BoundEndpointAt().
+type BoundEndpointOptions struct {
+	// Addr is the path of the file whose socket endpoint is being retrieved.
+	// It is generally irrelevant: most endpoints are stored at a dentry that
+	// was created through a bind syscall, so the path can be stored on creation.
+	// However, if the endpoint was created in FilesystemImpl.BoundEndpointAt(),
+	// then we may not know what the original bind address was.
+	//
+	// For example, if connect(2) is called with address "foo" which corresponds
+	// a remote named socket in goferfs, we need to generate an endpoint wrapping
+	// that file. In this case, we can use Addr to set the endpoint address to
+	// "foo". Note that Addr is only a best-effort attempt--we still do not know
+	// the exact address that was used on the remote fs to bind the socket (it
+	// may have been "foo", "./foo", etc.).
+	Addr string
+}
+
+// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
+// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
+// FileDescriptionImpl.Getxattr().
+type GetxattrOptions struct {
+	// Name is the name of the extended attribute to retrieve.
+	Name string
+
+	// Size is the maximum value size that the caller will tolerate. If the value
+	// is larger than size, getxattr methods may return ERANGE, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	Size uint64
+}
+
 // SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
 // FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
 // FileDescriptionImpl.Setxattr().
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index b318c681a..cd78d66bc 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -58,7 +58,7 @@ loop:
 		switch err.(type) {
 		case nil:
 			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
-				// GenericPrependPath() will have returned
+				// genericfstree.PrependPath() will have returned
 				// PrependPathAtVFSRootError in this case since it checks
 				// against vfsroot before mnt.root, but other implementations
 				// of FilesystemImpl.PrependPath() may return nil instead.
@@ -84,16 +84,59 @@ loop:
 		}
 	}
 	b.PrependByte('/')
-	if origD.IsDisowned() {
+	if origD.IsDead() {
 		b.AppendString(" (deleted)")
 	}
 	return b.String(), nil
 }
 
+// PathnameReachable returns an absolute pathname to vd, consistent with
+// Linux's __d_path() (as used by seq_path_root()). If vfsroot.Ok() and vd is
+// not reachable from vfsroot, such that seq_path_root() would return SEQ_SKIP
+// (causing the entire containing entry to be skipped), PathnameReachable
+// returns ("", nil).
+func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	b := getFSPathBuilder()
+	defer putFSPathBuilder(b)
+	haveRef := false
+	defer func() {
+		if haveRef {
+			vd.DecRef()
+		}
+	}()
+loop:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				break loop
+			}
+			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !nextVD.Ok() {
+				return "", nil
+			}
+			if haveRef {
+				vd.DecRef()
+			}
+			vd = nextVD
+			haveRef = true
+		case PrependPathAtVFSRootError:
+			break loop
+		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+			return "", nil
+		default:
+			return "", err
+		}
+	}
+	b.PrependByte('/')
+	return b.String(), nil
+}
+
 // PathnameForGetcwd returns an absolute pathname to vd, consistent with
 // Linux's sys_getcwd().
 func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
-	if vd.dentry.IsDisowned() {
+	if vd.dentry.IsDead() {
 		return "", syserror.ENOENT
 	}
 
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 8e250998a..f9647f90e 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -15,8 +15,12 @@
 package vfs
 
 import (
+	"math"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -25,9 +29,9 @@ type AccessTypes uint16
 
 // Bits in AccessTypes.
 const (
+	MayExec  AccessTypes = 1
+	MayWrite AccessTypes = 2
 	MayRead  AccessTypes = 4
-	MayWrite             = 2
-	MayExec              = 1
 )
 
 // OnlyRead returns true if access _only_ allows read.
@@ -52,16 +56,17 @@ func (a AccessTypes) MayExec() bool {
 
 // GenericCheckPermissions checks that creds has the given access rights on a
 // file with the given permissions, UID, and GID, subject to the rules of
-// fs/namei.c:generic_permission(). isDir is true if the file is a directory.
-func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+// fs/namei.c:generic_permission().
+func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
 	// Check permission bits.
-	perms := mode
+	perms := uint16(mode.Permissions())
 	if creds.EffectiveKUID == kuid {
 		perms >>= 6
 	} else if creds.InGroup(kgid) {
 		perms >>= 3
 	}
 	if uint16(ats)&perms == uint16(ats) {
+		// All permission bits match, access granted.
 		return nil
 	}
 
@@ -73,7 +78,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 	}
 	// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
 	// directories, and read arbitrary non-directory files.
-	if (isDir && !ats.MayWrite()) || ats.OnlyRead() {
+	if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() {
 		if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
 			return nil
 		}
@@ -81,7 +86,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
 	// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
 	// access to non-directory files, and execute access to non-directory files
 	// for which at least one execute bit is set.
-	if isDir || !ats.MayExec() || (mode&0111 != 0) {
+	if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) {
 		if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
 			return nil
 		}
@@ -147,7 +152,16 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
 // CheckSetStat checks that creds has permission to change the metadata of a
 // file with the given permissions, UID, and GID as specified by stat, subject
 // to the rules of Linux's fs/attr.c:setattr_prepare().
-func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+	if stat.Mask&linux.STATX_SIZE != 0 {
+		limit, err := CheckLimit(ctx, 0, int64(stat.Size))
+		if err != nil {
+			return err
+		}
+		if limit < int64(stat.Size) {
+			return syserror.ErrExceedsFileSizeLimit
+		}
+	}
 	if stat.Mask&linux.STATX_MODE != 0 {
 		if !CanActAsOwner(creds, kuid) {
 			return syserror.EPERM
@@ -177,11 +191,7 @@ func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid
 				(stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
 				return syserror.EPERM
 			}
-			// isDir is irrelevant in the following call to
-			// GenericCheckPermissions since ats == MayWrite means that
-			// CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE
-			// applies, regardless of isDir.
-			if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil {
+			if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
 				return err
 			}
 		}
@@ -205,3 +215,21 @@ func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
 func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
 	return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
 }
+
+// CheckLimit enforces file size rlimits. It returns error if the write
+// operation must not proceed. Otherwise it returns the max length allowed to
+// without violating the limit.
+func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
+	fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
+	if fileSizeLimit > math.MaxInt64 {
+		return size, nil
+	}
+	if offset >= int64(fileSizeLimit) {
+		return 0, syserror.ErrExceedsFileSizeLimit
+	}
+	remaining := int64(fileSizeLimit) - offset
+	if remaining < size {
+		return remaining, nil
+	}
+	return size, nil
+}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8a0b382f6..9d047ff88 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -29,7 +29,9 @@ import (
 //
 // From the perspective of FilesystemImpl methods, a ResolvingPath represents a
 // starting Dentry on the associated Filesystem (on which a reference is
-// already held) and a stream of path components relative to that Dentry.
+// already held), a stream of path components relative to that Dentry, and
+// elements of the invoking Context that are commonly required by
+// FilesystemImpl methods.
 //
 // ResolvingPath is loosely analogous to Linux's struct nameidata.
 type ResolvingPath struct {
@@ -228,7 +230,7 @@ func (rp *ResolvingPath) Advance() {
 		rp.pit = next
 	} else { // at end of path segment, continue with next one
 		rp.curPart--
-		rp.pit = rp.parts[rp.curPart-1]
+		rp.pit = rp.parts[rp.curPart]
 	}
 }
 
@@ -251,18 +253,17 @@ func (rp *ResolvingPath) relpathCommit() {
 	rp.origParts[rp.curPart] = rp.pit
 }
 
-// ResolveParent returns the VFS parent of d. It does not take a reference on
-// the returned Dentry.
-//
-// Preconditions: There are no concurrent mutators of d.
-//
-// Postconditions: If the returned error is nil, then the returned Dentry is
-// not nil.
-func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
-	var parent *Dentry
+// CheckRoot is called before resolving the parent of the Dentry d. If the
+// Dentry is contextually a VFS root, such that path resolution should treat
+// d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the
+// root of a non-root mount, such that path resolution should switch to another
+// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path
+// resolution should resolve d's parent normally, and CheckRoot returns (false,
+// nil).
+func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
 	if d == rp.root.dentry && rp.mount == rp.root.mount {
-		// At contextual VFS root.
-		parent = d
+		// At contextual VFS root (due to e.g. chroot(2)).
+		return true, nil
 	} else if d == rp.mount.root {
 		// At mount root ...
 		vd := rp.vfs.getMountpointAt(rp.mount, rp.root)
@@ -270,69 +271,49 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
 			// ... of non-root mount.
 			rp.nextMount = vd.mount
 			rp.nextStart = vd.dentry
-			return nil, resolveMountRootOrJumpError{}
+			return false, resolveMountRootOrJumpError{}
 		}
 		// ... of root mount.
-		parent = d
-	} else if d.parent == nil {
-		// At filesystem root.
-		parent = d
-	} else {
-		parent = d.parent
-	}
-	if parent.isMounted() {
-		if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil {
-			rp.nextMount = mnt
-			return nil, resolveMountPointError{}
-		}
+		return true, nil
 	}
-	return parent, nil
+	return false, nil
 }
 
-// ResolveChild returns the VFS child of d with the given name. It does not
-// take a reference on the returned Dentry. If no such child exists,
-// ResolveChild returns (nil, nil).
-//
-// Preconditions: There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) {
-	child := d.children[name]
-	if child == nil {
-		return nil, nil
+// CheckMount is called after resolving the parent or child of another Dentry
+// to d. If d is a mount point, such that path resolution should switch to
+// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount
+// returns nil.
+func (rp *ResolvingPath) CheckMount(d *Dentry) error {
+	if !d.isMounted() {
+		return nil
 	}
-	if child.isMounted() {
-		if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil {
-			rp.nextMount = mnt
-			return nil, resolveMountPointError{}
-		}
-	}
-	return child, nil
-}
-
-// ResolveComponent returns the Dentry reached by starting at d and resolving
-// the current path component in the stream represented by rp. It does not
-// advance the stream. It does not take a reference on the returned Dentry. If
-// no such Dentry exists, ResolveComponent returns (nil, nil).
-//
-// Preconditions: !rp.Done(). There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
-	switch pc := rp.Component(); pc {
-	case ".":
-		return d, nil
-	case "..":
-		return rp.ResolveParent(d)
-	default:
-		return rp.ResolveChild(d, pc)
+	if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil {
+		rp.nextMount = mnt
+		return resolveMountPointError{}
 	}
+	return nil
 }
 
 // ShouldFollowSymlink returns true if, supposing that the current path
 // component in pcs represents a symbolic link, the symbolic link should be
 // followed.
 //
+// If path is terminated with '/', the '/' is considered the last element and
+// any symlink before that is followed:
+//   - For most non-creating walks, the last path component is handled by
+//     fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte
+//     after the path component is non-NULL (which is only possible if it's '/')
+//     and the path component is of type LAST_NORM.
+//
+//   - For open/openat/openat2 without O_CREAT, the last path component is
+//     handled by fs/namei.c:do_last(), which does the same, though without the
+//     LAST_NORM check.
+//
 // Preconditions: !rp.Done().
 func (rp *ResolvingPath) ShouldFollowSymlink() bool {
-	// Non-final symlinks are always followed.
-	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final()
+	// Non-final symlinks are always followed. Paths terminated with '/' are also
+	// always followed.
+	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir()
 }
 
 // HandleSymlink is called when the current path component is a symbolic link
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 9629afee9..9acca8bc7 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -38,6 +38,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -46,11 +47,13 @@ import (
 //
 // There is no analogue to the VirtualFilesystem type in Linux, as the
 // equivalent state in Linux is global.
+//
+// +stateify savable
 type VirtualFilesystem struct {
 	// mountMu serializes mount mutations.
 	//
 	// mountMu is analogous to Linux's namespace_sem.
-	mountMu sync.Mutex
+	mountMu sync.Mutex `state:"nosave"`
 
 	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
 	// are uniquely namespaced, including mount parent in the key correctly
@@ -79,6 +82,10 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// lastMountID is the last allocated mount ID. lastMountID is accessed
+	// using atomic memory operations.
+	lastMountID uint64
+
 	// anonMount is a Mount, not included in mounts or mountpoints,
 	// representing an anonFilesystem. anonMount is used to back
 	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -89,56 +96,59 @@ type VirtualFilesystem struct {
 
 	// devices contains all registered Devices. devices is protected by
 	// devicesMu.
-	devicesMu sync.RWMutex
+	devicesMu sync.RWMutex `state:"nosave"`
 	devices   map[devTuple]*registeredDevice
 
 	// anonBlockDevMinor contains all allocated anonymous block device minor
 	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
 	// unallocated anonymous block device number. anonBlockDevMinorNext and
 	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
-	anonBlockDevMinorMu   sync.Mutex
+	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
 	anonBlockDevMinorNext uint32
 	anonBlockDevMinor     map[uint32]struct{}
 
 	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
 	// fsTypesMu.
-	fsTypesMu sync.RWMutex
+	fsTypesMu sync.RWMutex `state:"nosave"`
 	fsTypes   map[string]*registeredFilesystemType
 
 	// filesystems contains all Filesystems. filesystems is protected by
 	// filesystemsMu.
-	filesystemsMu sync.Mutex
+	filesystemsMu sync.Mutex `state:"nosave"`
 	filesystems   map[*Filesystem]struct{}
 }
 
-// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
-func New() *VirtualFilesystem {
-	vfs := &VirtualFilesystem{
-		mountpoints:           make(map[*Dentry]map[*Mount]struct{}),
-		devices:               make(map[devTuple]*registeredDevice),
-		anonBlockDevMinorNext: 1,
-		anonBlockDevMinor:     make(map[uint32]struct{}),
-		fsTypes:               make(map[string]*registeredFilesystemType),
-		filesystems:           make(map[*Filesystem]struct{}),
-	}
+// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
+func (vfs *VirtualFilesystem) Init() error {
+	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
+	vfs.devices = make(map[devTuple]*registeredDevice)
+	vfs.anonBlockDevMinorNext = 1
+	vfs.anonBlockDevMinor = make(map[uint32]struct{})
+	vfs.fsTypes = make(map[string]*registeredFilesystemType)
+	vfs.filesystems = make(map[*Filesystem]struct{})
 	vfs.mounts.Init()
 
 	// Construct vfs.anonMount.
 	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
 	if err != nil {
-		panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err))
+		// This shouldn't be possible since anonBlockDevMinorNext was
+		// initialized to 1 above (no device numbers have been allocated yet).
+		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
 	}
 	anonfs := anonFilesystem{
 		devMinor: anonfsDevMinor,
 	}
-	anonfs.vfsfs.Init(vfs, &anonfs)
-	vfs.anonMount = &Mount{
-		vfs:  vfs,
-		fs:   &anonfs.vfsfs,
-		refs: 1,
+	anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
+	defer anonfs.vfsfs.DecRef()
+	anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
+	if err != nil {
+		// We should not be passing any MountOptions that would cause
+		// construction of this mount to fail.
+		panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err))
 	}
+	vfs.anonMount = anonMount
 
-	return vfs
+	return nil
 }
 
 // PathOperation specifies the path operated on by a VFS method.
@@ -169,6 +179,23 @@ type PathOperation struct {
 	FollowFinalSymlink bool
 }
 
+// AccessAt checks whether a user with creds has access to the file at
+// the given path.
+func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
 // GetDentryAt returns a VirtualDentry representing the given path, at which a
 // file must exist. A reference is taken on the returned VirtualDentry.
 func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
@@ -208,7 +235,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -249,7 +276,7 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -285,7 +312,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -312,13 +339,13 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
-		if err != nil {
+		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -333,14 +360,11 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
 	// Remove:
 	//
-	// - O_LARGEFILE, which we always report in FileDescription status flags
-	// since only 64-bit architectures are supported at this time.
-	//
 	// - O_CLOEXEC, which affects file descriptors and therefore must be
 	// handled outside of VFS.
 	//
 	// - Unknown flags.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
 	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
 	if opts.Flags&linux.O_SYNC != 0 {
 		opts.Flags |= linux.O_DSYNC
@@ -380,24 +404,25 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 		if err == nil {
 			vfs.putResolvingPath(rp)
 
-			// TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call
-			// to FileDescription.Stat().
 			if opts.FileExec {
+				if fd.Mount().Flags.NoExec {
+					fd.DecRef()
+					return nil, syserror.EACCES
+				}
+
 				// Only a regular file can be executed.
 				stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
 				if err != nil {
+					fd.DecRef()
 					return nil, err
 				}
-				if stat.Mask&linux.STATX_TYPE != 0 {
-					// This shouldn't happen, but if type can't be retrieved, file can't
-					// be executed.
-					return nil, syserror.EACCES
-				}
-				if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular {
+				if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
+					fd.DecRef()
 					return nil, syserror.EACCES
 				}
 			}
 
+			fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent)
 			return fd, nil
 		}
 		if !rp.handleError(err) {
@@ -472,7 +497,7 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -505,7 +530,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -586,7 +611,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -618,7 +643,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 		}
 		if checkInvariants {
 			if rp.canHandleError(err) && rp.Done() {
-				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
 			}
 		}
 		if !rp.handleError(err) {
@@ -628,12 +653,39 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 	}
 }
 
+// BoundEndpointAt gets the bound endpoint at the given path, if one exists.
+func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return nil, syserror.ECONNREFUSED
+		}
+		return nil, syserror.ENOENT
+	}
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return bep, nil
+		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+			}
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
 // ListxattrAt returns all extended attribute names for the file at the given
 // path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return names, nil
@@ -655,10 +707,10 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
 
 // GetxattrAt returns the value associated with the given extended attribute
 // for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return val, nil
@@ -743,11 +795,21 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
 // VirtualDentry methods require that a reference is held on the VirtualDentry.
 //
 // VirtualDentry is analogous to Linux's struct path.
+//
+// +stateify savable
 type VirtualDentry struct {
 	mount  *Mount
 	dentry *Dentry
 }
 
+// MakeVirtualDentry creates a VirtualDentry.
+func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry {
+	return VirtualDentry{
+		mount:  mount,
+		dentry: dentry,
+	}
+}
+
 // Ok returns true if vd is not empty. It does not require that a reference is
 // held.
 func (vd VirtualDentry) Ok() bool {
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index bfb2fac26..e2894f9f5 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -77,7 +77,10 @@ var DefaultOpts = Opts{
 // trigger it.
 const descheduleThreshold = 1 * time.Second
 
-var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+var (
+	stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout")
+	stuckTasks   = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+)
 
 // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
 var stackDumpSameTaskPeriod = time.Minute
@@ -220,8 +223,11 @@ func (w *Watchdog) waitForStart() {
 		// We are fine.
 		return
 	}
+
+	stuckStartup.Increment()
+
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog.Start() not called within %s:\n")
+	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
 	w.doAction(w.StartupTimeoutAction, false, &buf)
 }
 
@@ -255,7 +261,7 @@ func (w *Watchdog) runTurn() {
 	case <-done:
 	case <-time.After(w.TaskTimeout):
 		// Report if the watchdog is not making progress.
-		// No one is wathching the watchdog watcher though.
+		// No one is watching the watchdog watcher though.
 		w.reportStuckWatchdog()
 		<-done
 	}
@@ -317,28 +323,28 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 
 	buf.WriteString("Search for '(*Task).run(0x..., 0x<tid>)' in the stack dump to find the offending goroutine")
 
-	// Dump stack only if a new task is detected or if it sometime has
-	// passed since the last time a stack dump was generated.
-	skipStack := newTaskFound || time.Since(w.lastStackDump) >= stackDumpSameTaskPeriod
-	w.doAction(w.TaskTimeoutAction, skipStack, &buf)
+	// Force stack dump only if a new task is detected.
+	w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
 }
 
 func (w *Watchdog) reportStuckWatchdog() {
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog goroutine is stuck:\n")
+	buf.WriteString("Watchdog goroutine is stuck:")
 	w.doAction(w.TaskTimeoutAction, false, &buf)
 }
 
-// doAction will take the given action. If the action is LogWarnind and
-// skipStack is true, then the stack printing will be skipped.
-func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) {
+// doAction will take the given action. If the action is LogWarning, the stack
+// is not always dumped to the log to prevent log flooding. "forceStack"
+// guarantees that the stack will be dumped regardless.
+func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
 	switch action {
 	case LogWarning:
-		if skipStack {
+		// Dump stack only if forced or sometime has passed since the last time a
+		// stack dump was generated.
+		if !forceStack && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod {
 			msg.WriteString("\n...[stack dump skipped]...")
 			log.Warningf(msg.String())
 			return
-
 		}
 		log.TracebackAll(msg.String())
 		w.lastStackDump = time.Now()
@@ -359,7 +365,8 @@ func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) {
 		case <-metricsEmitted:
 		case <-time.After(1 * time.Second):
 		}
-		panic(fmt.Sprintf("Stack for running G's are skipped while panicking.\n%s", msg.String()))
+		panic(fmt.Sprintf("%s\nStack for running G's are skipped while panicking.", msg.String()))
+
 	default:
 		panic(fmt.Sprintf("Unknown watchdog action %v", action))
 
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index 65bfcf778..f68c12620 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.11
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/state/BUILD b/pkg/state/BUILD
index 921af9d63..2b1350135 100644
--- a/pkg/state/BUILD
+++ b/pkg/state/BUILD
@@ -47,6 +47,7 @@ go_library(
         "state.go",
         "stats.go",
     ],
+    marshal = False,
     stateify = False,
     visibility = ["//:sandbox"],
     deps = [
diff --git a/pkg/state/state.go b/pkg/state/state.go
index dbe507ab4..03ae2dbb0 100644
--- a/pkg/state/state.go
+++ b/pkg/state/state.go
@@ -241,10 +241,7 @@ func Register(name string, instance interface{}, fns Fns) {
 //
 // This function is used by the stateify tool.
 func IsZeroValue(val interface{}) bool {
-	if val == nil {
-		return true
-	}
-	return reflect.DeepEqual(val, reflect.Zero(reflect.TypeOf(val)).Interface())
+	return val == nil || reflect.ValueOf(val).Elem().IsZero()
 }
 
 // step captures one encoding / decoding step. On each step, there is up to one
diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 5340cf0d6..d0d77e19c 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -31,23 +31,25 @@ go_library(
     name = "sync",
     srcs = [
         "aliases.go",
-        "downgradable_rwmutex_unsafe.go",
         "memmove_unsafe.go",
+        "mutex_unsafe.go",
         "norace_unsafe.go",
         "race_unsafe.go",
+        "rwmutex_unsafe.go",
         "seqcount.go",
-        "syncutil.go",
-        "tmutex_unsafe.go",
+        "sync.go",
     ],
+    marshal = False,
+    stateify = False,
 )
 
 go_test(
     name = "sync_test",
     size = "small",
     srcs = [
-        "downgradable_rwmutex_test.go",
+        "mutex_test.go",
+        "rwmutex_test.go",
         "seqcount_test.go",
-        "tmutex_test.go",
     ],
     library = ":sync",
 )
diff --git a/pkg/sync/aliases.go b/pkg/sync/aliases.go
index d2d7132fa..0d4316254 100644
--- a/pkg/sync/aliases.go
+++ b/pkg/sync/aliases.go
@@ -29,3 +29,8 @@ type (
 	// Map is an alias of sync.Map.
 	Map = sync.Map
 )
+
+// NewCond is a wrapper around sync.NewCond.
+func NewCond(l Locker) *Cond {
+	return sync.NewCond(l)
+}
diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go
index ad4a3a37e..1d7780695 100644
--- a/pkg/sync/memmove_unsafe.go
+++ b/pkg/sync/memmove_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sync/tmutex_test.go b/pkg/sync/mutex_test.go
index 0838248b4..0838248b4 100644
--- a/pkg/sync/tmutex_test.go
+++ b/pkg/sync/mutex_test.go
diff --git a/pkg/sync/tmutex_unsafe.go b/pkg/sync/mutex_unsafe.go
index 3dd15578b..dc034d561 100644
--- a/pkg/sync/tmutex_unsafe.go
+++ b/pkg/sync/mutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.15
+// +build !go1.16
 
 // When updating the build constraint (above), check that syncMutex matches the
 // standard library sync.Mutex definition.
diff --git a/pkg/sync/downgradable_rwmutex_test.go b/pkg/sync/rwmutex_test.go
index ce667e825..ce667e825 100644
--- a/pkg/sync/downgradable_rwmutex_test.go
+++ b/pkg/sync/rwmutex_test.go
diff --git a/pkg/sync/downgradable_rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go
index ea6cdc447..995c0346e 100644
--- a/pkg/sync/downgradable_rwmutex_unsafe.go
+++ b/pkg/sync/rwmutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sync/syncutil.go b/pkg/sync/sync.go
index b16cf5333..b16cf5333 100644
--- a/pkg/sync/syncutil.go
+++ b/pkg/sync/sync.go
diff --git a/pkg/syncevent/BUILD b/pkg/syncevent/BUILD
new file mode 100644
index 000000000..0500a22cf
--- /dev/null
+++ b/pkg/syncevent/BUILD
@@ -0,0 +1,39 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "syncevent",
+    srcs = [
+        "broadcaster.go",
+        "receiver.go",
+        "source.go",
+        "syncevent.go",
+        "waiter_amd64.s",
+        "waiter_arm64.s",
+        "waiter_asm_unsafe.go",
+        "waiter_noasm_unsafe.go",
+        "waiter_unsafe.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/atomicbitops",
+        "//pkg/sync",
+    ],
+)
+
+go_test(
+    name = "syncevent_test",
+    size = "small",
+    srcs = [
+        "broadcaster_test.go",
+        "syncevent_example_test.go",
+        "waiter_test.go",
+    ],
+    library = ":syncevent",
+    deps = [
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/syncevent/broadcaster.go b/pkg/syncevent/broadcaster.go
new file mode 100644
index 000000000..4bff59e7d
--- /dev/null
+++ b/pkg/syncevent/broadcaster.go
@@ -0,0 +1,218 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Broadcaster is an implementation of Source that supports any number of
+// subscribed Receivers.
+//
+// The zero value of Broadcaster is valid and has no subscribed Receivers.
+// Broadcaster is not copyable by value.
+//
+// All Broadcaster methods may be called concurrently from multiple goroutines.
+type Broadcaster struct {
+	// Broadcaster is implemented as a hash table where keys are assigned by
+	// the Broadcaster and returned as SubscriptionIDs, making it safe to use
+	// the identity function for hashing. The hash table resolves collisions
+	// using linear probing and features Robin Hood insertion and backward
+	// shift deletion in order to support a relatively high load factor
+	// efficiently, which matters since the cost of Broadcast is linear in the
+	// size of the table.
+
+	// mu protects the following fields.
+	mu sync.Mutex
+
+	// Invariants: len(table) is 0 or a power of 2.
+	table []broadcasterSlot
+
+	// load is the number of entries in table with receiver != nil.
+	load int
+
+	lastID SubscriptionID
+}
+
+type broadcasterSlot struct {
+	// Invariants: If receiver == nil, then filter == NoEvents and id == 0.
+	// Otherwise, id != 0.
+	receiver *Receiver
+	filter   Set
+	id       SubscriptionID
+}
+
+const (
+	broadcasterMinNonZeroTableSize = 2 // must be a power of 2 > 1
+
+	broadcasterMaxLoadNum = 13
+	broadcasterMaxLoadDen = 16
+)
+
+// SubscribeEvents implements Source.SubscribeEvents.
+func (b *Broadcaster) SubscribeEvents(r *Receiver, filter Set) SubscriptionID {
+	b.mu.Lock()
+
+	// Assign an ID for this subscription.
+	b.lastID++
+	id := b.lastID
+
+	// Expand the table if over the maximum load factor:
+	//
+	//          load / len(b.table) > broadcasterMaxLoadNum / broadcasterMaxLoadDen
+	// load * broadcasterMaxLoadDen > broadcasterMaxLoadNum * len(b.table)
+	b.load++
+	if (b.load * broadcasterMaxLoadDen) > (broadcasterMaxLoadNum * len(b.table)) {
+		// Double the number of slots in the new table.
+		newlen := broadcasterMinNonZeroTableSize
+		if len(b.table) != 0 {
+			newlen = 2 * len(b.table)
+		}
+		if newlen <= cap(b.table) {
+			// Reuse excess capacity in the current table, moving entries not
+			// already in their first-probed positions to better ones.
+			newtable := b.table[:newlen]
+			newmask := uint64(newlen - 1)
+			for i := range b.table {
+				if b.table[i].receiver != nil && uint64(b.table[i].id)&newmask != uint64(i) {
+					entry := b.table[i]
+					b.table[i] = broadcasterSlot{}
+					broadcasterTableInsert(newtable, entry.id, entry.receiver, entry.filter)
+				}
+			}
+			b.table = newtable
+		} else {
+			newtable := make([]broadcasterSlot, newlen)
+			// Copy existing entries to the new table.
+			for i := range b.table {
+				if b.table[i].receiver != nil {
+					broadcasterTableInsert(newtable, b.table[i].id, b.table[i].receiver, b.table[i].filter)
+				}
+			}
+			// Switch to the new table.
+			b.table = newtable
+		}
+	}
+
+	broadcasterTableInsert(b.table, id, r, filter)
+	b.mu.Unlock()
+	return id
+}
+
+// Preconditions: table must not be full. len(table) is a power of 2.
+func broadcasterTableInsert(table []broadcasterSlot, id SubscriptionID, r *Receiver, filter Set) {
+	entry := broadcasterSlot{
+		receiver: r,
+		filter:   filter,
+		id:       id,
+	}
+	mask := uint64(len(table) - 1)
+	i := uint64(id) & mask
+	disp := uint64(0)
+	for {
+		if table[i].receiver == nil {
+			table[i] = entry
+			return
+		}
+		// If we've been displaced farther from our first-probed slot than the
+		// element stored in this one, swap elements and switch to inserting
+		// the replaced one. (This is Robin Hood insertion.)
+		slotDisp := (i - uint64(table[i].id)) & mask
+		if disp > slotDisp {
+			table[i], entry = entry, table[i]
+			disp = slotDisp
+		}
+		i = (i + 1) & mask
+		disp++
+	}
+}
+
+// UnsubscribeEvents implements Source.UnsubscribeEvents.
+func (b *Broadcaster) UnsubscribeEvents(id SubscriptionID) {
+	b.mu.Lock()
+
+	mask := uint64(len(b.table) - 1)
+	i := uint64(id) & mask
+	for {
+		if b.table[i].id == id {
+			// Found the element to remove. Move all subsequent elements
+			// backward until we either find an empty slot, or an element that
+			// is already in its first-probed slot. (This is backward shift
+			// deletion.)
+			for {
+				next := (i + 1) & mask
+				if b.table[next].receiver == nil {
+					break
+				}
+				if uint64(b.table[next].id)&mask == next {
+					break
+				}
+				b.table[i] = b.table[next]
+				i = next
+			}
+			b.table[i] = broadcasterSlot{}
+			break
+		}
+		i = (i + 1) & mask
+	}
+
+	// If a table 1/4 of the current size would still be at or under the
+	// maximum load factor (i.e. the current table size is at least two
+	// expansions bigger than necessary), halve the size of the table to reduce
+	// the cost of Broadcast. Since we are concerned with iteration time and
+	// not memory usage, reuse the existing slice to reduce future allocations
+	// from table re-expansion.
+	b.load--
+	if len(b.table) > broadcasterMinNonZeroTableSize && (b.load*(4*broadcasterMaxLoadDen)) <= (broadcasterMaxLoadNum*len(b.table)) {
+		newlen := len(b.table) / 2
+		newtable := b.table[:newlen]
+		for i := newlen; i < len(b.table); i++ {
+			if b.table[i].receiver != nil {
+				broadcasterTableInsert(newtable, b.table[i].id, b.table[i].receiver, b.table[i].filter)
+				b.table[i] = broadcasterSlot{}
+			}
+		}
+		b.table = newtable
+	}
+
+	b.mu.Unlock()
+}
+
+// Broadcast notifies all Receivers subscribed to the Broadcaster of the subset
+// of events to which they subscribed. The order in which Receivers are
+// notified is unspecified.
+func (b *Broadcaster) Broadcast(events Set) {
+	b.mu.Lock()
+	for i := range b.table {
+		if intersection := events & b.table[i].filter; intersection != 0 {
+			// We don't need to check if broadcasterSlot.receiver is nil, since
+			// if it is then broadcasterSlot.filter is 0.
+			b.table[i].receiver.Notify(intersection)
+		}
+	}
+	b.mu.Unlock()
+}
+
+// FilteredEvents returns the set of events for which Broadcast will notify at
+// least one Receiver, i.e. the union of filters for all subscribed Receivers.
+func (b *Broadcaster) FilteredEvents() Set {
+	var es Set
+	b.mu.Lock()
+	for i := range b.table {
+		es |= b.table[i].filter
+	}
+	b.mu.Unlock()
+	return es
+}
diff --git a/pkg/syncevent/broadcaster_test.go b/pkg/syncevent/broadcaster_test.go
new file mode 100644
index 000000000..e88779e23
--- /dev/null
+++ b/pkg/syncevent/broadcaster_test.go
@@ -0,0 +1,376 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"fmt"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func TestBroadcasterFilter(t *testing.T) {
+	const numReceivers = 2 * MaxEvents
+
+	var br Broadcaster
+	ws := make([]Waiter, numReceivers)
+	for i := range ws {
+		ws[i].Init()
+		br.SubscribeEvents(ws[i].Receiver(), 1<<(i%MaxEvents))
+	}
+	for ev := 0; ev < MaxEvents; ev++ {
+		br.Broadcast(1 << ev)
+		for i := range ws {
+			want := NoEvents
+			if i%MaxEvents == ev {
+				want = 1 << ev
+			}
+			if got := ws[i].Receiver().PendingAndAckAll(); got != want {
+				t.Errorf("after Broadcast of event %d: waiter %d has pending event set %#x, wanted %#x", ev, i, got, want)
+			}
+		}
+	}
+}
+
+// TestBroadcasterManySubscriptions tests that subscriptions are not lost by
+// table expansion/compaction.
+func TestBroadcasterManySubscriptions(t *testing.T) {
+	const numReceivers = 5000 // arbitrary
+
+	var br Broadcaster
+	ws := make([]Waiter, numReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+
+	ids := make([]SubscriptionID, numReceivers)
+	for i := 0; i < numReceivers; i++ {
+		// Subscribe receiver i.
+		ids[i] = br.SubscribeEvents(ws[i].Receiver(), 1)
+		// Check that receivers [0, i] are subscribed.
+		br.Broadcast(1)
+		for j := 0; j <= i; j++ {
+			if ws[j].Pending() != 1 {
+				t.Errorf("receiver %d did not receive an event after subscription of receiver %d", j, i)
+			}
+			ws[j].Ack(1)
+		}
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numReceivers)
+	for i := 0; i < numReceivers; i++ {
+		// Unsubscribe receiver unsub[i].
+		br.UnsubscribeEvents(ids[unsub[i]])
+		// Check that receivers [unsub[0], unsub[i]] are not subscribed, and that
+		// receivers (unsub[i], unsub[numReceivers]) are still subscribed.
+		br.Broadcast(1)
+		for j := 0; j <= i; j++ {
+			if ws[unsub[j]].Pending() != 0 {
+				t.Errorf("unsub iteration %d: receiver %d received an event after unsubscription of receiver %d", i, unsub[j], unsub[i])
+			}
+		}
+		for j := i + 1; j < numReceivers; j++ {
+			if ws[unsub[j]].Pending() != 1 {
+				t.Errorf("unsub iteration %d: receiver %d did not receive an event after unsubscription of receiver %d", i, unsub[j], unsub[i])
+			}
+			ws[unsub[j]].Ack(1)
+		}
+	}
+}
+
+var (
+	receiverCountsNonZero       = []int{1, 4, 16, 64}
+	receiverCountsIncludingZero = append([]int{0}, receiverCountsNonZero...)
+)
+
+// BenchmarkBroadcasterX, BenchmarkMapX, and BenchmarkQueueX benchmark usage
+// pattern X (described in terms of Broadcaster) with Broadcaster, a
+// Mutex-protected map[*Receiver]Set, and waiter.Queue respectively.
+
+// BenchmarkXxxSubscribeUnsubscribe measures the cost of a Subscribe/Unsubscribe
+// cycle.
+
+func BenchmarkBroadcasterSubscribeUnsubscribe(b *testing.B) {
+	var br Broadcaster
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		id := br.SubscribeEvents(w.Receiver(), 1)
+		br.UnsubscribeEvents(id)
+	}
+}
+
+func BenchmarkMapSubscribeUnsubscribe(b *testing.B) {
+	var mu sync.Mutex
+	m := make(map[*Receiver]Set)
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		mu.Lock()
+		m[w.Receiver()] = Set(1)
+		mu.Unlock()
+		mu.Lock()
+		delete(m, w.Receiver())
+		mu.Unlock()
+	}
+}
+
+func BenchmarkQueueSubscribeUnsubscribe(b *testing.B) {
+	var q waiter.Queue
+	e, _ := waiter.NewChannelEntry(nil)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		q.EventRegister(&e, 1)
+		q.EventUnregister(&e)
+	}
+}
+
+// BenchmarkXxxSubscribeUnsubscribeBatch is similar to
+// BenchmarkXxxSubscribeUnsubscribe, but subscribes and unsubscribes a large
+// number of Receivers at a time in order to measure the amortized overhead of
+// table expansion/compaction. (Since waiter.Queue is implemented using a
+// linked list, BenchmarkQueueSubscribeUnsubscribe and
+// BenchmarkQueueSubscribeUnsubscribeBatch should produce nearly the same
+// result.)
+
+const numBatchReceivers = 1000
+
+func BenchmarkBroadcasterSubscribeUnsubscribeBatch(b *testing.B) {
+	var br Broadcaster
+	ws := make([]Waiter, numBatchReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+	ids := make([]SubscriptionID, numBatchReceivers)
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			ids[j] = br.SubscribeEvents(ws[j].Receiver(), 1)
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			br.UnsubscribeEvents(ids[unsub[j]])
+		}
+	}
+}
+
+func BenchmarkMapSubscribeUnsubscribeBatch(b *testing.B) {
+	var mu sync.Mutex
+	m := make(map[*Receiver]Set)
+	ws := make([]Waiter, numBatchReceivers)
+	for i := range ws {
+		ws[i].Init()
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			mu.Lock()
+			m[ws[j].Receiver()] = Set(1)
+			mu.Unlock()
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			mu.Lock()
+			delete(m, ws[unsub[j]].Receiver())
+			mu.Unlock()
+		}
+	}
+}
+
+func BenchmarkQueueSubscribeUnsubscribeBatch(b *testing.B) {
+	var q waiter.Queue
+	es := make([]waiter.Entry, numBatchReceivers)
+	for i := range es {
+		es[i], _ = waiter.NewChannelEntry(nil)
+	}
+
+	// Generate a random order for unsubscriptions.
+	unsub := rand.Perm(numBatchReceivers)
+
+	b.ResetTimer()
+	for i := 0; i < b.N/numBatchReceivers; i++ {
+		for j := 0; j < numBatchReceivers; j++ {
+			q.EventRegister(&es[j], 1)
+		}
+		for j := 0; j < numBatchReceivers; j++ {
+			q.EventUnregister(&es[unsub[j]])
+		}
+	}
+}
+
+// BenchmarkXxxBroadcastRedundant measures how long it takes to Broadcast
+// already-pending events to multiple Receivers.
+
+func BenchmarkBroadcasterBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var br Broadcaster
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				br.SubscribeEvents(ws[i].Receiver(), 1)
+			}
+			br.Broadcast(1)
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				br.Broadcast(1)
+			}
+		})
+	}
+}
+
+func BenchmarkMapBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var mu sync.Mutex
+			m := make(map[*Receiver]Set)
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				m[ws[i].Receiver()] = Set(1)
+			}
+			mu.Lock()
+			for r := range m {
+				r.Notify(1)
+			}
+			mu.Unlock()
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				mu.Lock()
+				for r := range m {
+					r.Notify(1)
+				}
+				mu.Unlock()
+			}
+		})
+	}
+}
+
+func BenchmarkQueueBroadcastRedundant(b *testing.B) {
+	for _, n := range receiverCountsIncludingZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var q waiter.Queue
+			for i := 0; i < n; i++ {
+				e, _ := waiter.NewChannelEntry(nil)
+				q.EventRegister(&e, 1)
+			}
+			q.Notify(1)
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				q.Notify(1)
+			}
+		})
+	}
+}
+
+// BenchmarkXxxBroadcastAck measures how long it takes to Broadcast events to
+// multiple Receivers, check that all Receivers have received the event, and
+// clear the event from all Receivers.
+
+func BenchmarkBroadcasterBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var br Broadcaster
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				br.SubscribeEvents(ws[i].Receiver(), 1)
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				br.Broadcast(1)
+				for j := range ws {
+					if got, want := ws[j].Pending(), Set(1); got != want {
+						b.Fatalf("Receiver.Pending(): got %#x, wanted %#x", got, want)
+					}
+					ws[j].Ack(1)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkMapBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var mu sync.Mutex
+			m := make(map[*Receiver]Set)
+			ws := make([]Waiter, n)
+			for i := range ws {
+				ws[i].Init()
+				m[ws[i].Receiver()] = Set(1)
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				mu.Lock()
+				for r := range m {
+					r.Notify(1)
+				}
+				mu.Unlock()
+				for j := range ws {
+					if got, want := ws[j].Pending(), Set(1); got != want {
+						b.Fatalf("Receiver.Pending(): got %#x, wanted %#x", got, want)
+					}
+					ws[j].Ack(1)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkQueueBroadcastAck(b *testing.B) {
+	for _, n := range receiverCountsNonZero {
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			var q waiter.Queue
+			chs := make([]chan struct{}, n)
+			for i := range chs {
+				e, ch := waiter.NewChannelEntry(nil)
+				q.EventRegister(&e, 1)
+				chs[i] = ch
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				q.Notify(1)
+				for _, ch := range chs {
+					select {
+					case <-ch:
+					default:
+						b.Fatalf("channel did not receive event")
+					}
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/syncevent/receiver.go b/pkg/syncevent/receiver.go
new file mode 100644
index 000000000..5c86e5400
--- /dev/null
+++ b/pkg/syncevent/receiver.go
@@ -0,0 +1,103 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/atomicbitops"
+)
+
+// Receiver is an event sink that holds pending events and invokes a callback
+// whenever new events become pending. Receiver's methods may be called
+// concurrently from multiple goroutines.
+//
+// Receiver.Init() must be called before first use.
+type Receiver struct {
+	// pending is the set of pending events. pending is accessed using atomic
+	// memory operations.
+	pending uint64
+
+	// cb is notified when new events become pending. cb is immutable after
+	// Init().
+	cb ReceiverCallback
+}
+
+// ReceiverCallback receives callbacks from a Receiver.
+type ReceiverCallback interface {
+	// NotifyPending is called when the corresponding Receiver has new pending
+	// events.
+	//
+	// NotifyPending is called synchronously from Receiver.Notify(), so
+	// implementations must not take locks that may be held by callers of
+	// Receiver.Notify(). NotifyPending may be called concurrently from
+	// multiple goroutines.
+	NotifyPending()
+}
+
+// Init must be called before first use of r.
+func (r *Receiver) Init(cb ReceiverCallback) {
+	r.cb = cb
+}
+
+// Pending returns the set of pending events.
+func (r *Receiver) Pending() Set {
+	return Set(atomic.LoadUint64(&r.pending))
+}
+
+// Notify sets the given events as pending.
+func (r *Receiver) Notify(es Set) {
+	p := Set(atomic.LoadUint64(&r.pending))
+	// Optimization: Skip the atomic CAS on r.pending if all events are
+	// already pending.
+	if p&es == es {
+		return
+	}
+	// When this is uncontended (the common case), CAS is faster than
+	// atomic-OR because the former is inlined and the latter (which we
+	// implement in assembly ourselves) is not.
+	if !atomic.CompareAndSwapUint64(&r.pending, uint64(p), uint64(p|es)) {
+		// If the CAS fails, fall back to atomic-OR.
+		atomicbitops.OrUint64(&r.pending, uint64(es))
+	}
+	r.cb.NotifyPending()
+}
+
+// Ack unsets the given events as pending.
+func (r *Receiver) Ack(es Set) {
+	p := Set(atomic.LoadUint64(&r.pending))
+	// Optimization: Skip the atomic CAS on r.pending if all events are
+	// already not pending.
+	if p&es == 0 {
+		return
+	}
+	// When this is uncontended (the common case), CAS is faster than
+	// atomic-AND because the former is inlined and the latter (which we
+	// implement in assembly ourselves) is not.
+	if !atomic.CompareAndSwapUint64(&r.pending, uint64(p), uint64(p&^es)) {
+		// If the CAS fails, fall back to atomic-AND.
+		atomicbitops.AndUint64(&r.pending, ^uint64(es))
+	}
+}
+
+// PendingAndAckAll unsets all events as pending and returns the set of
+// previously-pending events.
+//
+// PendingAndAckAll should only be used in preference to a call to Pending
+// followed by a conditional call to Ack when the caller expects events to be
+// pending (e.g. after a call to ReceiverCallback.NotifyPending()).
+func (r *Receiver) PendingAndAckAll() Set {
+	return Set(atomic.SwapUint64(&r.pending, 0))
+}
diff --git a/pkg/syncevent/source.go b/pkg/syncevent/source.go
new file mode 100644
index 000000000..ddffb171a
--- /dev/null
+++ b/pkg/syncevent/source.go
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+// Source represents an event source.
+type Source interface {
+	// SubscribeEvents causes the Source to notify the given Receiver of the
+	// given subset of events.
+	//
+	// Preconditions: r != nil. The ReceiverCallback for r must not take locks
+	// that are ordered prior to the Source; for example, it cannot call any
+	// Source methods.
+	SubscribeEvents(r *Receiver, filter Set) SubscriptionID
+
+	// UnsubscribeEvents causes the Source to stop notifying the Receiver
+	// subscribed by a previous call to SubscribeEvents that returned the given
+	// SubscriptionID.
+	//
+	// Preconditions: UnsubscribeEvents may be called at most once for any
+	// given SubscriptionID.
+	UnsubscribeEvents(id SubscriptionID)
+}
+
+// SubscriptionID identifies a call to Source.SubscribeEvents.
+type SubscriptionID uint64
+
+// UnsubscribeAndAck is a convenience function that unsubscribes r from the
+// given events from src and also clears them from r.
+func UnsubscribeAndAck(src Source, r *Receiver, filter Set, id SubscriptionID) {
+	src.UnsubscribeEvents(id)
+	r.Ack(filter)
+}
+
+// NoopSource implements Source by never sending events to subscribed
+// Receivers.
+type NoopSource struct{}
+
+// SubscribeEvents implements Source.SubscribeEvents.
+func (NoopSource) SubscribeEvents(*Receiver, Set) SubscriptionID {
+	return 0
+}
+
+// UnsubscribeEvents implements Source.UnsubscribeEvents.
+func (NoopSource) UnsubscribeEvents(SubscriptionID) {
+}
+
+// See Broadcaster for a non-noop implementations of Source.
diff --git a/pkg/syncevent/syncevent.go b/pkg/syncevent/syncevent.go
new file mode 100644
index 000000000..9fb6a06de
--- /dev/null
+++ b/pkg/syncevent/syncevent.go
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package syncevent provides efficient primitives for goroutine
+// synchronization based on event bitmasks.
+package syncevent
+
+// Set is a bitmask where each bit represents a distinct user-defined event.
+// The event package does not treat any bits in Set specially.
+type Set uint64
+
+const (
+	// NoEvents is a Set containing no events.
+	NoEvents = Set(0)
+
+	// AllEvents is a Set containing all possible events.
+	AllEvents = ^Set(0)
+
+	// MaxEvents is the number of distinct events that can be represented by a Set.
+	MaxEvents = 64
+)
diff --git a/pkg/syncevent/syncevent_example_test.go b/pkg/syncevent/syncevent_example_test.go
new file mode 100644
index 000000000..bfb18e2ea
--- /dev/null
+++ b/pkg/syncevent/syncevent_example_test.go
@@ -0,0 +1,108 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"fmt"
+	"sync/atomic"
+	"time"
+)
+
+func Example_ioReadinessInterrputible() {
+	const (
+		evReady = Set(1 << iota)
+		evInterrupt
+	)
+	errNotReady := fmt.Errorf("not ready for I/O")
+
+	// State of some I/O object.
+	var (
+		br    Broadcaster
+		ready uint32
+	)
+	doIO := func() error {
+		if atomic.LoadUint32(&ready) == 0 {
+			return errNotReady
+		}
+		return nil
+	}
+	go func() {
+		// The I/O object eventually becomes ready for I/O.
+		time.Sleep(100 * time.Millisecond)
+		// When it does, it first ensures that future calls to isReady() return
+		// true, then broadcasts the readiness event to Receivers.
+		atomic.StoreUint32(&ready, 1)
+		br.Broadcast(evReady)
+	}()
+
+	// Each user of the I/O object owns a Waiter.
+	var w Waiter
+	w.Init()
+	// The Waiter may be asynchronously interruptible, e.g. for signal
+	// handling in the sentry.
+	go func() {
+		time.Sleep(200 * time.Millisecond)
+		w.Receiver().Notify(evInterrupt)
+	}()
+
+	// To use the I/O object:
+	//
+	// Optionally, if the I/O object is likely to be ready, attempt I/O first.
+	err := doIO()
+	if err == nil {
+		// Success, we're done.
+		return /* nil */
+	}
+	if err != errNotReady {
+		// Failure, I/O failed for some reason other than readiness.
+		return /* err */
+	}
+	// Subscribe for readiness events from the I/O object.
+	id := br.SubscribeEvents(w.Receiver(), evReady)
+	// When we are finished blocking, unsubscribe from readiness events and
+	// remove readiness events from the pending event set.
+	defer UnsubscribeAndAck(&br, w.Receiver(), evReady, id)
+	for {
+		// Attempt I/O again. This must be done after the call to SubscribeEvents,
+		// since the I/O object might have become ready between the previous call
+		// to doIO and the call to SubscribeEvents.
+		err = doIO()
+		if err == nil {
+			return /* nil */
+		}
+		if err != errNotReady {
+			return /* err */
+		}
+		// Block until either the I/O object indicates it is ready, or we are
+		// interrupted.
+		events := w.Wait()
+		if events&evInterrupt != 0 {
+			// In the specific case of sentry signal handling, signal delivery
+			// is handled by another system, so we aren't responsible for
+			// acknowledging evInterrupt.
+			return /* errInterrupted */
+		}
+		// Note that, in a concurrent context, the I/O object might become
+		// ready and then not ready again. To handle this:
+		//
+		// - evReady must be acknowledged before calling doIO() again (rather
+		// than after), so that if the I/O object becomes ready *again* after
+		// the call to doIO(), the readiness event is not lost.
+		//
+		// - We must loop instead of just calling doIO() once after receiving
+		// evReady.
+		w.Ack(evReady)
+	}
+}
diff --git a/pkg/syncevent/waiter_amd64.s b/pkg/syncevent/waiter_amd64.s
new file mode 100644
index 000000000..985b56ae5
--- /dev/null
+++ b/pkg/syncevent/waiter_amd64.s
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+//
+// func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
+TEXT ·waiterUnlock(SB),NOSPLIT,$0-24
+	MOVQ g+0(FP), DI
+	MOVQ wg+8(FP), SI
+
+	MOVQ $·preparingG(SB), AX
+	LOCK
+	CMPXCHGQ DI, 0(SI)
+
+	SETEQ AX
+	MOVB AX, ret+16(FP)
+
+	RET
+
diff --git a/pkg/syncevent/waiter_arm64.s b/pkg/syncevent/waiter_arm64.s
new file mode 100644
index 000000000..20d7ac23b
--- /dev/null
+++ b/pkg/syncevent/waiter_arm64.s
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+//
+// func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
+TEXT ·waiterUnlock(SB),NOSPLIT,$0-24
+	MOVD wg+8(FP), R0
+	MOVD $·preparingG(SB), R1
+	MOVD g+0(FP), R2
+again:
+	LDAXR (R0), R3
+	CMP R1, R3
+	BNE ok
+	STLXR R2, (R0), R3
+	CBNZ R3, again
+ok:
+	CSET EQ, R0
+	MOVB R0, ret+16(FP)
+	RET
+
diff --git a/pkg/syncevent/waiter_asm_unsafe.go b/pkg/syncevent/waiter_asm_unsafe.go
new file mode 100644
index 000000000..0995e9053
--- /dev/null
+++ b/pkg/syncevent/waiter_asm_unsafe.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 arm64
+
+package syncevent
+
+import (
+	"unsafe"
+)
+
+// See waiter_noasm_unsafe.go for a description of waiterUnlock.
+func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool
diff --git a/pkg/syncevent/waiter_noasm_unsafe.go b/pkg/syncevent/waiter_noasm_unsafe.go
new file mode 100644
index 000000000..1c4b0e39a
--- /dev/null
+++ b/pkg/syncevent/waiter_noasm_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// waiterUnlock is called from g0, so when the race detector is enabled,
+// waiterUnlock must be implemented in assembly since no race context is
+// available.
+//
+// +build !race
+// +build !amd64,!arm64
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+// waiterUnlock is the "unlock function" passed to runtime.gopark by
+// Waiter.Wait*. wg is &Waiter.g, and g is a pointer to the calling runtime.g.
+// waiterUnlock returns true if Waiter.Wait should sleep and false if sleeping
+// should be aborted.
+//
+//go:nosplit
+func waiterUnlock(g unsafe.Pointer, wg *unsafe.Pointer) bool {
+	// The only way this CAS can fail is if a call to Waiter.NotifyPending()
+	// has replaced *wg with nil, in which case we should not sleep.
+	return atomic.CompareAndSwapPointer(wg, (unsafe.Pointer)(&preparingG), g)
+}
diff --git a/pkg/syncevent/waiter_test.go b/pkg/syncevent/waiter_test.go
new file mode 100644
index 000000000..3c8cbcdd8
--- /dev/null
+++ b/pkg/syncevent/waiter_test.go
@@ -0,0 +1,414 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+func TestWaiterAlreadyPending(t *testing.T) {
+	var w Waiter
+	w.Init()
+	want := Set(1)
+	w.Notify(want)
+	if got := w.Wait(); got != want {
+		t.Errorf("Waiter.Wait: got %#x, wanted %#x", got, want)
+	}
+}
+
+func TestWaiterAsyncNotify(t *testing.T) {
+	var w Waiter
+	w.Init()
+	want := Set(1)
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		w.Notify(want)
+	}()
+	if got := w.Wait(); got != want {
+		t.Errorf("Waiter.Wait: got %#x, wanted %#x", got, want)
+	}
+}
+
+func TestWaiterWaitFor(t *testing.T) {
+	var w Waiter
+	w.Init()
+	evWaited := Set(1)
+	evOther := Set(2)
+	w.Notify(evOther)
+	notifiedEvent := uint32(0)
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+		atomic.StoreUint32(&notifiedEvent, 1)
+		w.Notify(evWaited)
+	}()
+	if got, want := w.WaitFor(evWaited), evWaited|evOther; got != want {
+		t.Errorf("Waiter.WaitFor: got %#x, wanted %#x", got, want)
+	}
+	if atomic.LoadUint32(&notifiedEvent) == 0 {
+		t.Errorf("Waiter.WaitFor returned before goroutine notified waited-for event")
+	}
+}
+
+func TestWaiterWaitAndAckAll(t *testing.T) {
+	var w Waiter
+	w.Init()
+	w.Notify(AllEvents)
+	if got := w.WaitAndAckAll(); got != AllEvents {
+		t.Errorf("Waiter.WaitAndAckAll: got %#x, wanted %#x", got, AllEvents)
+	}
+	if got := w.Pending(); got != NoEvents {
+		t.Errorf("Waiter.WaitAndAckAll did not ack all events: got %#x, wanted 0", got)
+	}
+}
+
+// BenchmarkWaiterX, BenchmarkSleeperX, and BenchmarkChannelX benchmark usage
+// pattern X (described in terms of Waiter) with Waiter, sleep.Sleeper, and
+// buffered chan struct{} respectively. When the maximum number of event
+// sources is relevant, we use 3 event sources because this is representative
+// of the kernel.Task.block() use case: an interrupt source, a timeout source,
+// and the actual event source being waited on.
+
+// Event set used by most benchmarks.
+const evBench Set = 1
+
+// BenchmarkXxxNotifyRedundant measures how long it takes to notify a Waiter of
+// an event that is already pending.
+
+func BenchmarkWaiterNotifyRedundant(b *testing.B) {
+	var w Waiter
+	w.Init()
+	w.Notify(evBench)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyRedundant(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+	w.Assert()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Assert()
+	}
+}
+
+func BenchmarkChannelNotifyRedundant(b *testing.B) {
+	ch := make(chan struct{}, 1)
+	ch <- struct{}{}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		select {
+		case ch <- struct{}{}:
+		default:
+		}
+	}
+}
+
+// BenchmarkXxxNotifyWaitAck measures how long it takes to notify a Waiter an
+// event, return that event using a blocking check, and then unset the event as
+// pending.
+
+func BenchmarkWaiterNotifyWaitAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+		w.Wait()
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Assert()
+		s.Fetch(true)
+	}
+}
+
+func BenchmarkChannelNotifyWaitAck(b *testing.B) {
+	ch := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// notify
+		select {
+		case ch <- struct{}{}:
+		default:
+		}
+
+		// wait + ack
+		<-ch
+	}
+}
+
+// BenchmarkSleeperMultiNotifyWaitAck is equivalent to
+// BenchmarkSleeperNotifyWaitAck, but also includes allocation of a
+// temporary sleep.Waker. This is necessary when multiple goroutines may wait
+// for the same event, since each sleep.Waker can wake only a single
+// sleep.Sleeper.
+//
+// The syncevent package does not require a distinct object for each
+// waiter-waker relationship, so BenchmarkWaiterNotifyWaitAck and
+// BenchmarkWaiterMultiNotifyWaitAck would be identical. The analogous state
+// for channels, runtime.sudog, is inescapably runtime-allocated, so
+// BenchmarkChannelNotifyWaitAck and BenchmarkChannelMultiNotifyWaitAck would
+// also be identical.
+
+func BenchmarkSleeperMultiNotifyWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	// The sleep package doesn't provide sync.Pool allocation of Wakers;
+	// we do for a fairer comparison.
+	wakerPool := sync.Pool{
+		New: func() interface{} {
+			return &sleep.Waker{}
+		},
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w := wakerPool.Get().(*sleep.Waker)
+		s.AddWaker(w, 0)
+		w.Assert()
+		s.Fetch(true)
+		s.Done()
+		wakerPool.Put(w)
+	}
+}
+
+// BenchmarkXxxTempNotifyWaitAck is equivalent to NotifyWaitAck, but also
+// includes allocation of a temporary Waiter. This models the case where a
+// goroutine not already associated with a Waiter needs one in order to block.
+//
+// The analogous state for channels is built into runtime.g, so
+// BenchmarkChannelNotifyWaitAck and BenchmarkChannelTempNotifyWaitAck would be
+// identical.
+
+func BenchmarkWaiterTempNotifyWaitAck(b *testing.B) {
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w := GetWaiter()
+		w.Notify(evBench)
+		w.Wait()
+		w.Ack(evBench)
+		PutWaiter(w)
+	}
+}
+
+func BenchmarkSleeperTempNotifyWaitAck(b *testing.B) {
+	// The sleep package doesn't provide sync.Pool allocation of Sleepers;
+	// we do for a fairer comparison.
+	sleeperPool := sync.Pool{
+		New: func() interface{} {
+			return &sleep.Sleeper{}
+		},
+	}
+	var w sleep.Waker
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s := sleeperPool.Get().(*sleep.Sleeper)
+		s.AddWaker(&w, 0)
+		w.Assert()
+		s.Fetch(true)
+		s.Done()
+		sleeperPool.Put(s)
+	}
+}
+
+// BenchmarkXxxNotifyWaitMultiAck is equivalent to NotifyWaitAck, but allows
+// for multiple event sources.
+
+func BenchmarkWaiterNotifyWaitMultiAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		w.Notify(evBench)
+		if e := w.Wait(); e != evBench {
+			b.Fatalf("Wait: got %#x, wanted %#x", e, evBench)
+		}
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyWaitMultiAck(b *testing.B) {
+	var s sleep.Sleeper
+	var ws [3]sleep.Waker
+	for i := range ws {
+		s.AddWaker(&ws[i], i)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ws[0].Assert()
+		if id, _ := s.Fetch(true); id != 0 {
+			b.Fatalf("Fetch: got %d, wanted 0", id)
+		}
+	}
+}
+
+func BenchmarkChannelNotifyWaitMultiAck(b *testing.B) {
+	ch0 := make(chan struct{}, 1)
+	ch1 := make(chan struct{}, 1)
+	ch2 := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// notify
+		select {
+		case ch0 <- struct{}{}:
+		default:
+		}
+
+		// wait + clear
+		select {
+		case <-ch0:
+			// ok
+		case <-ch1:
+			b.Fatalf("received from ch1")
+		case <-ch2:
+			b.Fatalf("received from ch2")
+		}
+	}
+}
+
+// BenchmarkXxxNotifyAsyncWaitAck measures how long it takes to wait for an
+// event while another goroutine signals the event. This assumes that a new
+// goroutine doesn't run immediately (i.e. the creator of a new goroutine is
+// allowed to go to sleep before the new goroutine has a chance to run).
+
+func BenchmarkWaiterNotifyAsyncWaitAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Notify(1)
+		}()
+		w.Wait()
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyAsyncWaitAck(b *testing.B) {
+	var s sleep.Sleeper
+	var w sleep.Waker
+	s.AddWaker(&w, 0)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Assert()
+		}()
+		s.Fetch(true)
+	}
+}
+
+func BenchmarkChannelNotifyAsyncWaitAck(b *testing.B) {
+	ch := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			select {
+			case ch <- struct{}{}:
+			default:
+			}
+		}()
+		<-ch
+	}
+}
+
+// BenchmarkXxxNotifyAsyncWaitMultiAck is equivalent to NotifyAsyncWaitAck, but
+// allows for multiple event sources.
+
+func BenchmarkWaiterNotifyAsyncWaitMultiAck(b *testing.B) {
+	var w Waiter
+	w.Init()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			w.Notify(evBench)
+		}()
+		if e := w.Wait(); e != evBench {
+			b.Fatalf("Wait: got %#x, wanted %#x", e, evBench)
+		}
+		w.Ack(evBench)
+	}
+}
+
+func BenchmarkSleeperNotifyAsyncWaitMultiAck(b *testing.B) {
+	var s sleep.Sleeper
+	var ws [3]sleep.Waker
+	for i := range ws {
+		s.AddWaker(&ws[i], i)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			ws[0].Assert()
+		}()
+		if id, _ := s.Fetch(true); id != 0 {
+			b.Fatalf("Fetch: got %d, expected 0", id)
+		}
+	}
+}
+
+func BenchmarkChannelNotifyAsyncWaitMultiAck(b *testing.B) {
+	ch0 := make(chan struct{}, 1)
+	ch1 := make(chan struct{}, 1)
+	ch2 := make(chan struct{}, 1)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		go func() {
+			select {
+			case ch0 <- struct{}{}:
+			default:
+			}
+		}()
+
+		select {
+		case <-ch0:
+			// ok
+		case <-ch1:
+			b.Fatalf("received from ch1")
+		case <-ch2:
+			b.Fatalf("received from ch2")
+		}
+	}
+}
diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go
new file mode 100644
index 000000000..ad271e1a0
--- /dev/null
+++ b/pkg/syncevent/waiter_unsafe.go
@@ -0,0 +1,206 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.11
+// +build !go1.16
+
+// Check go:linkname function signatures when updating Go version.
+
+package syncevent
+
+import (
+	"sync/atomic"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+//go:linkname gopark runtime.gopark
+func gopark(unlockf func(unsafe.Pointer, *unsafe.Pointer) bool, wg *unsafe.Pointer, reason uint8, traceEv byte, traceskip int)
+
+//go:linkname goready runtime.goready
+func goready(g unsafe.Pointer, traceskip int)
+
+const (
+	waitReasonSelect     = 9  // Go: src/runtime/runtime2.go
+	traceEvGoBlockSelect = 24 // Go: src/runtime/trace.go
+)
+
+// Waiter allows a goroutine to block on pending events received by a Receiver.
+//
+// Waiter.Init() must be called before first use.
+type Waiter struct {
+	r Receiver
+
+	// g is one of:
+	//
+	// - nil: No goroutine is blocking in Wait.
+	//
+	// - &preparingG: A goroutine is in Wait preparing to sleep, but hasn't yet
+	// completed waiterUnlock(). Thus the wait can only be interrupted by
+	// replacing the value of g with nil (the G may not be in state Gwaiting
+	// yet, so we can't call goready.)
+	//
+	// - Otherwise: g is a pointer to the runtime.g in state Gwaiting for the
+	// goroutine blocked in Wait, which can only be woken by calling goready.
+	g unsafe.Pointer `state:"zerovalue"`
+}
+
+// Sentinel object for Waiter.g.
+var preparingG struct{}
+
+// Init must be called before first use of w.
+func (w *Waiter) Init() {
+	w.r.Init(w)
+}
+
+// Receiver returns the Receiver that receives events that unblock calls to
+// w.Wait().
+func (w *Waiter) Receiver() *Receiver {
+	return &w.r
+}
+
+// Pending returns the set of pending events.
+func (w *Waiter) Pending() Set {
+	return w.r.Pending()
+}
+
+// Wait blocks until at least one event is pending, then returns the set of
+// pending events. It does not affect the set of pending events; callers must
+// call w.Ack() to do so, or use w.WaitAndAck() instead.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) Wait() Set {
+	return w.WaitFor(AllEvents)
+}
+
+// WaitFor blocks until at least one event in es is pending, then returns the
+// set of pending events (including those not in es). It does not affect the
+// set of pending events; callers must call w.Ack() to do so.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) WaitFor(es Set) Set {
+	for {
+		// Optimization: Skip the atomic store to w.g if an event is already
+		// pending.
+		if p := w.r.Pending(); p&es != NoEvents {
+			return p
+		}
+
+		// Indicate that we're preparing to go to sleep.
+		atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG))
+
+		// If an event is pending, abort the sleep.
+		if p := w.r.Pending(); p&es != NoEvents {
+			atomic.StorePointer(&w.g, nil)
+			return p
+		}
+
+		// If w.g is still preparingG (i.e. w.NotifyPending() has not been
+		// called or has not reached atomic.SwapPointer()), go to sleep until
+		// w.NotifyPending() => goready().
+		gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0)
+	}
+}
+
+// Ack marks the given events as not pending.
+func (w *Waiter) Ack(es Set) {
+	w.r.Ack(es)
+}
+
+// WaitAndAckAll blocks until at least one event is pending, then marks all
+// events as not pending and returns the set of previously-pending events.
+//
+// Precondition: Only one goroutine may call any Wait* method at a time.
+func (w *Waiter) WaitAndAckAll() Set {
+	// Optimization: Skip the atomic store to w.g if an event is already
+	// pending. Call Pending() first since, in the common case that events are
+	// not yet pending, this skips an atomic swap on w.r.pending.
+	if w.r.Pending() != NoEvents {
+		if p := w.r.PendingAndAckAll(); p != NoEvents {
+			return p
+		}
+	}
+
+	for {
+		// Indicate that we're preparing to go to sleep.
+		atomic.StorePointer(&w.g, (unsafe.Pointer)(&preparingG))
+
+		// If an event is pending, abort the sleep.
+		if w.r.Pending() != NoEvents {
+			if p := w.r.PendingAndAckAll(); p != NoEvents {
+				atomic.StorePointer(&w.g, nil)
+				return p
+			}
+		}
+
+		// If w.g is still preparingG (i.e. w.NotifyPending() has not been
+		// called or has not reached atomic.SwapPointer()), go to sleep until
+		// w.NotifyPending() => goready().
+		gopark(waiterUnlock, &w.g, waitReasonSelect, traceEvGoBlockSelect, 0)
+
+		// Check for pending events. We call PendingAndAckAll() directly now since
+		// we only expect to be woken after events become pending.
+		if p := w.r.PendingAndAckAll(); p != NoEvents {
+			return p
+		}
+	}
+}
+
+// Notify marks the given events as pending, possibly unblocking concurrent
+// calls to w.Wait() or w.WaitFor().
+func (w *Waiter) Notify(es Set) {
+	w.r.Notify(es)
+}
+
+// NotifyPending implements ReceiverCallback.NotifyPending. Users of Waiter
+// should not call NotifyPending.
+func (w *Waiter) NotifyPending() {
+	// Optimization: Skip the atomic swap on w.g if there is no sleeping
+	// goroutine. NotifyPending is called after w.r.Pending() is updated, so
+	// concurrent and future calls to w.Wait() will observe pending events and
+	// abort sleeping.
+	if atomic.LoadPointer(&w.g) == nil {
+		return
+	}
+	// Wake a sleeping G, or prevent a G that is preparing to sleep from doing
+	// so. Swap is needed here to ensure that only one call to NotifyPending
+	// calls goready.
+	if g := atomic.SwapPointer(&w.g, nil); g != nil && g != (unsafe.Pointer)(&preparingG) {
+		goready(g, 0)
+	}
+}
+
+var waiterPool = sync.Pool{
+	New: func() interface{} {
+		w := &Waiter{}
+		w.Init()
+		return w
+	},
+}
+
+// GetWaiter returns an unused Waiter. PutWaiter should be called to release
+// the Waiter once it is no longer needed.
+//
+// Where possible, users should prefer to associate each goroutine that calls
+// Waiter.Wait() with a distinct pre-allocated Waiter to avoid allocation of
+// Waiters in hot paths.
+func GetWaiter() *Waiter {
+	return waiterPool.Get().(*Waiter)
+}
+
+// PutWaiter releases an unused Waiter previously returned by GetWaiter.
+func PutWaiter(w *Waiter) {
+	waiterPool.Put(w)
+}
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 2269f6237..f86db0999 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -27,8 +27,10 @@ import (
 var (
 	E2BIG        = error(syscall.E2BIG)
 	EACCES       = error(syscall.EACCES)
+	EADDRINUSE   = error(syscall.EADDRINUSE)
 	EAGAIN       = error(syscall.EAGAIN)
 	EBADF        = error(syscall.EBADF)
+	EBADFD       = error(syscall.EBADFD)
 	EBUSY        = error(syscall.EBUSY)
 	ECHILD       = error(syscall.ECHILD)
 	ECONNREFUSED = error(syscall.ECONNREFUSED)
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 26f7ba86b..454e07662 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -5,8 +5,6 @@ package(licenses = ["notice"])
 go_library(
     name = "tcpip",
     srcs = [
-        "packet_buffer.go",
-        "packet_buffer_state.go",
         "tcpip.go",
         "time_unsafe.go",
         "timer.go",
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index 6e0db2741..d82ed5205 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -335,6 +335,11 @@ func (c *TCPConn) Read(b []byte) (int, error) {
 	deadline := c.readCancel()
 
 	numRead := 0
+	defer func() {
+		if numRead != 0 {
+			c.ep.ModerateRecvBuf(numRead)
+		}
+	}()
 	for numRead != len(b) {
 		if len(c.read) == 0 {
 			var err error
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index ea0a0409a..3c552988a 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -127,6 +127,10 @@ func TestCloseReader(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 
@@ -175,6 +179,10 @@ func TestCloseReaderWithForwarder(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -225,30 +233,21 @@ func TestCloseRead(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
 
 	fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) {
 		var wq waiter.Queue
-		ep, err := r.CreateEndpoint(&wq)
+		_, err := r.CreateEndpoint(&wq)
 		if err != nil {
 			t.Fatalf("r.CreateEndpoint() = %v", err)
 		}
-		defer ep.Close()
-		r.Complete(false)
-
-		c := NewTCPConn(&wq, ep)
-
-		buf := make([]byte, 256)
-		n, e := c.Read(buf)
-		if e != nil || string(buf[:n]) != "abc123" {
-			t.Fatalf("c.Read() = (%d, %v), want (6, nil)", n, e)
-		}
-
-		if n, e = c.Write([]byte("abc123")); e != nil {
-			t.Errorf("c.Write() = (%d, %v), want (6, nil)", n, e)
-		}
+		// Endpoint will be closed in deferred s.Close (above).
 	})
 
 	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
@@ -278,6 +277,10 @@ func TestCloseWrite(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -334,6 +337,10 @@ func TestUDPForwarder(t *testing.T) {
 	if terr != nil {
 		t.Fatalf("newLoopbackStack() = %v", terr)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr1 := tcpip.FullAddress{NICID, ip1, 11211}
@@ -391,6 +398,10 @@ func TestDeadlineChange(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 
@@ -440,6 +451,10 @@ func TestPacketConnTransfer(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr1 := tcpip.FullAddress{NICID, ip1, 11211}
@@ -492,6 +507,10 @@ func TestConnectedPacketConnTransfer(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr := tcpip.FullAddress{NICID, ip, 11211}
@@ -562,6 +581,8 @@ func makePipe() (c1, c2 net.Conn, stop func(), err error) {
 	stop = func() {
 		c1.Close()
 		c2.Close()
+		s.Close()
+		s.Wait()
 	}
 
 	if err := l.Close(); err != nil {
@@ -624,6 +645,10 @@ func TestTCPDialError(t *testing.T) {
 	if e != nil {
 		t.Fatalf("newLoopbackStack() = %v", e)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
 	addr := tcpip.FullAddress{NICID, ip, 11211}
@@ -641,6 +666,10 @@ func TestDialContextTCPCanceled(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
@@ -659,6 +688,10 @@ func TestDialContextTCPTimeout(t *testing.T) {
 	if err != nil {
 		t.Fatalf("newLoopbackStack() = %v", err)
 	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
 
 	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
 	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 150310c11..9a3c5d6c3 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -15,6 +15,11 @@
 // Package buffer provides the implementation of a buffer view.
 package buffer
 
+import (
+	"bytes"
+	"io"
+)
+
 // View is a slice of a buffer, with convenience methods.
 type View []byte
 
@@ -45,8 +50,18 @@ func (v *View) CapLength(length int) {
 	*v = (*v)[:length:length]
 }
 
+// Reader returns a bytes.Reader for v.
+func (v *View) Reader() bytes.Reader {
+	var r bytes.Reader
+	r.Reset(*v)
+	return r
+}
+
 // ToVectorisedView returns a VectorisedView containing the receiver.
 func (v View) ToVectorisedView() VectorisedView {
+	if len(v) == 0 {
+		return VectorisedView{}
+	}
 	return NewVectorisedView(len(v), []View{v})
 }
 
@@ -65,7 +80,8 @@ func NewVectorisedView(size int, views []View) VectorisedView {
 	return VectorisedView{views: views, size: size}
 }
 
-// TrimFront removes the first "count" bytes of the vectorised view.
+// TrimFront removes the first "count" bytes of the vectorised view. It panics
+// if count > vv.Size().
 func (vv *VectorisedView) TrimFront(count int) {
 	for count > 0 && len(vv.views) > 0 {
 		if count < len(vv.views[0]) {
@@ -74,8 +90,49 @@ func (vv *VectorisedView) TrimFront(count int) {
 			return
 		}
 		count -= len(vv.views[0])
-		vv.RemoveFirst()
+		vv.removeFirst()
+	}
+}
+
+// Read implements io.Reader.
+func (vv *VectorisedView) Read(v View) (copied int, err error) {
+	count := len(v)
+	for count > 0 && len(vv.views) > 0 {
+		if count < len(vv.views[0]) {
+			vv.size -= count
+			copy(v[copied:], vv.views[0][:count])
+			vv.views[0].TrimFront(count)
+			copied += count
+			return copied, nil
+		}
+		count -= len(vv.views[0])
+		copy(v[copied:], vv.views[0])
+		copied += len(vv.views[0])
+		vv.removeFirst()
+	}
+	if copied == 0 {
+		return 0, io.EOF
+	}
+	return copied, nil
+}
+
+// ReadToVV reads up to n bytes from vv to dstVV and removes them from vv. It
+// returns the number of bytes copied.
+func (vv *VectorisedView) ReadToVV(dstVV *VectorisedView, count int) (copied int) {
+	for count > 0 && len(vv.views) > 0 {
+		if count < len(vv.views[0]) {
+			vv.size -= count
+			dstVV.AppendView(vv.views[0][:count])
+			vv.views[0].TrimFront(count)
+			copied += count
+			return
+		}
+		count -= len(vv.views[0])
+		dstVV.AppendView(vv.views[0])
+		copied += len(vv.views[0])
+		vv.removeFirst()
 	}
+	return copied
 }
 
 // CapLength irreversibly reduces the length of the vectorised view.
@@ -105,29 +162,45 @@ func (vv *VectorisedView) CapLength(length int) {
 // Clone returns a clone of this VectorisedView.
 // If the buffer argument is large enough to contain all the Views of this VectorisedView,
 // the method will avoid allocations and use the buffer to store the Views of the clone.
-func (vv VectorisedView) Clone(buffer []View) VectorisedView {
+func (vv *VectorisedView) Clone(buffer []View) VectorisedView {
 	return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size}
 }
 
-// First returns the first view of the vectorised view.
-func (vv VectorisedView) First() View {
+// PullUp returns the first "count" bytes of the vectorised view. If those
+// bytes aren't already contiguous inside the vectorised view, PullUp will
+// reallocate as needed to make them contiguous. PullUp fails and returns false
+// when count > vv.Size().
+func (vv *VectorisedView) PullUp(count int) (View, bool) {
 	if len(vv.views) == 0 {
-		return nil
+		return nil, count == 0
+	}
+	if count <= len(vv.views[0]) {
+		return vv.views[0][:count], true
+	}
+	if count > vv.size {
+		return nil, false
 	}
-	return vv.views[0]
-}
 
-// RemoveFirst removes the first view of the vectorised view.
-func (vv *VectorisedView) RemoveFirst() {
-	if len(vv.views) == 0 {
-		return
+	newFirst := NewView(count)
+	i := 0
+	for offset := 0; offset < count; i++ {
+		copy(newFirst[offset:], vv.views[i])
+		if count-offset < len(vv.views[i]) {
+			vv.views[i].TrimFront(count - offset)
+			break
+		}
+		offset += len(vv.views[i])
+		vv.views[i] = nil
 	}
-	vv.size -= len(vv.views[0])
-	vv.views = vv.views[1:]
+	// We're guaranteed that i > 0, since count is too large for the first
+	// view.
+	vv.views[i-1] = newFirst
+	vv.views = vv.views[i-1:]
+	return newFirst, true
 }
 
 // Size returns the size in bytes of the entire content stored in the vectorised view.
-func (vv VectorisedView) Size() int {
+func (vv *VectorisedView) Size() int {
 	return vv.size
 }
 
@@ -135,7 +208,7 @@ func (vv VectorisedView) Size() int {
 //
 // If the vectorised view contains a single view, that view will be returned
 // directly.
-func (vv VectorisedView) ToView() View {
+func (vv *VectorisedView) ToView() View {
 	if len(vv.views) == 1 {
 		return vv.views[0]
 	}
@@ -147,7 +220,7 @@ func (vv VectorisedView) ToView() View {
 }
 
 // Views returns the slice containing the all views.
-func (vv VectorisedView) Views() []View {
+func (vv *VectorisedView) Views() []View {
 	return vv.views
 }
 
@@ -156,3 +229,28 @@ func (vv *VectorisedView) Append(vv2 VectorisedView) {
 	vv.views = append(vv.views, vv2.views...)
 	vv.size += vv2.size
 }
+
+// AppendView appends the given view into this vectorised view.
+func (vv *VectorisedView) AppendView(v View) {
+	if len(v) == 0 {
+		return
+	}
+	vv.views = append(vv.views, v)
+	vv.size += len(v)
+}
+
+// Readers returns a bytes.Reader for each of vv's views.
+func (vv *VectorisedView) Readers() []bytes.Reader {
+	readers := make([]bytes.Reader, 0, len(vv.views))
+	for _, v := range vv.views {
+		readers = append(readers, v.Reader())
+	}
+	return readers
+}
+
+// removeFirst panics when len(vv.views) < 1.
+func (vv *VectorisedView) removeFirst() {
+	vv.size -= len(vv.views[0])
+	vv.views[0] = nil
+	vv.views = vv.views[1:]
+}
diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go
index ebc3a17b7..726e54de9 100644
--- a/pkg/tcpip/buffer/view_test.go
+++ b/pkg/tcpip/buffer/view_test.go
@@ -16,6 +16,7 @@
 package buffer
 
 import (
+	"bytes"
 	"reflect"
 	"testing"
 )
@@ -233,3 +234,288 @@ func TestToClone(t *testing.T) {
 		})
 	}
 }
+
+func TestVVReadToVV(t *testing.T) {
+	testCases := []struct {
+		comment     string
+		vv          VectorisedView
+		bytesToRead int
+		wantBytes   string
+		leftVV      VectorisedView
+	}{
+		{
+			comment:     "large VV, short read",
+			vv:          vv(30, "012345678901234567890123456789"),
+			bytesToRead: 10,
+			wantBytes:   "0123456789",
+			leftVV:      vv(20, "01234567890123456789"),
+		},
+		{
+			comment:     "largeVV, multiple views, short read",
+			vv:          vv(13, "123", "345", "567", "8910"),
+			bytesToRead: 6,
+			wantBytes:   "123345",
+			leftVV:      vv(7, "567", "8910"),
+		},
+		{
+			comment:     "smallVV (multiple views), large read",
+			vv:          vv(3, "1", "2", "3"),
+			bytesToRead: 10,
+			wantBytes:   "123",
+			leftVV:      vv(0, ""),
+		},
+		{
+			comment:     "smallVV (single view), large read",
+			vv:          vv(1, "1"),
+			bytesToRead: 10,
+			wantBytes:   "1",
+			leftVV:      vv(0, ""),
+		},
+		{
+			comment:     "emptyVV, large read",
+			vv:          vv(0, ""),
+			bytesToRead: 10,
+			wantBytes:   "",
+			leftVV:      vv(0, ""),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.comment, func(t *testing.T) {
+			var readTo VectorisedView
+			inSize := tc.vv.Size()
+			copied := tc.vv.ReadToVV(&readTo, tc.bytesToRead)
+			if got, want := copied, len(tc.wantBytes); got != want {
+				t.Errorf("incorrect number of bytes copied returned in ReadToVV got: %d, want: %d, tc: %+v", got, want, tc)
+			}
+			if got, want := string(readTo.ToView()), tc.wantBytes; got != want {
+				t.Errorf("unexpected content in readTo got: %s, want: %s", got, want)
+			}
+			if got, want := tc.vv.Size(), inSize-copied; got != want {
+				t.Errorf("test VV has incorrect size after reading got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(tc.vv.ToView()), string(tc.leftVV.ToView()); got != want {
+				t.Errorf("unexpected data left in vv after read got: %+v, want: %+v", got, want)
+			}
+		})
+	}
+}
+
+func TestVVRead(t *testing.T) {
+	testCases := []struct {
+		comment     string
+		vv          VectorisedView
+		bytesToRead int
+		readBytes   string
+		leftBytes   string
+		wantError   bool
+	}{
+		{
+			comment:     "large VV, short read",
+			vv:          vv(30, "012345678901234567890123456789"),
+			bytesToRead: 10,
+			readBytes:   "0123456789",
+			leftBytes:   "01234567890123456789",
+		},
+		{
+			comment:     "largeVV, multiple buffers, short read",
+			vv:          vv(13, "123", "345", "567", "8910"),
+			bytesToRead: 6,
+			readBytes:   "123345",
+			leftBytes:   "5678910",
+		},
+		{
+			comment:     "smallVV, large read",
+			vv:          vv(3, "1", "2", "3"),
+			bytesToRead: 10,
+			readBytes:   "123",
+			leftBytes:   "",
+		},
+		{
+			comment:     "smallVV, large read",
+			vv:          vv(1, "1"),
+			bytesToRead: 10,
+			readBytes:   "1",
+			leftBytes:   "",
+		},
+		{
+			comment:     "emptyVV, large read",
+			vv:          vv(0, ""),
+			bytesToRead: 10,
+			readBytes:   "",
+			wantError:   true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.comment, func(t *testing.T) {
+			readTo := NewView(tc.bytesToRead)
+			inSize := tc.vv.Size()
+			copied, err := tc.vv.Read(readTo)
+			if !tc.wantError && err != nil {
+				t.Fatalf("unexpected error in tc.vv.Read(..) = %s", err)
+			}
+			readTo = readTo[:copied]
+			if got, want := copied, len(tc.readBytes); got != want {
+				t.Errorf("incorrect number of bytes copied returned in ReadToVV got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(readTo), tc.readBytes; got != want {
+				t.Errorf("unexpected data in readTo got: %s, want: %s", got, want)
+			}
+			if got, want := tc.vv.Size(), inSize-copied; got != want {
+				t.Errorf("test VV has incorrect size after reading got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(tc.vv.ToView()), tc.leftBytes; got != want {
+				t.Errorf("vv has incorrect data after Read got: %s, want: %s", got, want)
+			}
+		})
+	}
+}
+
+var pullUpTestCases = []struct {
+	comment string
+	in      VectorisedView
+	count   int
+	want    []byte
+	result  VectorisedView
+	ok      bool
+}{
+	{
+		comment: "simple case",
+		in:      vv(2, "12"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(2, "12"),
+		ok:      true,
+	},
+	{
+		comment: "entire View",
+		in:      vv(2, "1", "2"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(2, "1", "2"),
+		ok:      true,
+	},
+	{
+		comment: "spanning across two Views",
+		in:      vv(3, "1", "23"),
+		count:   2,
+		want:    []byte("12"),
+		result:  vv(3, "12", "3"),
+		ok:      true,
+	},
+	{
+		comment: "spanning across all Views",
+		in:      vv(5, "1", "23", "45"),
+		count:   5,
+		want:    []byte("12345"),
+		result:  vv(5, "12345"),
+		ok:      true,
+	},
+	{
+		comment: "count = 0",
+		in:      vv(1, "1"),
+		count:   0,
+		want:    []byte{},
+		result:  vv(1, "1"),
+		ok:      true,
+	},
+	{
+		comment: "count = size",
+		in:      vv(1, "1"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(1, "1"),
+		ok:      true,
+	},
+	{
+		comment: "count too large",
+		in:      vv(3, "1", "23"),
+		count:   4,
+		want:    nil,
+		result:  vv(3, "1", "23"),
+		ok:      false,
+	},
+	{
+		comment: "empty vv",
+		in:      vv(0, ""),
+		count:   1,
+		want:    nil,
+		result:  vv(0, ""),
+		ok:      false,
+	},
+	{
+		comment: "empty vv, count = 0",
+		in:      vv(0, ""),
+		count:   0,
+		want:    nil,
+		result:  vv(0, ""),
+		ok:      true,
+	},
+	{
+		comment: "empty views",
+		in:      vv(3, "", "1", "", "23"),
+		count:   2,
+		want:    []byte("12"),
+		result:  vv(3, "12", "3"),
+		ok:      true,
+	},
+}
+
+func TestPullUp(t *testing.T) {
+	for _, c := range pullUpTestCases {
+		got, ok := c.in.PullUp(c.count)
+
+		// Is the return value right?
+		if ok != c.ok {
+			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got an ok of %t. Want %t",
+				c.comment, c.count, c.in, ok, c.ok)
+		}
+		if bytes.Compare(got, View(c.want)) != 0 {
+			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got %v. Want %v",
+				c.comment, c.count, c.in, got, c.want)
+		}
+
+		// Is the underlying structure right?
+		if !reflect.DeepEqual(c.in, c.result) {
+			t.Errorf("Test %q failed when calling PullUp(%d). Got vv with structure %v. Wanted %v",
+				c.comment, c.count, c.in, c.result)
+		}
+	}
+}
+
+func TestToVectorisedView(t *testing.T) {
+	testCases := []struct {
+		in   View
+		want VectorisedView
+	}{
+		{nil, VectorisedView{}},
+		{View{}, VectorisedView{}},
+		{View{'a'}, VectorisedView{size: 1, views: []View{{'a'}}}},
+	}
+	for _, tc := range testCases {
+		if got, want := tc.in.ToVectorisedView(), tc.want; !reflect.DeepEqual(got, want) {
+			t.Errorf("(%v).ToVectorisedView failed got: %+v, want: %+v", tc.in, got, want)
+		}
+	}
+}
+
+func TestAppendView(t *testing.T) {
+	testCases := []struct {
+		vv   VectorisedView
+		in   View
+		want VectorisedView
+	}{
+		{VectorisedView{}, nil, VectorisedView{}},
+		{VectorisedView{}, View{}, VectorisedView{}},
+		{VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, nil, VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}},
+		{VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, View{}, VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}},
+		{VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, View{'e'}, VectorisedView{[]View{{'a', 'b', 'c', 'd'}, {'e'}}, 5}},
+	}
+	for _, tc := range testCases {
+		tc.vv.AppendView(tc.in)
+		if got, want := tc.vv, tc.want; !reflect.DeepEqual(got, want) {
+			t.Errorf("(%v).ToVectorisedView failed got: %+v, want: %+v", tc.in, got, want)
+		}
+	}
+}
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 4d6ae0871..c1745ba6a 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -107,6 +107,8 @@ func DstAddr(addr tcpip.Address) NetworkChecker {
 // TTL creates a checker that checks the TTL (ipv4) or HopLimit (ipv6).
 func TTL(ttl uint8) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
 		var v uint8
 		switch ip := h[0].(type) {
 		case header.IPv4:
@@ -161,6 +163,20 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTClass creates a checker that checks the TCLASS field in
+// ControlMessages.
+func ReceiveTClass(want uint32) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTClass {
+			t.Fatalf("got cm.HasTClass = %t, want cm.TClass = %d", cm.HasTClass, want)
+		}
+		if got := cm.TClass; got != want {
+			t.Fatalf("got cm.TClass = %d, want %d", got, want)
+		}
+	}
+}
+
 // ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
 func ReceiveTOS(want uint8) ControlMessagesChecker {
 	return func(t *testing.T, cm tcpip.ControlMessages) {
@@ -296,6 +312,8 @@ func SrcPort(port uint16) TransportChecker {
 // DstPort creates a checker that checks the destination port.
 func DstPort(port uint16) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		if p := h.DestinationPort(); p != port {
 			t.Errorf("Bad destination port, got %v, want %v", p, port)
 		}
@@ -322,6 +340,7 @@ func SeqNum(seq uint32) TransportChecker {
 func AckNum(seq uint32) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -336,6 +355,8 @@ func AckNum(seq uint32) TransportChecker {
 // Window creates a checker that checks the tcp window.
 func Window(window uint16) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -367,6 +388,8 @@ func TCPFlags(flags uint8) TransportChecker {
 // given mask, match the supplied flags.
 func TCPFlagsMatch(flags, mask uint8) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -384,6 +407,8 @@ func TCPFlagsMatch(flags, mask uint8) TransportChecker {
 // If wndscale is negative, the window scale option must not be present.
 func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -480,6 +505,8 @@ func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker {
 // skipped.
 func TCPTimestampChecker(wantTS bool, wantTSVal uint32, wantTSEcr uint32) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		tcp, ok := h.(header.TCP)
 		if !ok {
 			return
@@ -598,6 +625,8 @@ func TCPSACKBlockChecker(sackBlocks []header.SACKBlock) TransportChecker {
 // Payload creates a checker that checks the payload.
 func Payload(want []byte) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
 		if got := h.Payload(); !reflect.DeepEqual(got, want) {
 			t.Errorf("Wrong payload, got %v, want %v", got, want)
 		}
@@ -630,6 +659,7 @@ func ICMPv4(checkers ...TransportChecker) NetworkChecker {
 func ICMPv4Type(want header.ICMPv4Type) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		icmpv4, ok := h.(header.ICMPv4)
 		if !ok {
 			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h)
@@ -644,6 +674,7 @@ func ICMPv4Type(want header.ICMPv4Type) TransportChecker {
 func ICMPv4Code(want byte) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		icmpv4, ok := h.(header.ICMPv4)
 		if !ok {
 			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h)
@@ -686,6 +717,7 @@ func ICMPv6(checkers ...TransportChecker) NetworkChecker {
 func ICMPv6Type(want header.ICMPv6Type) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		icmpv6, ok := h.(header.ICMPv6)
 		if !ok {
 			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h)
@@ -700,6 +732,7 @@ func ICMPv6Type(want header.ICMPv6Type) TransportChecker {
 func ICMPv6Code(want byte) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
+
 		icmpv6, ok := h.(header.ICMPv6)
 		if !ok {
 			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h)
@@ -714,7 +747,7 @@ func ICMPv6Code(want byte) TransportChecker {
 // message for type of ty, with potentially additional checks specified by
 // checkers.
 //
-// checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
 // NDP message as far as the size of the message (minSize) is concerned. The
 // values within the message are up to checkers to validate.
 func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) NetworkChecker {
@@ -746,9 +779,9 @@ func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) N
 // Neighbor Solicitation message (as per the raw wire format), with potentially
 // additional checks specified by checkers.
 //
-// checkers may assume that a valid ICMPv6 is passed to it containing a valid
-// NDPNS message as far as the size of the messages concerned. The values within
-// the message are up to checkers to validate.
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDPNS message as far as the size of the message is concerned. The values
+// within the message are up to checkers to validate.
 func NDPNS(checkers ...TransportChecker) NetworkChecker {
 	return NDP(header.ICMPv6NeighborSolicit, header.NDPNSMinimumSize, checkers...)
 }
@@ -766,63 +799,162 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
 		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
 
 		if got := ns.TargetAddress(); got != want {
-			t.Fatalf("got %T.TargetAddress = %s, want = %s", ns, got, want)
+			t.Errorf("got %T.TargetAddress() = %s, want = %s", ns, got, want)
 		}
 	}
 }
 
-// NDPNSOptions creates a checker that checks that the packet contains the
-// provided NDP options within an NDP Neighbor Solicitation message.
+// NDPNA creates a checker that checks that the packet contains a valid NDP
+// Neighbor Advertisement message (as per the raw wire format), with potentially
+// additional checks specified by checkers.
+//
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDPNA message as far as the size of the message is concerned. The values
+// within the message are up to checkers to validate.
+func NDPNA(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6NeighborAdvert, header.NDPNAMinimumSize, checkers...)
+}
+
+// NDPNATargetAddress creates a checker that checks the Target Address field of
+// a header.NDPNeighborAdvert.
 //
 // The returned TransportChecker assumes that a valid ICMPv6 is passed to it
-// containing a valid NDPNS message as far as the size is concerned.
-func NDPNSOptions(opts []header.NDPOption) TransportChecker {
+// containing a valid NDPNA message as far as the size is concerned.
+func NDPNATargetAddress(want tcpip.Address) TransportChecker {
 	return func(t *testing.T, h header.Transport) {
 		t.Helper()
 
 		icmp := h.(header.ICMPv6)
-		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
-		it, err := ns.Options().Iter(true)
-		if err != nil {
-			t.Errorf("opts.Iter(true): %s", err)
-			return
+		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+
+		if got := na.TargetAddress(); got != want {
+			t.Errorf("got %T.TargetAddress() = %s, want = %s", na, got, want)
 		}
+	}
+}
 
-		i := 0
-		for {
-			opt, done, _ := it.Next()
-			if done {
-				break
-			}
+// NDPNASolicitedFlag creates a checker that checks the Solicited field of
+// a header.NDPNeighborAdvert.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNA message as far as the size is concerned.
+func NDPNASolicitedFlag(want bool) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
 
-			if i >= len(opts) {
-				t.Errorf("got unexpected option: %s", opt)
-				continue
-			}
+		icmp := h.(header.ICMPv6)
+		na := header.NDPNeighborAdvert(icmp.NDPPayload())
 
-			switch wantOpt := opts[i].(type) {
-			case header.NDPSourceLinkLayerAddressOption:
-				gotOpt, ok := opt.(header.NDPSourceLinkLayerAddressOption)
-				if !ok {
-					t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
-				} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
-					t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
-				}
-			default:
-				panic("not implemented")
-			}
+		if got := na.SolicitedFlag(); got != want {
+			t.Errorf("got %T.SolicitedFlag = %t, want = %t", na, got, want)
+		}
+	}
+}
+
+// ndpOptions checks that optsBuf only contains opts.
+func ndpOptions(t *testing.T, optsBuf header.NDPOptions, opts []header.NDPOption) {
+	t.Helper()
+
+	it, err := optsBuf.Iter(true)
+	if err != nil {
+		t.Errorf("optsBuf.Iter(true): %s", err)
+		return
+	}
 
-			i++
+	i := 0
+	for {
+		opt, done, err := it.Next()
+		if err != nil {
+			// This should never happen as Iter(true) above did not return an error.
+			t.Fatalf("unexpected error when iterating over NDP options: %s", err)
+		}
+		if done {
+			break
+		}
+
+		if i >= len(opts) {
+			t.Errorf("got unexpected option: %s", opt)
+			continue
 		}
 
-		if missing := opts[i:]; len(missing) > 0 {
-			t.Errorf("missing options: %s", missing)
+		switch wantOpt := opts[i].(type) {
+		case header.NDPSourceLinkLayerAddressOption:
+			gotOpt, ok := opt.(header.NDPSourceLinkLayerAddressOption)
+			if !ok {
+				t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
+			} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
+				t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
+			}
+		case header.NDPTargetLinkLayerAddressOption:
+			gotOpt, ok := opt.(header.NDPTargetLinkLayerAddressOption)
+			if !ok {
+				t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
+			} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
+				t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
+			}
+		default:
+			t.Fatalf("checker not implemented for expected NDP option: %T", wantOpt)
 		}
+
+		i++
+	}
+
+	if missing := opts[i:]; len(missing) > 0 {
+		t.Errorf("missing options: %s", missing)
+	}
+}
+
+// NDPNAOptions creates a checker that checks that the packet contains the
+// provided NDP options within an NDP Neighbor Solicitation message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNA message as far as the size is concerned.
+func NDPNAOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+		ndpOptions(t, na.Options(), opts)
+	}
+}
+
+// NDPNSOptions creates a checker that checks that the packet contains the
+// provided NDP options within an NDP Neighbor Solicitation message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNS message as far as the size is concerned.
+func NDPNSOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+		ndpOptions(t, ns.Options(), opts)
 	}
 }
 
 // NDPRS creates a checker that checks that the packet contains a valid NDP
 // Router Solicitation message (as per the raw wire format).
-func NDPRS() NetworkChecker {
-	return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize)
+//
+// Checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDPRS as far as the size of the message is concerned. The values within the
+// message are up to checkers to validate.
+func NDPRS(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize, checkers...)
+}
+
+// NDPRSOptions creates a checker that checks that the packet contains the
+// provided NDP options within an NDP Router Solicitation message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPRS message as far as the size is concerned.
+func NDPRSOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6)
+		rs := header.NDPRouterSolicit(icmp.NDPPayload())
+		ndpOptions(t, rs.Options(), opts)
+	}
 }
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index 9da0d71f8..0cde694dc 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -14,12 +14,14 @@ go_library(
         "interfaces.go",
         "ipv4.go",
         "ipv6.go",
+        "ipv6_extension_headers.go",
         "ipv6_fragment.go",
         "ndp_neighbor_advert.go",
         "ndp_neighbor_solicit.go",
         "ndp_options.go",
         "ndp_router_advert.go",
         "ndp_router_solicit.go",
+        "ndpoptionidentifier_string.go",
         "tcp.go",
         "udp.go",
     ],
@@ -55,11 +57,13 @@ go_test(
     size = "small",
     srcs = [
         "eth_test.go",
+        "ipv6_extension_headers_test.go",
         "ndp_test.go",
     ],
     library = ":header",
     deps = [
         "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
         "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go
index 7a0014ad9..14413f2ce 100644
--- a/pkg/tcpip/header/eth_test.go
+++ b/pkg/tcpip/header/eth_test.go
@@ -88,7 +88,7 @@ func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			if got := EthernetAddressFromMulticastIPv4Address(test.addr); got != test.expectedLinkAddr {
-				t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", got, test.expectedLinkAddr)
+				t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", test.addr, got, test.expectedLinkAddr)
 			}
 		})
 	}
diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go
index 0cac6c0a5..7908c5744 100644
--- a/pkg/tcpip/header/icmpv4.go
+++ b/pkg/tcpip/header/icmpv4.go
@@ -71,6 +71,7 @@ const (
 
 // Values for ICMP code as defined in RFC 792.
 const (
+	ICMPv4TTLExceeded         = 0
 	ICMPv4PortUnreachable     = 3
 	ICMPv4FragmentationNeeded = 4
 )
diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go
index e5360e7c1..62ac932bb 100644
--- a/pkg/tcpip/header/ipv4.go
+++ b/pkg/tcpip/header/ipv4.go
@@ -38,7 +38,8 @@ const (
 // IPv4Fields contains the fields of an IPv4 packet. It is used to describe the
 // fields of a packet that needs to be encoded.
 type IPv4Fields struct {
-	// IHL is the "internet header length" field of an IPv4 packet.
+	// IHL is the "internet header length" field of an IPv4 packet. The value
+	// is in bytes.
 	IHL uint8
 
 	// TOS is the "type of service" field of an IPv4 packet.
@@ -138,7 +139,7 @@ func IPVersion(b []byte) int {
 }
 
 // HeaderLength returns the value of the "header length" field of the ipv4
-// header.
+// header. The length returned is in bytes.
 func (b IPv4) HeaderLength() uint8 {
 	return (b[versIHL] & 0xf) * 4
 }
@@ -158,6 +159,11 @@ func (b IPv4) Flags() uint8 {
 	return uint8(binary.BigEndian.Uint16(b[flagsFO:]) >> 13)
 }
 
+// More returns whether the more fragments flag is set.
+func (b IPv4) More() bool {
+	return b.Flags()&IPv4FlagMoreFragments != 0
+}
+
 // TTL returns the "TTL" field of the ipv4 header.
 func (b IPv4) TTL() uint8 {
 	return b[ttl]
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 70e6ce095..4f367fe4c 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -17,6 +17,7 @@ package header
 import (
 	"crypto/sha256"
 	"encoding/binary"
+	"fmt"
 	"strings"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -27,7 +28,9 @@ const (
 	// IPv6PayloadLenOffset is the offset of the PayloadLength field in
 	// IPv6 header.
 	IPv6PayloadLenOffset = 4
-	nextHdr              = 6
+	// IPv6NextHeaderOffset is the offset of the NextHeader field in
+	// IPv6 header.
+	IPv6NextHeaderOffset = 6
 	hopLimit             = 7
 	v6SrcAddr            = 8
 	v6DstAddr            = v6SrcAddr + IPv6AddressSize
@@ -115,6 +118,19 @@ const (
 	// for the secret key used to generate an opaque interface identifier as
 	// outlined by RFC 7217.
 	OpaqueIIDSecretKeyMinBytes = 16
+
+	// ipv6MulticastAddressScopeByteIdx is the byte where the scope (scop) field
+	// is located within a multicast IPv6 address, as per RFC 4291 section 2.7.
+	ipv6MulticastAddressScopeByteIdx = 1
+
+	// ipv6MulticastAddressScopeMask is the mask for the scope (scop) field,
+	// within the byte holding the field, as per RFC 4291 section 2.7.
+	ipv6MulticastAddressScopeMask = 0xF
+
+	// ipv6LinkLocalMulticastScope is the value of the scope (scop) field within
+	// a multicast IPv6 address that indicates the address has link-local scope,
+	// as per RFC 4291 section 2.7.
+	ipv6LinkLocalMulticastScope = 2
 )
 
 // IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
@@ -150,7 +166,7 @@ func (b IPv6) HopLimit() uint8 {
 
 // NextHeader returns the value of the "next header" field of the ipv6 header.
 func (b IPv6) NextHeader() uint8 {
-	return b[nextHdr]
+	return b[IPv6NextHeaderOffset]
 }
 
 // TransportProtocol implements Network.TransportProtocol.
@@ -210,7 +226,7 @@ func (b IPv6) SetDestinationAddress(addr tcpip.Address) {
 
 // SetNextHeader sets the value of the "next header" field of the ipv6 header.
 func (b IPv6) SetNextHeader(v uint8) {
-	b[nextHdr] = v
+	b[IPv6NextHeaderOffset] = v
 }
 
 // SetChecksum implements Network.SetChecksum. Given that IPv6 doesn't have a
@@ -222,7 +238,7 @@ func (IPv6) SetChecksum(uint16) {
 func (b IPv6) Encode(i *IPv6Fields) {
 	b.SetTOS(i.TrafficClass, i.FlowLabel)
 	b.SetPayloadLength(i.PayloadLength)
-	b[nextHdr] = i.NextHeader
+	b[IPv6NextHeaderOffset] = i.NextHeader
 	b[hopLimit] = i.HopLimit
 	b.SetSourceAddress(i.SrcAddr)
 	b.SetDestinationAddress(i.DstAddr)
@@ -340,6 +356,12 @@ func IsV6LinkLocalAddress(addr tcpip.Address) bool {
 	return addr[0] == 0xfe && (addr[1]&0xc0) == 0x80
 }
 
+// IsV6LinkLocalMulticastAddress determines if the provided address is an IPv6
+// link-local multicast address.
+func IsV6LinkLocalMulticastAddress(addr tcpip.Address) bool {
+	return IsV6MulticastAddress(addr) && addr[ipv6MulticastAddressScopeByteIdx]&ipv6MulticastAddressScopeMask == ipv6LinkLocalMulticastScope
+}
+
 // IsV6UniqueLocalAddress determines if the provided address is an IPv6
 // unique-local address (within the prefix FC00::/7).
 func IsV6UniqueLocalAddress(addr tcpip.Address) bool {
@@ -411,6 +433,9 @@ func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, *tcpip.Error) {
 	}
 
 	switch {
+	case IsV6LinkLocalMulticastAddress(addr):
+		return LinkLocalScope, nil
+
 	case IsV6LinkLocalAddress(addr):
 		return LinkLocalScope, nil
 
@@ -421,3 +446,54 @@ func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, *tcpip.Error) {
 		return GlobalScope, nil
 	}
 }
+
+// InitialTempIID generates the initial temporary IID history value to generate
+// temporary SLAAC addresses with.
+//
+// Panics if initialTempIIDHistory is not at least IIDSize bytes.
+func InitialTempIID(initialTempIIDHistory []byte, seed []byte, nicID tcpip.NICID) {
+	h := sha256.New()
+	// h.Write never returns an error.
+	h.Write(seed)
+	var nicIDBuf [4]byte
+	binary.BigEndian.PutUint32(nicIDBuf[:], uint32(nicID))
+	h.Write(nicIDBuf[:])
+
+	var sumBuf [sha256.Size]byte
+	sum := h.Sum(sumBuf[:0])
+
+	if n := copy(initialTempIIDHistory, sum[sha256.Size-IIDSize:]); n != IIDSize {
+		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IIDSize))
+	}
+}
+
+// GenerateTempIPv6SLAACAddr generates a temporary SLAAC IPv6 address for an
+// associated stable/permanent SLAAC address.
+//
+// GenerateTempIPv6SLAACAddr will update the temporary IID history value to be
+// used when generating a new temporary IID.
+//
+// Panics if tempIIDHistory is not at least IIDSize bytes.
+func GenerateTempIPv6SLAACAddr(tempIIDHistory []byte, stableAddr tcpip.Address) tcpip.AddressWithPrefix {
+	addrBytes := []byte(stableAddr)
+	h := sha256.New()
+	h.Write(tempIIDHistory)
+	h.Write(addrBytes[IIDOffsetInIPv6Address:])
+	var sumBuf [sha256.Size]byte
+	sum := h.Sum(sumBuf[:0])
+
+	// The rightmost 64 bits of sum are saved for the next iteration.
+	if n := copy(tempIIDHistory, sum[sha256.Size-IIDSize:]); n != IIDSize {
+		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IIDSize))
+	}
+
+	// The leftmost 64 bits of sum is used as the IID.
+	if n := copy(addrBytes[IIDOffsetInIPv6Address:], sum); n != IIDSize {
+		panic(fmt.Sprintf("copied %d IID bytes, expected %d bytes", n, IIDSize))
+	}
+
+	return tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(addrBytes),
+		PrefixLen: IIDOffsetInIPv6Address * 8,
+	}
+}
diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
new file mode 100644
index 000000000..3499d8399
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -0,0 +1,551 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// IPv6ExtensionHeaderIdentifier is an IPv6 extension header identifier.
+type IPv6ExtensionHeaderIdentifier uint8
+
+const (
+	// IPv6HopByHopOptionsExtHdrIdentifier is the header identifier of a Hop by
+	// Hop Options extension header, as per RFC 8200 section 4.3.
+	IPv6HopByHopOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 0
+
+	// IPv6RoutingExtHdrIdentifier is the header identifier of a Routing extension
+	// header, as per RFC 8200 section 4.4.
+	IPv6RoutingExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 43
+
+	// IPv6FragmentExtHdrIdentifier is the header identifier of a Fragment
+	// extension header, as per RFC 8200 section 4.5.
+	IPv6FragmentExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 44
+
+	// IPv6DestinationOptionsExtHdrIdentifier is the header identifier of a
+	// Destination Options extension header, as per RFC 8200 section 4.6.
+	IPv6DestinationOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 60
+
+	// IPv6NoNextHeaderIdentifier is the header identifier used to signify the end
+	// of an IPv6 payload, as per RFC 8200 section 4.7.
+	IPv6NoNextHeaderIdentifier IPv6ExtensionHeaderIdentifier = 59
+)
+
+const (
+	// ipv6UnknownExtHdrOptionActionMask is the mask of the action to take when
+	// a node encounters an unrecognized option.
+	ipv6UnknownExtHdrOptionActionMask = 192
+
+	// ipv6UnknownExtHdrOptionActionShift is the least significant bits to discard
+	// from the action value for an unrecognized option identifier.
+	ipv6UnknownExtHdrOptionActionShift = 6
+
+	// ipv6RoutingExtHdrSegmentsLeftIdx is the index to the Segments Left field
+	// within an IPv6RoutingExtHdr.
+	ipv6RoutingExtHdrSegmentsLeftIdx = 1
+
+	// IPv6FragmentExtHdrLength is the length of an IPv6 extension header, in
+	// bytes.
+	IPv6FragmentExtHdrLength = 8
+
+	// ipv6FragmentExtHdrFragmentOffsetOffset is the offset to the start of the
+	// Fragment Offset field within an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrFragmentOffsetOffset = 0
+
+	// ipv6FragmentExtHdrFragmentOffsetShift is the least significant bits to
+	// discard from the Fragment Offset.
+	ipv6FragmentExtHdrFragmentOffsetShift = 3
+
+	// ipv6FragmentExtHdrFlagsIdx is the index to the flags field within an
+	// IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrFlagsIdx = 1
+
+	// ipv6FragmentExtHdrMFlagMask is the mask of the More (M) flag within the
+	// flags field of an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrMFlagMask = 1
+
+	// ipv6FragmentExtHdrIdentificationOffset is the offset to the Identification
+	// field within an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrIdentificationOffset = 2
+
+	// ipv6ExtHdrLenBytesPerUnit is the unit size of an extension header's length
+	// field. That is, given a Length field of 2, the extension header expects
+	// 16 bytes following the first 8 bytes (see ipv6ExtHdrLenBytesExcluded for
+	// details about the first 8 bytes' exclusion from the Length field).
+	ipv6ExtHdrLenBytesPerUnit = 8
+
+	// ipv6ExtHdrLenBytesExcluded is the number of bytes excluded from an
+	// extension header's Length field following the Length field.
+	//
+	// The Length field excludes the first 8 bytes, but the Next Header and Length
+	// field take up the first 2 of the 8 bytes so we expect (at minimum) 6 bytes
+	// after the Length field.
+	//
+	// This ensures that every extension header is at least 8 bytes.
+	ipv6ExtHdrLenBytesExcluded = 6
+
+	// IPv6FragmentExtHdrFragmentOffsetBytesPerUnit is the unit size of a Fragment
+	// extension header's Fragment Offset field. That is, given a Fragment Offset
+	// of 2, the extension header is indiciating that the fragment's payload
+	// starts at the 16th byte in the reassembled packet.
+	IPv6FragmentExtHdrFragmentOffsetBytesPerUnit = 8
+)
+
+// IPv6PayloadHeader is implemented by the various headers that can be found
+// in an IPv6 payload.
+//
+// These headers include IPv6 extension headers or upper layer data.
+type IPv6PayloadHeader interface {
+	isIPv6PayloadHeader()
+}
+
+// IPv6RawPayloadHeader the remainder of an IPv6 payload after an iterator
+// encounters a Next Header field it does not recognize as an IPv6 extension
+// header.
+type IPv6RawPayloadHeader struct {
+	Identifier IPv6ExtensionHeaderIdentifier
+	Buf        buffer.VectorisedView
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6RawPayloadHeader) isIPv6PayloadHeader() {}
+
+// ipv6OptionsExtHdr is an IPv6 extension header that holds options.
+type ipv6OptionsExtHdr []byte
+
+// Iter returns an iterator over the IPv6 extension header options held in b.
+func (b ipv6OptionsExtHdr) Iter() IPv6OptionsExtHdrOptionsIterator {
+	it := IPv6OptionsExtHdrOptionsIterator{}
+	it.reader.Reset(b)
+	return it
+}
+
+// IPv6OptionsExtHdrOptionsIterator is an iterator over IPv6 extension header
+// options.
+//
+// Note, between when an IPv6OptionsExtHdrOptionsIterator is obtained and last
+// used, no changes to the underlying buffer may happen. Doing so may cause
+// undefined and unexpected behaviour. It is fine to obtain an
+// IPv6OptionsExtHdrOptionsIterator, iterate over the first few options then
+// modify the backing payload so long as the IPv6OptionsExtHdrOptionsIterator
+// obtained before modification is no longer used.
+type IPv6OptionsExtHdrOptionsIterator struct {
+	reader bytes.Reader
+}
+
+// IPv6OptionUnknownAction is the action that must be taken if the processing
+// IPv6 node does not recognize the option, as outlined in RFC 8200 section 4.2.
+type IPv6OptionUnknownAction int
+
+const (
+	// IPv6OptionUnknownActionSkip indicates that the unrecognized option must
+	// be skipped and the node should continue processing the header.
+	IPv6OptionUnknownActionSkip IPv6OptionUnknownAction = 0
+
+	// IPv6OptionUnknownActionDiscard indicates that the packet must be silently
+	// discarded.
+	IPv6OptionUnknownActionDiscard IPv6OptionUnknownAction = 1
+
+	// IPv6OptionUnknownActionDiscardSendICMP indicates that the packet must be
+	// discarded and the node must send an ICMP Parameter Problem, Code 2, message
+	// to the packet's source, regardless of whether or not the packet's
+	// Destination was a multicast address.
+	IPv6OptionUnknownActionDiscardSendICMP IPv6OptionUnknownAction = 2
+
+	// IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest indicates that the
+	// packet must be discarded and the node must send an ICMP Parameter Problem,
+	// Code 2, message to the packet's source only if the packet's Destination was
+	// not a multicast address.
+	IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest IPv6OptionUnknownAction = 3
+)
+
+// IPv6ExtHdrOption is implemented by the various IPv6 extension header options.
+type IPv6ExtHdrOption interface {
+	// UnknownAction returns the action to take in response to an unrecognized
+	// option.
+	UnknownAction() IPv6OptionUnknownAction
+
+	// isIPv6ExtHdrOption is used to "lock" this interface so it is not
+	// implemented by other packages.
+	isIPv6ExtHdrOption()
+}
+
+// IPv6ExtHdrOptionIndentifier is an IPv6 extension header option identifier.
+type IPv6ExtHdrOptionIndentifier uint8
+
+const (
+	// ipv6Pad1ExtHdrOptionIdentifier is the identifier for a padding option that
+	// provides 1 byte padding, as outlined in RFC 8200 section 4.2.
+	ipv6Pad1ExtHdrOptionIdentifier IPv6ExtHdrOptionIndentifier = 0
+
+	// ipv6PadBExtHdrOptionIdentifier is the identifier for a padding option that
+	// provides variable length byte padding, as outlined in RFC 8200 section 4.2.
+	ipv6PadNExtHdrOptionIdentifier IPv6ExtHdrOptionIndentifier = 1
+)
+
+// IPv6UnknownExtHdrOption holds the identifier and data for an IPv6 extension
+// header option that is unknown by the parsing utilities.
+type IPv6UnknownExtHdrOption struct {
+	Identifier IPv6ExtHdrOptionIndentifier
+	Data       []byte
+}
+
+// UnknownAction implements IPv6OptionUnknownAction.UnknownAction.
+func (o *IPv6UnknownExtHdrOption) UnknownAction() IPv6OptionUnknownAction {
+	return IPv6OptionUnknownAction((o.Identifier & ipv6UnknownExtHdrOptionActionMask) >> ipv6UnknownExtHdrOptionActionShift)
+}
+
+// isIPv6ExtHdrOption implements IPv6ExtHdrOption.isIPv6ExtHdrOption.
+func (*IPv6UnknownExtHdrOption) isIPv6ExtHdrOption() {}
+
+// Next returns the next option in the options data.
+//
+// If the next item is not a known extension header option,
+// IPv6UnknownExtHdrOption will be returned with the option identifier and data.
+//
+// The return is of the format (option, done, error). done will be true when
+// Next is unable to return anything because the iterator has reached the end of
+// the options data, or an error occured.
+func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error) {
+	for {
+		temp, err := i.reader.ReadByte()
+		if err != nil {
+			// If we can't read the first byte of a new option, then we know the
+			// options buffer has been exhausted and we are done iterating.
+			return nil, true, nil
+		}
+		id := IPv6ExtHdrOptionIndentifier(temp)
+
+		// If the option identifier indicates the option is a Pad1 option, then we
+		// know the option does not have Length and Data fields. End processing of
+		// the Pad1 option and continue processing the buffer as a new option.
+		if id == ipv6Pad1ExtHdrOptionIdentifier {
+			continue
+		}
+
+		length, err := i.reader.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				// ReadByte should only ever return nil or io.EOF.
+				panic(fmt.Sprintf("unexpected error when reading the option's Length field for option with id = %d: %s", id, err))
+			}
+
+			// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once
+			// we start parsing an option; we expect the reader to contain enough
+			// bytes for the whole option.
+			return nil, true, fmt.Errorf("error when reading the option's Length field for option with id = %d: %w", id, io.ErrUnexpectedEOF)
+		}
+
+		// Special-case the variable length padding option to avoid a copy.
+		if id == ipv6PadNExtHdrOptionIdentifier {
+			// Do we have enough bytes in the reader for the PadN option?
+			if n := i.reader.Len(); n < int(length) {
+				// Reset the reader to effectively consume the remaining buffer.
+				i.reader.Reset(nil)
+
+				// We return the same error as if we failed to read a non-padding option
+				// so consumers of this iterator don't need to differentiate between
+				// padding and non-padding options.
+				return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF)
+			}
+
+			if _, err := i.reader.Seek(int64(length), io.SeekCurrent); err != nil {
+				panic(fmt.Sprintf("error when skipping PadN (N = %d) option's data bytes: %s", length, err))
+			}
+
+			// End processing of the PadN option and continue processing the buffer as
+			// a new option.
+			continue
+		}
+
+		bytes := make([]byte, length)
+		if n, err := io.ReadFull(&i.reader, bytes); err != nil {
+			// io.ReadFull may return io.EOF if i.reader has been exhausted. We use
+			// io.ErrUnexpectedEOF instead as the io.EOF is unexpected given the
+			// Length field found in the option.
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+
+			return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err)
+		}
+
+		return &IPv6UnknownExtHdrOption{Identifier: id, Data: bytes}, false, nil
+	}
+}
+
+// IPv6HopByHopOptionsExtHdr is a buffer holding the Hop By Hop Options
+// extension header.
+type IPv6HopByHopOptionsExtHdr struct {
+	ipv6OptionsExtHdr
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6HopByHopOptionsExtHdr) isIPv6PayloadHeader() {}
+
+// IPv6DestinationOptionsExtHdr is a buffer holding the Destination Options
+// extension header.
+type IPv6DestinationOptionsExtHdr struct {
+	ipv6OptionsExtHdr
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6DestinationOptionsExtHdr) isIPv6PayloadHeader() {}
+
+// IPv6RoutingExtHdr is a buffer holding the Routing extension header specific
+// data as outlined in RFC 8200 section 4.4.
+type IPv6RoutingExtHdr []byte
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6RoutingExtHdr) isIPv6PayloadHeader() {}
+
+// SegmentsLeft returns the Segments Left field.
+func (b IPv6RoutingExtHdr) SegmentsLeft() uint8 {
+	return b[ipv6RoutingExtHdrSegmentsLeftIdx]
+}
+
+// IPv6FragmentExtHdr is a buffer holding the Fragment extension header specific
+// data as outlined in RFC 8200 section 4.5.
+//
+// Note, the buffer does not include the Next Header and Reserved fields.
+type IPv6FragmentExtHdr [6]byte
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6FragmentExtHdr) isIPv6PayloadHeader() {}
+
+// FragmentOffset returns the Fragment Offset field.
+//
+// This value indicates where the buffer following the Fragment extension header
+// starts in the target (reassembled) packet.
+func (b IPv6FragmentExtHdr) FragmentOffset() uint16 {
+	return binary.BigEndian.Uint16(b[ipv6FragmentExtHdrFragmentOffsetOffset:]) >> ipv6FragmentExtHdrFragmentOffsetShift
+}
+
+// More returns the More (M) flag.
+//
+// This indicates whether any fragments are expected to succeed b.
+func (b IPv6FragmentExtHdr) More() bool {
+	return b[ipv6FragmentExtHdrFlagsIdx]&ipv6FragmentExtHdrMFlagMask != 0
+}
+
+// ID returns the Identification field.
+//
+// This value is used to uniquely identify the packet, between a
+// souce and destination.
+func (b IPv6FragmentExtHdr) ID() uint32 {
+	return binary.BigEndian.Uint32(b[ipv6FragmentExtHdrIdentificationOffset:])
+}
+
+// IsAtomic returns whether the fragment header indicates an atomic fragment. An
+// atomic fragment is a fragment that contains all the data required to
+// reassemble a full packet.
+func (b IPv6FragmentExtHdr) IsAtomic() bool {
+	return !b.More() && b.FragmentOffset() == 0
+}
+
+// IPv6PayloadIterator is an iterator over the contents of an IPv6 payload.
+//
+// The IPv6 payload may contain IPv6 extension headers before any upper layer
+// data.
+//
+// Note, between when an IPv6PayloadIterator is obtained and last used, no
+// changes to the payload may happen. Doing so may cause undefined and
+// unexpected behaviour. It is fine to obtain an IPv6PayloadIterator, iterate
+// over the first few headers then modify the backing payload so long as the
+// IPv6PayloadIterator obtained before modification is no longer used.
+type IPv6PayloadIterator struct {
+	// The identifier of the next header to parse.
+	nextHdrIdentifier IPv6ExtensionHeaderIdentifier
+
+	// reader is an io.Reader over payload.
+	reader  bufio.Reader
+	payload buffer.VectorisedView
+
+	// Indicates to the iterator that it should return the remaining payload as a
+	// raw payload on the next call to Next.
+	forceRaw bool
+}
+
+// MakeIPv6PayloadIterator returns an iterator over the IPv6 payload containing
+// extension headers, or a raw payload if the payload cannot be parsed.
+func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, payload buffer.VectorisedView) IPv6PayloadIterator {
+	readers := payload.Readers()
+	readerPs := make([]io.Reader, 0, len(readers))
+	for i := range readers {
+		readerPs = append(readerPs, &readers[i])
+	}
+
+	return IPv6PayloadIterator{
+		nextHdrIdentifier: nextHdrIdentifier,
+		payload:           payload.Clone(nil),
+		// We need a buffer of size 1 for calls to bufio.Reader.ReadByte.
+		reader: *bufio.NewReaderSize(io.MultiReader(readerPs...), 1),
+	}
+}
+
+// AsRawHeader returns the remaining payload of i as a raw header and
+// optionally consumes the iterator.
+//
+// If consume is true, calls to Next after calling AsRawHeader on i will
+// indicate that the iterator is done.
+func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader {
+	identifier := i.nextHdrIdentifier
+
+	var buf buffer.VectorisedView
+	if consume {
+		// Since we consume the iterator, we return the payload as is.
+		buf = i.payload
+
+		// Mark i as done.
+		*i = IPv6PayloadIterator{
+			nextHdrIdentifier: IPv6NoNextHeaderIdentifier,
+		}
+	} else {
+		buf = i.payload.Clone(nil)
+	}
+
+	return IPv6RawPayloadHeader{Identifier: identifier, Buf: buf}
+}
+
+// Next returns the next item in the payload.
+//
+// If the next item is not a known IPv6 extension header, IPv6RawPayloadHeader
+// will be returned with the remaining bytes and next header identifier.
+//
+// The return is of the format (header, done, error). done will be true when
+// Next is unable to return anything because the iterator has reached the end of
+// the payload, or an error occured.
+func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
+	// We could be forced to return i as a raw header when the previous header was
+	// a fragment extension header as the data following the fragment extension
+	// header may not be complete.
+	if i.forceRaw {
+		return i.AsRawHeader(true /* consume */), false, nil
+	}
+
+	// Is the header we are parsing a known extension header?
+	switch i.nextHdrIdentifier {
+	case IPv6HopByHopOptionsExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: bytes}, false, nil
+	case IPv6RoutingExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6RoutingExtHdr(bytes), false, nil
+	case IPv6FragmentExtHdrIdentifier:
+		var data [6]byte
+		// We ignore the returned bytes becauase we know the fragment extension
+		// header specific data will fit in data.
+		nextHdrIdentifier, _, err := i.nextHeaderData(true /* fragmentHdr */, data[:])
+		if err != nil {
+			return nil, true, err
+		}
+
+		fragmentExtHdr := IPv6FragmentExtHdr(data)
+
+		// If the packet is not the first fragment, do not attempt to parse anything
+		// after the fragment extension header as the payload following the fragment
+		// extension header should not contain any headers; the first fragment must
+		// hold all the headers up to and including any upper layer headers, as per
+		// RFC 8200 section 4.5.
+		if fragmentExtHdr.FragmentOffset() != 0 {
+			i.forceRaw = true
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return fragmentExtHdr, false, nil
+	case IPv6DestinationOptionsExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: bytes}, false, nil
+	case IPv6NoNextHeaderIdentifier:
+		// This indicates the end of the IPv6 payload.
+		return nil, true, nil
+
+	default:
+		// The header we are parsing is not a known extension header. Return the
+		// raw payload.
+		return i.AsRawHeader(true /* consume */), false, nil
+	}
+}
+
+// nextHeaderData returns the extension header's Next Header field and raw data.
+//
+// fragmentHdr indicates that the extension header being parsed is the Fragment
+// extension header so the Length field should be ignored as it is Reserved
+// for the Fragment extension header.
+//
+// If bytes is not nil, extension header specific data will be read into bytes
+// if it has enough capacity. If bytes is provided but does not have enough
+// capacity for the data, nextHeaderData will panic.
+func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IPv6ExtensionHeaderIdentifier, []byte, error) {
+	// We ignore the number of bytes read because we know we will only ever read
+	// at max 1 bytes since rune has a length of 1. If we read 0 bytes, the Read
+	// would return io.EOF to indicate that io.Reader has reached the end of the
+	// payload.
+	nextHdrIdentifier, err := i.reader.ReadByte()
+	i.payload.TrimFront(1)
+	if err != nil {
+		return 0, nil, fmt.Errorf("error when reading the Next Header field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+	}
+
+	var length uint8
+	length, err = i.reader.ReadByte()
+	i.payload.TrimFront(1)
+	if err != nil {
+		if fragmentHdr {
+			return 0, nil, fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+		}
+
+		return 0, nil, fmt.Errorf("error when reading the Reserved field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+	}
+	if fragmentHdr {
+		length = 0
+	}
+
+	bytesLen := int(length)*ipv6ExtHdrLenBytesPerUnit + ipv6ExtHdrLenBytesExcluded
+	if bytes == nil {
+		bytes = make([]byte, bytesLen)
+	} else if n := len(bytes); n < bytesLen {
+		panic(fmt.Sprintf("bytes only has space for %d bytes but need space for %d bytes (length = %d) for extension header with id = %d", n, bytesLen, length, i.nextHdrIdentifier))
+	}
+
+	n, err := io.ReadFull(&i.reader, bytes)
+	i.payload.TrimFront(n)
+	if err != nil {
+		return 0, nil, fmt.Errorf("read %d out of %d extension header data bytes (length = %d) for header with id = %d: %w", n, bytesLen, length, i.nextHdrIdentifier, err)
+	}
+
+	return IPv6ExtensionHeaderIdentifier(nextHdrIdentifier), bytes, nil
+}
diff --git a/pkg/tcpip/header/ipv6_extension_headers_test.go b/pkg/tcpip/header/ipv6_extension_headers_test.go
new file mode 100644
index 000000000..ab20c5f37
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_extension_headers_test.go
@@ -0,0 +1,992 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// Equal returns true of a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold the same Identifier value and
+// contain the same bytes in Buf, even if the bytes are split across views
+// differently.
+//
+// Needed to use cmp.Equal on IPv6RawPayloadHeader as it contains unexported
+// fields.
+func (a IPv6RawPayloadHeader) Equal(b IPv6RawPayloadHeader) bool {
+	return a.Identifier == b.Identifier && bytes.Equal(a.Buf.ToView(), b.Buf.ToView())
+}
+
+// Equal returns true of a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold equivalent ipv6OptionsExtHdrs.
+//
+// Needed to use cmp.Equal on IPv6RawPayloadHeader as it contains unexported
+// fields.
+func (a IPv6HopByHopOptionsExtHdr) Equal(b IPv6HopByHopOptionsExtHdr) bool {
+	return bytes.Equal(a.ipv6OptionsExtHdr, b.ipv6OptionsExtHdr)
+}
+
+// Equal returns true of a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold equivalent ipv6OptionsExtHdrs.
+//
+// Needed to use cmp.Equal on IPv6RawPayloadHeader as it contains unexported
+// fields.
+func (a IPv6DestinationOptionsExtHdr) Equal(b IPv6DestinationOptionsExtHdr) bool {
+	return bytes.Equal(a.ipv6OptionsExtHdr, b.ipv6OptionsExtHdr)
+}
+
+func TestIPv6UnknownExtHdrOption(t *testing.T) {
+	tests := []struct {
+		name                  string
+		identifier            IPv6ExtHdrOptionIndentifier
+		expectedUnknownAction IPv6OptionUnknownAction
+	}{
+		{
+			name:                  "Skip with zero LSBs",
+			identifier:            0,
+			expectedUnknownAction: IPv6OptionUnknownActionSkip,
+		},
+		{
+			name:                  "Discard with zero LSBs",
+			identifier:            64,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscard,
+		},
+		{
+			name:                  "Discard and ICMP with zero LSBs",
+			identifier:            128,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMP,
+		},
+		{
+			name:                  "Discard and ICMP for non multicast destination with zero LSBs",
+			identifier:            192,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest,
+		},
+		{
+			name:                  "Skip with non-zero LSBs",
+			identifier:            63,
+			expectedUnknownAction: IPv6OptionUnknownActionSkip,
+		},
+		{
+			name:                  "Discard with non-zero LSBs",
+			identifier:            127,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscard,
+		},
+		{
+			name:                  "Discard and ICMP with non-zero LSBs",
+			identifier:            191,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMP,
+		},
+		{
+			name:                  "Discard and ICMP for non multicast destination with non-zero LSBs",
+			identifier:            255,
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opt := &IPv6UnknownExtHdrOption{Identifier: test.identifier, Data: []byte{1, 2, 3, 4}}
+			if a := opt.UnknownAction(); a != test.expectedUnknownAction {
+				t.Fatalf("got UnknownAction() = %d, want = %d", a, test.expectedUnknownAction)
+			}
+		})
+	}
+
+}
+
+func TestIPv6OptionsExtHdrIterErr(t *testing.T) {
+	tests := []struct {
+		name  string
+		bytes []byte
+		err   error
+	}{
+		{
+			name:  "Single unknown with zero length",
+			bytes: []byte{255, 0},
+		},
+		{
+			name:  "Single unknown with non-zero length",
+			bytes: []byte{255, 3, 1, 2, 3},
+		},
+		{
+			name: "Two options",
+			bytes: []byte{
+				255, 0,
+				254, 1, 1,
+			},
+		},
+		{
+			name: "Three options",
+			bytes: []byte{
+				255, 0,
+				254, 1, 1,
+				253, 4, 2, 3, 4, 5,
+			},
+		},
+		{
+			name:  "Single unknown only identifier",
+			bytes: []byte{255},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Single unknown too small with length = 1",
+			bytes: []byte{255, 1},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Single unknown too small with length = 2",
+			bytes: []byte{255, 2, 1},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown only identifier",
+			bytes: []byte{
+				255, 0,
+				254,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown missing data",
+			bytes: []byte{
+				255, 0,
+				254, 1,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown too small",
+			bytes: []byte{
+				255, 0,
+				254, 2, 1,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "One Pad1",
+			bytes: []byte{0},
+		},
+		{
+			name:  "Multiple Pad1",
+			bytes: []byte{0, 0, 0},
+		},
+		{
+			name: "Multiple PadN",
+			bytes: []byte{
+				// Pad3
+				1, 1, 1,
+
+				// Pad5
+				1, 3, 1, 2, 3,
+			},
+		},
+		{
+			name:  "Pad5 too small middle of data buffer",
+			bytes: []byte{1, 3, 1, 2},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Pad5 no data",
+			bytes: []byte{1, 3},
+			err:   io.ErrUnexpectedEOF,
+		},
+	}
+
+	check := func(t *testing.T, it IPv6OptionsExtHdrOptionsIterator, expectedErr error) {
+		for i := 0; ; i++ {
+			_, done, err := it.Next()
+			if err != nil {
+				// If we encountered a non-nil error while iterating, make sure it is
+				// is the same error as expectedErr.
+				if !errors.Is(err, expectedErr) {
+					t.Fatalf("got %d-th Next() = %v, want = %v", i, err, expectedErr)
+				}
+
+				return
+			}
+			if done {
+				// If we are done (without an error), make sure that we did not expect
+				// an error.
+				if expectedErr != nil {
+					t.Fatalf("expected error when iterating; want = %s", expectedErr)
+				}
+
+				return
+			}
+		}
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Run("Hop By Hop", func(t *testing.T) {
+				extHdr := IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				check(t, extHdr.Iter(), test.err)
+			})
+
+			t.Run("Destination", func(t *testing.T) {
+				extHdr := IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				check(t, extHdr.Iter(), test.err)
+			})
+		})
+	}
+}
+
+func TestIPv6OptionsExtHdrIter(t *testing.T) {
+	tests := []struct {
+		name     string
+		bytes    []byte
+		expected []IPv6ExtHdrOption
+	}{
+		{
+			name:  "Single unknown with zero length",
+			bytes: []byte{255, 0},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{}},
+			},
+		},
+		{
+			name:  "Single unknown with non-zero length",
+			bytes: []byte{255, 3, 1, 2, 3},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{1, 2, 3}},
+			},
+		},
+		{
+			name:  "Single Pad1",
+			bytes: []byte{0},
+		},
+		{
+			name:  "Two Pad1",
+			bytes: []byte{0, 0},
+		},
+		{
+			name:  "Single Pad3",
+			bytes: []byte{1, 1, 1},
+		},
+		{
+			name:  "Single Pad5",
+			bytes: []byte{1, 3, 1, 2, 3},
+		},
+		{
+			name: "Multiple Pad",
+			bytes: []byte{
+				// Pad1
+				0,
+
+				// Pad2
+				1, 0,
+
+				// Pad3
+				1, 1, 1,
+
+				// Pad4
+				1, 2, 1, 2,
+
+				// Pad5
+				1, 3, 1, 2, 3,
+			},
+		},
+		{
+			name: "Multiple options",
+			bytes: []byte{
+				// Pad1
+				0,
+
+				// Unknown
+				255, 0,
+
+				// Pad2
+				1, 0,
+
+				// Unknown
+				254, 1, 1,
+
+				// Pad3
+				1, 1, 1,
+
+				// Unknown
+				253, 4, 2, 3, 4, 5,
+
+				// Pad4
+				1, 2, 1, 2,
+			},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{}},
+				&IPv6UnknownExtHdrOption{Identifier: 254, Data: []byte{1}},
+				&IPv6UnknownExtHdrOption{Identifier: 253, Data: []byte{2, 3, 4, 5}},
+			},
+		},
+	}
+
+	checkIter := func(t *testing.T, it IPv6OptionsExtHdrOptionsIterator, expected []IPv6ExtHdrOption) {
+		for i, e := range expected {
+			opt, done, err := it.Next()
+			if err != nil {
+				t.Errorf("(i=%d) Next(): %s", i, err)
+			}
+			if done {
+				t.Errorf("(i=%d) unexpectedly done iterating", i)
+			}
+			if diff := cmp.Diff(e, opt); diff != "" {
+				t.Errorf("(i=%d) got option mismatch (-want +got):\n%s", i, diff)
+			}
+
+			if t.Failed() {
+				t.FailNow()
+			}
+		}
+
+		opt, done, err := it.Next()
+		if err != nil {
+			t.Errorf("(last) Next(): %s", err)
+		}
+		if !done {
+			t.Errorf("(last) iterator unexpectedly not done")
+		}
+		if opt != nil {
+			t.Errorf("(last) got Next() = %T, want = nil", opt)
+		}
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Run("Hop By Hop", func(t *testing.T) {
+				extHdr := IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				checkIter(t, extHdr.Iter(), test.expected)
+			})
+
+			t.Run("Destination", func(t *testing.T) {
+				extHdr := IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				checkIter(t, extHdr.Iter(), test.expected)
+			})
+		})
+	}
+}
+
+func TestIPv6RoutingExtHdr(t *testing.T) {
+	tests := []struct {
+		name         string
+		bytes        []byte
+		segmentsLeft uint8
+	}{
+		{
+			name:         "Zeroes",
+			bytes:        []byte{0, 0, 0, 0, 0, 0},
+			segmentsLeft: 0,
+		},
+		{
+			name:         "Ones",
+			bytes:        []byte{1, 1, 1, 1, 1, 1},
+			segmentsLeft: 1,
+		},
+		{
+			name:         "Mixed",
+			bytes:        []byte{1, 2, 3, 4, 5, 6},
+			segmentsLeft: 2,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			extHdr := IPv6RoutingExtHdr(test.bytes)
+			if got := extHdr.SegmentsLeft(); got != test.segmentsLeft {
+				t.Errorf("got SegmentsLeft() = %d, want = %d", got, test.segmentsLeft)
+			}
+		})
+	}
+}
+
+func TestIPv6FragmentExtHdr(t *testing.T) {
+	tests := []struct {
+		name           string
+		bytes          [6]byte
+		fragmentOffset uint16
+		more           bool
+		id             uint32
+	}{
+		{
+			name:           "Zeroes",
+			bytes:          [6]byte{0, 0, 0, 0, 0, 0},
+			fragmentOffset: 0,
+			more:           false,
+			id:             0,
+		},
+		{
+			name:           "Ones",
+			bytes:          [6]byte{0, 9, 0, 0, 0, 1},
+			fragmentOffset: 1,
+			more:           true,
+			id:             1,
+		},
+		{
+			name:           "Mixed",
+			bytes:          [6]byte{68, 9, 128, 4, 2, 1},
+			fragmentOffset: 2177,
+			more:           true,
+			id:             2147746305,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			extHdr := IPv6FragmentExtHdr(test.bytes)
+			if got := extHdr.FragmentOffset(); got != test.fragmentOffset {
+				t.Errorf("got FragmentOffset() = %d, want = %d", got, test.fragmentOffset)
+			}
+			if got := extHdr.More(); got != test.more {
+				t.Errorf("got More() = %t, want = %t", got, test.more)
+			}
+			if got := extHdr.ID(); got != test.id {
+				t.Errorf("got ID() = %d, want = %d", got, test.id)
+			}
+		})
+	}
+}
+
+func makeVectorisedViewFromByteBuffers(bs ...[]byte) buffer.VectorisedView {
+	size := 0
+	var vs []buffer.View
+
+	for _, b := range bs {
+		vs = append(vs, buffer.View(b))
+		size += len(b)
+	}
+
+	return buffer.NewVectorisedView(size, vs)
+}
+
+func TestIPv6ExtHdrIterErr(t *testing.T) {
+	tests := []struct {
+		name         string
+		firstNextHdr IPv6ExtensionHeaderIdentifier
+		payload      buffer.VectorisedView
+		err          error
+	}{
+		{
+			name:         "Upper layer only without data",
+			firstNextHdr: 255,
+		},
+		{
+			name:         "Upper layer only with data",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{1, 2, 3, 4}),
+		},
+		{
+			name:         "No next header",
+			firstNextHdr: IPv6NoNextHeaderIdentifier,
+		},
+		{
+			name:         "No next header with data",
+			firstNextHdr: IPv6NoNextHeaderIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{1, 2, 3, 4}),
+		},
+		{
+			name:         "Valid single hop by hop",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3, 4}),
+		},
+		{
+			name:         "Hop by hop too small",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid single fragment",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 68, 9, 128, 4, 2, 1}),
+		},
+		{
+			name:         "Fragment too small",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 68, 9, 128, 4, 2}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid single destination",
+			firstNextHdr: IPv6DestinationOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3, 4}),
+		},
+		{
+			name:         "Destination too small",
+			firstNextHdr: IPv6DestinationOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid single routing",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2, 3, 4, 5, 6}),
+		},
+		{
+			name:         "Valid single routing across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2}, []byte{3, 4, 5, 6}),
+		},
+		{
+			name:         "Routing too small with zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2, 3, 4, 5}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid routing with non-zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8}),
+		},
+		{
+			name:         "Valid routing with non-zero length field across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6}, []byte{1, 2, 3, 4, 5, 6, 7, 8}),
+		},
+		{
+			name:         "Routing too small with non-zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Routing too small with non-zero length field across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6}, []byte{1, 2, 3, 4, 5, 6, 7}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Mixed",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3, 4,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+		},
+		{
+			name:         "Mixed without upper layer data",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3, 4,
+			}),
+		},
+		{
+			name:         "Mixed without upper layer data but last ext hdr too small",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3,
+			}),
+			err: io.ErrUnexpectedEOF,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			it := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload)
+
+			for i := 0; ; i++ {
+				_, done, err := it.Next()
+				if err != nil {
+					// If we encountered a non-nil error while iterating, make sure it is
+					// is the same error as test.err.
+					if !errors.Is(err, test.err) {
+						t.Fatalf("got %d-th Next() = %v, want = %v", i, err, test.err)
+					}
+
+					return
+				}
+				if done {
+					// If we are done (without an error), make sure that we did not expect
+					// an error.
+					if test.err != nil {
+						t.Fatalf("expected error when iterating; want = %s", test.err)
+					}
+
+					return
+				}
+			}
+		})
+	}
+}
+
+func TestIPv6ExtHdrIter(t *testing.T) {
+	routingExtHdrWithUpperLayerData := buffer.View([]byte{255, 0, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4})
+	upperLayerData := buffer.View([]byte{1, 2, 3, 4})
+	tests := []struct {
+		name         string
+		firstNextHdr IPv6ExtensionHeaderIdentifier
+		payload      buffer.VectorisedView
+		expected     []IPv6PayloadHeader
+	}{
+		// With a non-atomic fragment that is not the first fragment, the payload
+		// after the fragment will not be parsed because the payload is expected to
+		// only hold upper layer data.
+		{
+			name:         "hopbyhop - fragment (not first) - routing - upper",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Fragment extension header.
+				//
+				// More = 1, Fragment Offset = 2117, ID = 2147746305
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
+
+				// Routing extension header.
+				//
+				// Even though we have a routing ext header here, it should be
+				// be interpretted as raw bytes as only the first fragment is expected
+				// to hold headers.
+				255, 0, 1, 2, 3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6FragmentExtHdr([6]byte{68, 9, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6RoutingExtHdrIdentifier,
+					Buf:        routingExtHdrWithUpperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+		{
+			name:         "hopbyhop - fragment (first) - routing - upper",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Fragment extension header.
+				//
+				// More = 1, Fragment Offset = 0, ID = 2147746305
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 0, 1, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2, 3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6FragmentExtHdr([6]byte{0, 1, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+		{
+			name:         "fragment - routing - upper (across views)",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2}, []byte{3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{68, 9, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6RoutingExtHdrIdentifier,
+					Buf:        routingExtHdrWithUpperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+
+		// If we have an atomic fragment, the payload following the fragment
+		// extension header should be parsed normally.
+		{
+			name:         "atomic fragment - routing - destination - upper",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 1, 4, 1, 2, 3, 4,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+		{
+			name:         "atomic fragment - routing - upper (across views)",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6}, []byte{128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2}, []byte{3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2}, []byte{3, 4}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+				},
+			},
+		},
+		{
+			name:         "atomic fragment - destination - no next header",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Res (Reserved) bits are 1 which should not affect anything.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 0, 6, 128, 4, 2, 1,
+
+				// Destination Options extension header.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+			},
+		},
+		{
+			name:         "routing - atomic fragment - no next header",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 0, 6, 128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+			},
+		},
+		{
+			name:         "routing - atomic fragment - no next header (across views)",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6NoNextHeaderIdentifier), 255, 0, 6}, []byte{128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+			},
+		},
+		{
+			name:         "hopbyhop - routing - fragment - no next header",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Fragment Offset = 32; Res = 6.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 1, 6, 128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{1, 6, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6NoNextHeaderIdentifier,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+
+		// Test the raw payload for common transport layer protocol numbers.
+		{
+			name:         "TCP raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(TCPProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(TCPProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "UDP raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(UDPProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(UDPProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "ICMPv4 raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(ICMPv4ProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(ICMPv4ProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "ICMPv6 raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(ICMPv6ProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(ICMPv6ProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "Unknwon next header raw payload",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: 255,
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "Unknwon next header raw payload (across views)",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: 255,
+				Buf:        makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+			}},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			it := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload)
+
+			for i, e := range test.expected {
+				extHdr, done, err := it.Next()
+				if err != nil {
+					t.Errorf("(i=%d) Next(): %s", i, err)
+				}
+				if done {
+					t.Errorf("(i=%d) unexpectedly done iterating", i)
+				}
+				if diff := cmp.Diff(e, extHdr); diff != "" {
+					t.Errorf("(i=%d) got ext hdr mismatch (-want +got):\n%s", i, diff)
+				}
+
+				if t.Failed() {
+					t.FailNow()
+				}
+			}
+
+			extHdr, done, err := it.Next()
+			if err != nil {
+				t.Errorf("(last) Next(): %s", err)
+			}
+			if !done {
+				t.Errorf("(last) iterator unexpectedly not done")
+			}
+			if extHdr != nil {
+				t.Errorf("(last) got Next() = %T, want = nil", extHdr)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
index c3ad503aa..426a873b1 100644
--- a/pkg/tcpip/header/ipv6_test.go
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -27,11 +27,12 @@ import (
 )
 
 const (
-	linkAddr         = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
-	linkLocalAddr    = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-	uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-	uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-	globalAddr       = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	linkAddr               = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	linkLocalAddr          = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	uniqueLocalAddr1       = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	uniqueLocalAddr2       = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+	globalAddr             = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
 )
 
 func TestEthernetAdddressToModifiedEUI64(t *testing.T) {
@@ -256,6 +257,85 @@ func TestIsV6UniqueLocalAddress(t *testing.T) {
 	}
 }
 
+func TestIsV6LinkLocalMulticastAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Link Local Multicast",
+			addr:     linkLocalMulticastAddr,
+			expected: true,
+		},
+		{
+			name:     "Valid Link Local Multicast with flags",
+			addr:     "\xff\xf2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+			expected: true,
+		},
+		{
+			name:     "Link Local Unicast",
+			addr:     linkLocalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4 Multicast",
+			addr:     "\xe0\x00\x00\x01",
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := header.IsV6LinkLocalMulticastAddress(test.addr); got != test.expected {
+				t.Errorf("got header.IsV6LinkLocalMulticastAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+			}
+		})
+	}
+}
+
+func TestIsV6LinkLocalAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Link Local Unicast",
+			addr:     linkLocalAddr,
+			expected: true,
+		},
+		{
+			name:     "Link Local Multicast",
+			addr:     linkLocalMulticastAddr,
+			expected: false,
+		},
+		{
+			name:     "Unique Local",
+			addr:     uniqueLocalAddr1,
+			expected: false,
+		},
+		{
+			name:     "Global",
+			addr:     globalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4 Link Local",
+			addr:     "\xa9\xfe\x00\x01",
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := header.IsV6LinkLocalAddress(test.addr); got != test.expected {
+				t.Errorf("got header.IsV6LinkLocalAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+			}
+		})
+	}
+}
+
 func TestScopeForIPv6Address(t *testing.T) {
 	tests := []struct {
 		name  string
@@ -270,12 +350,18 @@ func TestScopeForIPv6Address(t *testing.T) {
 			err:   nil,
 		},
 		{
-			name:  "Link Local",
+			name:  "Link Local Unicast",
 			addr:  linkLocalAddr,
 			scope: header.LinkLocalScope,
 			err:   nil,
 		},
 		{
+			name:  "Link Local Multicast",
+			addr:  linkLocalMulticastAddr,
+			scope: header.LinkLocalScope,
+			err:   nil,
+		},
+		{
 			name:  "Global",
 			addr:  globalAddr,
 			scope: header.GlobalScope,
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index e6a6ad39b..5d3975c56 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -15,32 +15,47 @@
 package header
 
 import (
+	"bytes"
 	"encoding/binary"
 	"errors"
 	"fmt"
+	"io"
 	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
+// NDPOptionIdentifier is an NDP option type identifier.
+type NDPOptionIdentifier uint8
+
 const (
 	// NDPSourceLinkLayerAddressOptionType is the type of the Source Link Layer
 	// Address option, as per RFC 4861 section 4.6.1.
-	NDPSourceLinkLayerAddressOptionType = 1
+	NDPSourceLinkLayerAddressOptionType NDPOptionIdentifier = 1
 
 	// NDPTargetLinkLayerAddressOptionType is the type of the Target Link Layer
 	// Address option, as per RFC 4861 section 4.6.1.
-	NDPTargetLinkLayerAddressOptionType = 2
+	NDPTargetLinkLayerAddressOptionType NDPOptionIdentifier = 2
+
+	// NDPPrefixInformationType is the type of the Prefix Information
+	// option, as per RFC 4861 section 4.6.2.
+	NDPPrefixInformationType NDPOptionIdentifier = 3
+
+	// NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
+	// Server option, as per RFC 8106 section 5.1.
+	NDPRecursiveDNSServerOptionType NDPOptionIdentifier = 25
 
+	// NDPDNSSearchListOptionType is the type of the DNS Search List option,
+	// as per RFC 8106 section 5.2.
+	NDPDNSSearchListOptionType = 31
+)
+
+const (
 	// NDPLinkLayerAddressSize is the size of a Source or Target Link Layer
 	// Address option for an Ethernet address.
 	NDPLinkLayerAddressSize = 8
 
-	// NDPPrefixInformationType is the type of the Prefix Information
-	// option, as per RFC 4861 section 4.6.2.
-	NDPPrefixInformationType = 3
-
 	// ndpPrefixInformationLength is the expected length, in bytes, of the
 	// body of an NDP Prefix Information option, as per RFC 4861 section
 	// 4.6.2 which specifies that the Length field is 4. Given this, the
@@ -91,10 +106,6 @@ const (
 	// within an NDPPrefixInformation.
 	ndpPrefixInformationPrefixOffset = 14
 
-	// NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
-	// Server option, as per RFC 8106 section 5.1.
-	NDPRecursiveDNSServerOptionType = 25
-
 	// ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte
 	// Lifetime field within an NDPRecursiveDNSServer.
 	ndpRecursiveDNSServerLifetimeOffset = 2
@@ -103,10 +114,31 @@ const (
 	// for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer.
 	ndpRecursiveDNSServerAddressesOffset = 6
 
-	// minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS
-	// Server option's length field value when it contains at least one
-	// IPv6 address.
-	minNDPRecursiveDNSServerLength = 3
+	// minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS Server
+	// option's body size when it contains at least one IPv6 address, as per
+	// RFC 8106 section 5.3.1.
+	minNDPRecursiveDNSServerBodySize = 22
+
+	// ndpDNSSearchListLifetimeOffset is the start of the 4-byte
+	// Lifetime field within an NDPDNSSearchList.
+	ndpDNSSearchListLifetimeOffset = 2
+
+	// ndpDNSSearchListDomainNamesOffset is the start of the DNS search list
+	// domain names within an NDPDNSSearchList.
+	ndpDNSSearchListDomainNamesOffset = 6
+
+	// minNDPDNSSearchListBodySize is the minimum NDP DNS Search List option's
+	// body size when it contains at least one domain name, as per RFC 8106
+	// section 5.3.1.
+	minNDPDNSSearchListBodySize = 14
+
+	// maxDomainNameLabelLength is the maximum length of a domain name
+	// label, as per RFC 1035 section 3.1.
+	maxDomainNameLabelLength = 63
+
+	// maxDomainNameLength is the maximum length of a domain name, including
+	// label AND label length octet, as per RFC 1035 section 3.1.
+	maxDomainNameLength = 255
 
 	// lengthByteUnits is the multiplier factor for the Length field of an
 	// NDP option. That is, the length field for NDP options is in units of
@@ -132,16 +164,13 @@ var (
 // few NDPOption then modify the backing NDPOptions so long as the
 // NDPOptionIterator obtained before modification is no longer used.
 type NDPOptionIterator struct {
-	// The NDPOptions this NDPOptionIterator is iterating over.
-	opts NDPOptions
+	opts *bytes.Buffer
 }
 
 // Potential errors when iterating over an NDPOptions.
 var (
-	ErrNDPOptBufExhausted  = errors.New("Buffer unexpectedly exhausted")
-	ErrNDPOptZeroLength    = errors.New("NDP option has zero-valued Length field")
-	ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body")
-	ErrNDPInvalidLength    = errors.New("NDP option's Length value is invalid as per relevant RFC")
+	ErrNDPOptMalformedBody   = errors.New("NDP option has a malformed body")
+	ErrNDPOptMalformedHeader = errors.New("NDP option has a malformed header")
 )
 
 // Next returns the next element in the backing NDPOptions, or true if we are
@@ -152,48 +181,50 @@ var (
 func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
 	for {
 		// Do we still have elements to look at?
-		if len(i.opts) == 0 {
+		if i.opts.Len() == 0 {
 			return nil, true, nil
 		}
 
-		// Do we have enough bytes for an NDP option that has a Length
-		// field of at least 1? Note, 0 in the Length field is invalid.
-		if len(i.opts) < lengthByteUnits {
-			return nil, true, ErrNDPOptBufExhausted
-		}
-
 		// Get the Type field.
-		t := i.opts[0]
-
-		// Get the Length field.
-		l := i.opts[1]
+		temp, err := i.opts.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				// ReadByte should only ever return nil or io.EOF.
+				panic(fmt.Sprintf("unexpected error when reading the option's Type field: %s", err))
+			}
 
-		// This would indicate an erroneous NDP option as the Length
-		// field should never be 0.
-		if l == 0 {
-			return nil, true, ErrNDPOptZeroLength
+			// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once
+			// we start parsing an option; we expect the buffer to contain enough
+			// bytes for the whole option.
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Type field: %w", io.ErrUnexpectedEOF)
 		}
+		kind := NDPOptionIdentifier(temp)
 
-		// How many bytes are in the option body?
-		numBytes := int(l) * lengthByteUnits
-		numBodyBytes := numBytes - 2
-
-		potentialBody := i.opts[2:]
+		// Get the Length field.
+		length, err := i.opts.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				panic(fmt.Sprintf("unexpected error when reading the option's Length field for %s: %s", kind, err))
+			}
 
-		// This would indicate an erroenous NDPOptions buffer as we ran
-		// out of the buffer in the middle of an NDP option.
-		if left := len(potentialBody); left < numBodyBytes {
-			return nil, true, ErrNDPOptBufExhausted
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Length field for %s: %w", kind, io.ErrUnexpectedEOF)
 		}
 
-		// Get only the options body, leaving the rest of the options
-		// buffer alone.
-		body := potentialBody[:numBodyBytes]
+		// This would indicate an erroneous NDP option as the Length field should
+		// never be 0.
+		if length == 0 {
+			return nil, true, fmt.Errorf("zero valued Length field for %s: %w", kind, ErrNDPOptMalformedHeader)
+		}
 
-		// Update opts with the remaining options body.
-		i.opts = i.opts[numBytes:]
+		// Get the body.
+		numBytes := int(length) * lengthByteUnits
+		numBodyBytes := numBytes - 2
+		body := i.opts.Next(numBodyBytes)
+		if len(body) < numBodyBytes {
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Body for %s: %w", kind, io.ErrUnexpectedEOF)
+		}
 
-		switch t {
+		switch kind {
 		case NDPSourceLinkLayerAddressOptionType:
 			return NDPSourceLinkLayerAddressOption(body), false, nil
 
@@ -205,22 +236,23 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
 			// body is ndpPrefixInformationLength, as per RFC 4861
 			// section 4.6.2.
 			if numBodyBytes != ndpPrefixInformationLength {
-				return nil, true, ErrNDPOptMalformedBody
+				return nil, true, fmt.Errorf("got %d bytes for NDP Prefix Information option's body, expected %d bytes: %w", numBodyBytes, ndpPrefixInformationLength, ErrNDPOptMalformedBody)
 			}
 
 			return NDPPrefixInformation(body), false, nil
 
 		case NDPRecursiveDNSServerOptionType:
-			// RFC 8106 section 5.3.1 outlines that the RDNSS option
-			// must have a minimum length of 3 so it contains at
-			// least one IPv6 address.
-			if l < minNDPRecursiveDNSServerLength {
-				return nil, true, ErrNDPInvalidLength
+			opt := NDPRecursiveDNSServer(body)
+			if err := opt.checkAddresses(); err != nil {
+				return nil, true, err
 			}
 
-			opt := NDPRecursiveDNSServer(body)
-			if len(opt.Addresses()) == 0 {
-				return nil, true, ErrNDPOptMalformedBody
+			return opt, false, nil
+
+		case NDPDNSSearchListOptionType:
+			opt := NDPDNSSearchList(body)
+			if err := opt.checkDomainNames(); err != nil {
+				return nil, true, err
 			}
 
 			return opt, false, nil
@@ -247,10 +279,16 @@ type NDPOptions []byte
 //
 // See NDPOptionIterator for more information.
 func (b NDPOptions) Iter(check bool) (NDPOptionIterator, error) {
-	it := NDPOptionIterator{opts: b}
+	it := NDPOptionIterator{
+		opts: bytes.NewBuffer(b),
+	}
 
 	if check {
-		for it2 := it; true; {
+		it2 := NDPOptionIterator{
+			opts: bytes.NewBuffer(b),
+		}
+
+		for {
 			if _, done, err := it2.Next(); err != nil || done {
 				return it, err
 			}
@@ -278,7 +316,7 @@ func (b NDPOptions) Serialize(s NDPOptionsSerializer) int {
 			continue
 		}
 
-		b[0] = o.Type()
+		b[0] = byte(o.Type())
 
 		// We know this safe because paddedLength would have returned
 		// 0 if o had an invalid length (> 255 * lengthByteUnits).
@@ -304,7 +342,7 @@ type NDPOption interface {
 	fmt.Stringer
 
 	// Type returns the type of the receiver.
-	Type() uint8
+	Type() NDPOptionIdentifier
 
 	// Length returns the length of the body of the receiver, in bytes.
 	Length() int
@@ -386,7 +424,7 @@ func (b NDPOptionsSerializer) Length() int {
 type NDPSourceLinkLayerAddressOption tcpip.LinkAddress
 
 // Type implements NDPOption.Type.
-func (o NDPSourceLinkLayerAddressOption) Type() uint8 {
+func (o NDPSourceLinkLayerAddressOption) Type() NDPOptionIdentifier {
 	return NDPSourceLinkLayerAddressOptionType
 }
 
@@ -426,7 +464,7 @@ func (o NDPSourceLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress {
 type NDPTargetLinkLayerAddressOption tcpip.LinkAddress
 
 // Type implements NDPOption.Type.
-func (o NDPTargetLinkLayerAddressOption) Type() uint8 {
+func (o NDPTargetLinkLayerAddressOption) Type() NDPOptionIdentifier {
 	return NDPTargetLinkLayerAddressOptionType
 }
 
@@ -466,7 +504,7 @@ func (o NDPTargetLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress {
 type NDPPrefixInformation []byte
 
 // Type implements NDPOption.Type.
-func (o NDPPrefixInformation) Type() uint8 {
+func (o NDPPrefixInformation) Type() NDPOptionIdentifier {
 	return NDPPrefixInformationType
 }
 
@@ -590,7 +628,7 @@ type NDPRecursiveDNSServer []byte
 // Type returns the type of an NDP Recursive DNS Server option.
 //
 // Type implements NDPOption.Type.
-func (NDPRecursiveDNSServer) Type() uint8 {
+func (NDPRecursiveDNSServer) Type() NDPOptionIdentifier {
 	return NDPRecursiveDNSServerOptionType
 }
 
@@ -613,7 +651,12 @@ func (o NDPRecursiveDNSServer) serializeInto(b []byte) int {
 
 // String implements fmt.Stringer.String.
 func (o NDPRecursiveDNSServer) String() string {
-	return fmt.Sprintf("%T(%s valid for %s)", o, o.Addresses(), o.Lifetime())
+	lt := o.Lifetime()
+	addrs, err := o.Addresses()
+	if err != nil {
+		return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lt, err)
+	}
+	return fmt.Sprintf("%T(%s valid for %s)", o, addrs, lt)
 }
 
 // Lifetime returns the length of time that the DNS server addresses
@@ -632,29 +675,225 @@ func (o NDPRecursiveDNSServer) Lifetime() time.Duration {
 // Addresses returns the recursive DNS server IPv6 addresses that may be
 // used for name resolution.
 //
-// Note, some of the addresses returned MAY be link-local addresses.
+// Note, the addresses MAY be link-local addresses.
+func (o NDPRecursiveDNSServer) Addresses() ([]tcpip.Address, error) {
+	var addrs []tcpip.Address
+	return addrs, o.iterAddresses(func(addr tcpip.Address) { addrs = append(addrs, addr) })
+}
+
+// checkAddresses iterates over the addresses in an NDP Recursive DNS Server
+// option and returns any error it encounters.
+func (o NDPRecursiveDNSServer) checkAddresses() error {
+	return o.iterAddresses(nil)
+}
+
+// iterAddresses iterates over the addresses in an NDP Recursive DNS Server
+// option and calls a function with each valid unicast IPv6 address.
 //
-// Addresses may panic if o does not hold valid IPv6 addresses.
-func (o NDPRecursiveDNSServer) Addresses() []tcpip.Address {
-	l := len(o)
-	if l < ndpRecursiveDNSServerAddressesOffset {
-		return nil
+// Note, the addresses MAY be link-local addresses.
+func (o NDPRecursiveDNSServer) iterAddresses(fn func(tcpip.Address)) error {
+	if l := len(o); l < minNDPRecursiveDNSServerBodySize {
+		return fmt.Errorf("got %d bytes for NDP Recursive DNS Server option's body, expected at least %d bytes: %w", l, minNDPRecursiveDNSServerBodySize, io.ErrUnexpectedEOF)
 	}
 
-	l -= ndpRecursiveDNSServerAddressesOffset
+	o = o[ndpRecursiveDNSServerAddressesOffset:]
+	l := len(o)
 	if l%IPv6AddressSize != 0 {
-		return nil
+		return fmt.Errorf("NDP Recursive DNS Server option's body ends in the middle of an IPv6 address (addresses body size = %d bytes): %w", l, ErrNDPOptMalformedBody)
 	}
 
-	buf := o[ndpRecursiveDNSServerAddressesOffset:]
-	var addrs []tcpip.Address
-	for len(buf) > 0 {
-		addr := tcpip.Address(buf[:IPv6AddressSize])
+	for i := 0; len(o) != 0; i++ {
+		addr := tcpip.Address(o[:IPv6AddressSize])
 		if !IsV6UnicastAddress(addr) {
-			return nil
+			return fmt.Errorf("%d-th address (%s) in NDP Recursive DNS Server option is not a valid unicast IPv6 address: %w", i, addr, ErrNDPOptMalformedBody)
+		}
+
+		if fn != nil {
+			fn(addr)
 		}
-		addrs = append(addrs, addr)
-		buf = buf[IPv6AddressSize:]
+
+		o = o[IPv6AddressSize:]
 	}
-	return addrs
+
+	return nil
+}
+
+// NDPDNSSearchList is the NDP DNS Search List option, as defined by
+// RFC 8106 section 5.2.
+type NDPDNSSearchList []byte
+
+// Type implements NDPOption.Type.
+func (o NDPDNSSearchList) Type() NDPOptionIdentifier {
+	return NDPDNSSearchListOptionType
+}
+
+// Length implements NDPOption.Length.
+func (o NDPDNSSearchList) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPDNSSearchList) serializeInto(b []byte) int {
+	used := copy(b, o)
+
+	// Zero out the reserved bytes that are before the Lifetime field.
+	for i := 0; i < ndpDNSSearchListLifetimeOffset; i++ {
+		b[i] = 0
+	}
+
+	return used
+}
+
+// String implements fmt.Stringer.String.
+func (o NDPDNSSearchList) String() string {
+	lt := o.Lifetime()
+	domainNames, err := o.DomainNames()
+	if err != nil {
+		return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lt, err)
+	}
+	return fmt.Sprintf("%T(%s valid for %s)", o, domainNames, lt)
+}
+
+// Lifetime returns the length of time that the DNS search list of domain names
+// in this option may be used for name resolution.
+//
+// Note, a value of 0 implies the domain names should no longer be used,
+// and a value of infinity/forever is represented by NDPInfiniteLifetime.
+func (o NDPDNSSearchList) Lifetime() time.Duration {
+	// The field is the time in seconds, as per RFC 8106 section 5.1.
+	return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpDNSSearchListLifetimeOffset:]))
+}
+
+// DomainNames returns a DNS search list of domain names.
+//
+// DomainNames will parse the backing buffer as outlined by RFC 1035 section
+// 3.1 and return a list of strings, with all domain names in lower case.
+func (o NDPDNSSearchList) DomainNames() ([]string, error) {
+	var domainNames []string
+	return domainNames, o.iterDomainNames(func(domainName string) { domainNames = append(domainNames, domainName) })
+}
+
+// checkDomainNames iterates over the domain names in an NDP DNS Search List
+// option and returns any error it encounters.
+func (o NDPDNSSearchList) checkDomainNames() error {
+	return o.iterDomainNames(nil)
+}
+
+// iterDomainNames iterates over the domain names in an NDP DNS Search List
+// option and calls a function with each valid domain name.
+func (o NDPDNSSearchList) iterDomainNames(fn func(string)) error {
+	if l := len(o); l < minNDPDNSSearchListBodySize {
+		return fmt.Errorf("got %d bytes for NDP DNS Search List  option's body, expected at least %d bytes: %w", l, minNDPDNSSearchListBodySize, io.ErrUnexpectedEOF)
+	}
+
+	var searchList bytes.Reader
+	searchList.Reset(o[ndpDNSSearchListDomainNamesOffset:])
+
+	var scratch [maxDomainNameLength]byte
+	domainName := bytes.NewBuffer(scratch[:])
+
+	// Parse the domain names, as per RFC 1035 section 3.1.
+	for searchList.Len() != 0 {
+		domainName.Reset()
+
+		// Parse a label within a domain name, as per RFC 1035 section 3.1.
+		for {
+			// The first byte is the label length.
+			labelLenByte, err := searchList.ReadByte()
+			if err != nil {
+				if err != io.EOF {
+					// ReadByte should only ever return nil or io.EOF.
+					panic(fmt.Sprintf("unexpected error when reading a label's length: %s", err))
+				}
+
+				// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected
+				// once we start parsing a domain name; we expect the buffer to contain
+				// enough bytes for the whole domain name.
+				return fmt.Errorf("unexpected exhausted buffer while parsing a new label for a domain from NDP Search List option: %w", io.ErrUnexpectedEOF)
+			}
+			labelLen := int(labelLenByte)
+
+			// A zero-length label implies the end of a domain name.
+			if labelLen == 0 {
+				// If the domain name is empty or we have no callback function, do
+				// nothing further with the current domain name.
+				if domainName.Len() == 0 || fn == nil {
+					break
+				}
+
+				// Ignore the trailing period in the parsed domain name.
+				domainName.Truncate(domainName.Len() - 1)
+				fn(domainName.String())
+				break
+			}
+
+			// The label's length must not exceed the maximum length for a label.
+			if labelLen > maxDomainNameLabelLength {
+				return fmt.Errorf("label length of %d bytes is greater than the max label length of %d bytes for an NDP Search List option: %w", labelLen, maxDomainNameLabelLength, ErrNDPOptMalformedBody)
+			}
+
+			// The label (and trailing period) must not make the domain name too long.
+			if labelLen+1 > domainName.Cap()-domainName.Len() {
+				return fmt.Errorf("label would make an NDP Search List option's domain name longer than the max domain name length of %d bytes: %w", maxDomainNameLength, ErrNDPOptMalformedBody)
+			}
+
+			// Copy the label and add a trailing period.
+			for i := 0; i < labelLen; i++ {
+				b, err := searchList.ReadByte()
+				if err != nil {
+					if err != io.EOF {
+						panic(fmt.Sprintf("unexpected error when reading domain name's label: %s", err))
+					}
+
+					return fmt.Errorf("read %d out of %d bytes for a domain name's label from NDP Search List option: %w", i, labelLen, io.ErrUnexpectedEOF)
+				}
+
+				// As per RFC 1035 section 2.3.1:
+				//  1) the label must only contain ASCII include letters, digits and
+				//     hyphens
+				//  2) the first character in a label must be a letter
+				//  3) the last letter in a label must be a letter or digit
+
+				if !isLetter(b) {
+					if i == 0 {
+						return fmt.Errorf("first character of a domain name's label in an NDP Search List option must be a letter, got character code = %d: %w", b, ErrNDPOptMalformedBody)
+					}
+
+					if b == '-' {
+						if i == labelLen-1 {
+							return fmt.Errorf("last character of a domain name's label in an NDP Search List option must not be a hyphen (-): %w", ErrNDPOptMalformedBody)
+						}
+					} else if !isDigit(b) {
+						return fmt.Errorf("domain name's label in an NDP Search List option may only contain letters, digits and hyphens, got character code = %d: %w", b, ErrNDPOptMalformedBody)
+					}
+				}
+
+				// If b is an upper case character, make it lower case.
+				if isUpperLetter(b) {
+					b = b - 'A' + 'a'
+				}
+
+				if err := domainName.WriteByte(b); err != nil {
+					panic(fmt.Sprintf("unexpected error writing label to domain name buffer: %s", err))
+				}
+			}
+			if err := domainName.WriteByte('.'); err != nil {
+				panic(fmt.Sprintf("unexpected error writing trailing period to domain name buffer: %s", err))
+			}
+		}
+	}
+
+	return nil
+}
+
+func isLetter(b byte) bool {
+	return b >= 'a' && b <= 'z' || isUpperLetter(b)
+}
+
+func isUpperLetter(b byte) bool {
+	return b >= 'A' && b <= 'Z'
+}
+
+func isDigit(b byte) bool {
+	return b >= '0' && b <= '9'
 }
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index 1cb9f5dc8..dc4591253 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -16,6 +16,10 @@ package header
 
 import (
 	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"regexp"
 	"testing"
 	"time"
 
@@ -115,7 +119,7 @@ func TestNDPNeighborAdvert(t *testing.T) {
 
 	// Make sure flags got updated in the backing buffer.
 	if got := b[ndpNAFlagsOffset]; got != 64 {
-		t.Errorf("got flags byte = %d, want = 64")
+		t.Errorf("got flags byte = %d, want = 64", got)
 	}
 }
 
@@ -543,8 +547,12 @@ func TestNDPRecursiveDNSServerOptionSerialize(t *testing.T) {
 	want := []tcpip.Address{
 		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
 	}
-	if got := opt.Addresses(); !cmp.Equal(got, want) {
-		t.Errorf("got Addresses = %v, want = %v", got, want)
+	addrs, err := opt.Addresses()
+	if err != nil {
+		t.Errorf("opt.Addresses() = %s", err)
+	}
+	if diff := cmp.Diff(addrs, want); diff != "" {
+		t.Errorf("mismatched addresses (-want +got):\n%s", diff)
 	}
 
 	// Iterator should not return anything else.
@@ -638,8 +646,12 @@ func TestNDPRecursiveDNSServerOption(t *testing.T) {
 			if got := opt.Lifetime(); got != test.lifetime {
 				t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime)
 			}
-			if got := opt.Addresses(); !cmp.Equal(got, test.addrs) {
-				t.Errorf("got Addresses = %v, want = %v", got, test.addrs)
+			addrs, err := opt.Addresses()
+			if err != nil {
+				t.Errorf("opt.Addresses() = %s", err)
+			}
+			if diff := cmp.Diff(addrs, test.addrs); diff != "" {
+				t.Errorf("mismatched addresses (-want +got):\n%s", diff)
 			}
 
 			// Iterator should not return anything else.
@@ -657,42 +669,513 @@ func TestNDPRecursiveDNSServerOption(t *testing.T) {
 	}
 }
 
+// TestNDPDNSSearchListOption tests the getters of NDPDNSSearchList.
+func TestNDPDNSSearchListOption(t *testing.T) {
+	tests := []struct {
+		name        string
+		buf         []byte
+		lifetime    time.Duration
+		domainNames []string
+		err         error
+	}{
+		{
+			name: "Valid1Label",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime: time.Second,
+			domainNames: []string{
+				"abc",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid2Label",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 5,
+				3, 'a', 'b', 'c',
+				4, 'a', 'b', 'c', 'd',
+				0,
+				0, 0, 0, 0, 0, 0,
+			},
+			lifetime: 5 * time.Second,
+			domainNames: []string{
+				"abc.abcd",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid3Label",
+			buf: []byte{
+				0, 0,
+				1, 0, 0, 0,
+				3, 'a', 'b', 'c',
+				4, 'a', 'b', 'c', 'd',
+				1, 'e',
+				0,
+				0, 0, 0, 0,
+			},
+			lifetime: 16777216 * time.Second,
+			domainNames: []string{
+				"abc.abcd.e",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid2Domains",
+			buf: []byte{
+				0, 0,
+				1, 2, 3, 4,
+				3, 'a', 'b', 'c',
+				0,
+				2, 'd', 'e',
+				3, 'x', 'y', 'z',
+				0,
+				0, 0, 0,
+			},
+			lifetime: 16909060 * time.Second,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid3DomainsMixedCase",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 'B', 'c',
+				0,
+				2, 'd', 'E',
+				3, 'X', 'y', 'z',
+				0,
+				1, 'J',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+				"j",
+			},
+			err: nil,
+		},
+		{
+			name: "ValidDomainAfterNULL",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 'B', 'c',
+				0, 0, 0, 0,
+				2, 'd', 'E',
+				3, 'X', 'y', 'z',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid0Domains",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				0,
+				0, 0, 0, 0, 0, 0, 0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         nil,
+		},
+		{
+			name: "NoTrailingNull",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				7, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         io.ErrUnexpectedEOF,
+		},
+		{
+			name: "IncorrectLength",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				8, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         io.ErrUnexpectedEOF,
+		},
+		{
+			name: "IncorrectLengthWithNULL",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				7, 'a', 'b', 'c', 'd', 'e', 'f',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "LabelOfLength63",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk",
+			},
+			err: nil,
+		},
+		{
+			name: "LabelOfLength64",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				64, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "DomainNameOfLength255",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				62, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij",
+			},
+			err: nil,
+		},
+		{
+			name: "DomainNameOfLength256",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "StartingDigitForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, '9', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "StartingHyphenForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, '-', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "EndingHyphenForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', '-',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "EndingDigitForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', '9',
+				0,
+				0, 0, 0,
+			},
+			lifetime: time.Second,
+			domainNames: []string{
+				"ab9",
+			},
+			err: nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opt := NDPDNSSearchList(test.buf)
+
+			if got := opt.Lifetime(); got != test.lifetime {
+				t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime)
+			}
+			domainNames, err := opt.DomainNames()
+			if !errors.Is(err, test.err) {
+				t.Errorf("opt.DomainNames() = %s", err)
+			}
+			if diff := cmp.Diff(domainNames, test.domainNames); diff != "" {
+				t.Errorf("mismatched domain names (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestNDPSearchListOptionDomainNameLabelInvalidSymbols(t *testing.T) {
+	for r := rune(0); r <= 255; r++ {
+		t.Run(fmt.Sprintf("RuneVal=%d", r), func(t *testing.T) {
+			buf := []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 0 /* will be replaced */, 'c',
+				0,
+				0, 0, 0,
+			}
+			buf[8] = uint8(r)
+			opt := NDPDNSSearchList(buf)
+
+			// As per RFC 1035 section 2.3.1, the label must only include ASCII
+			// letters, digits and hyphens (a-z, A-Z, 0-9, -).
+			var expectedErr error
+			re := regexp.MustCompile(`[a-zA-Z0-9-]`)
+			if !re.Match([]byte{byte(r)}) {
+				expectedErr = ErrNDPOptMalformedBody
+			}
+
+			if domainNames, err := opt.DomainNames(); !errors.Is(err, expectedErr) {
+				t.Errorf("got opt.DomainNames() = (%s, %v), want = (_, %v)", domainNames, err, ErrNDPOptMalformedBody)
+			}
+		})
+	}
+}
+
+func TestNDPDNSSearchListOptionSerialize(t *testing.T) {
+	b := []byte{
+		9, 8,
+		1, 0, 0, 0,
+		3, 'a', 'b', 'c',
+		4, 'a', 'b', 'c', 'd',
+		1, 'e',
+		0,
+	}
+	targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	expected := []byte{
+		31, 3, 0, 0,
+		1, 0, 0, 0,
+		3, 'a', 'b', 'c',
+		4, 'a', 'b', 'c', 'd',
+		1, 'e',
+		0,
+		0, 0, 0, 0,
+	}
+	opts := NDPOptions(targetBuf)
+	serializer := NDPOptionsSerializer{
+		NDPDNSSearchList(b),
+	}
+	if got, want := opts.Serialize(serializer), len(expected); got != want {
+		t.Errorf("got Serialize = %d, want = %d", got, want)
+	}
+	if !bytes.Equal(targetBuf, expected) {
+		t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expected)
+	}
+
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got := next.Type(); got != NDPDNSSearchListOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPDNSSearchListOptionType)
+	}
+
+	opt, ok := next.(NDPDNSSearchList)
+	if !ok {
+		t.Fatalf("next (type = %T) cannot be casted to an NDPDNSSearchList", next)
+	}
+	if got := opt.Type(); got != 31 {
+		t.Errorf("got Type = %d, want = 31", got)
+	}
+	if got := opt.Length(); got != 22 {
+		t.Errorf("got Length = %d, want = 22", got)
+	}
+	if got, want := opt.Lifetime(), 16777216*time.Second; got != want {
+		t.Errorf("got Lifetime = %s, want = %s", got, want)
+	}
+	domainNames, err := opt.DomainNames()
+	if err != nil {
+		t.Errorf("opt.DomainNames() = %s", err)
+	}
+	if diff := cmp.Diff(domainNames, []string{"abc.abcd.e"}); diff != "" {
+		t.Errorf("domain names mismatch (-want +got):\n%s", diff)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+	}
+}
+
 // TestNDPOptionsIterCheck tests that Iter will return false if the NDPOptions
 // the iterator was returned for is malformed.
 func TestNDPOptionsIterCheck(t *testing.T) {
 	tests := []struct {
-		name     string
-		buf      []byte
-		expected error
+		name        string
+		buf         []byte
+		expectedErr error
 	}{
 		{
-			"ZeroLengthField",
-			[]byte{0, 0, 0, 0, 0, 0, 0, 0},
-			ErrNDPOptZeroLength,
+			name:        "ZeroLengthField",
+			buf:         []byte{0, 0, 0, 0, 0, 0, 0, 0},
+			expectedErr: ErrNDPOptMalformedHeader,
 		},
 		{
-			"ValidSourceLinkLayerAddressOption",
-			[]byte{1, 1, 1, 2, 3, 4, 5, 6},
-			nil,
+			name:        "ValidSourceLinkLayerAddressOption",
+			buf:         []byte{1, 1, 1, 2, 3, 4, 5, 6},
+			expectedErr: nil,
 		},
 		{
-			"TooSmallSourceLinkLayerAddressOption",
-			[]byte{1, 1, 1, 2, 3, 4, 5},
-			ErrNDPOptBufExhausted,
+			name:        "TooSmallSourceLinkLayerAddressOption",
+			buf:         []byte{1, 1, 1, 2, 3, 4, 5},
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"ValidTargetLinkLayerAddressOption",
-			[]byte{2, 1, 1, 2, 3, 4, 5, 6},
-			nil,
+			name:        "ValidTargetLinkLayerAddressOption",
+			buf:         []byte{2, 1, 1, 2, 3, 4, 5, 6},
+			expectedErr: nil,
 		},
 		{
-			"TooSmallTargetLinkLayerAddressOption",
-			[]byte{2, 1, 1, 2, 3, 4, 5},
-			ErrNDPOptBufExhausted,
+			name:        "TooSmallTargetLinkLayerAddressOption",
+			buf:         []byte{2, 1, 1, 2, 3, 4, 5},
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"ValidPrefixInformation",
-			[]byte{
+			name: "ValidPrefixInformation",
+			buf: []byte{
 				3, 4, 43, 64,
 				1, 2, 3, 4,
 				5, 6, 7, 8,
@@ -702,11 +1185,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				17, 18, 19, 20,
 				21, 22, 23, 24,
 			},
-			nil,
+			expectedErr: nil,
 		},
 		{
-			"TooSmallPrefixInformation",
-			[]byte{
+			name: "TooSmallPrefixInformation",
+			buf: []byte{
 				3, 4, 43, 64,
 				1, 2, 3, 4,
 				5, 6, 7, 8,
@@ -716,11 +1199,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				17, 18, 19, 20,
 				21, 22, 23,
 			},
-			ErrNDPOptBufExhausted,
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"InvalidPrefixInformationLength",
-			[]byte{
+			name: "InvalidPrefixInformationLength",
+			buf: []byte{
 				3, 3, 43, 64,
 				1, 2, 3, 4,
 				5, 6, 7, 8,
@@ -728,11 +1211,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				9, 10, 11, 12,
 				13, 14, 15, 16,
 			},
-			ErrNDPOptMalformedBody,
+			expectedErr: ErrNDPOptMalformedBody,
 		},
 		{
-			"ValidSourceAndTargetLinkLayerAddressWithPrefixInformation",
-			[]byte{
+			name: "ValidSourceAndTargetLinkLayerAddressWithPrefixInformation",
+			buf: []byte{
 				// Source Link-Layer Address.
 				1, 1, 1, 2, 3, 4, 5, 6,
 
@@ -749,11 +1232,11 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				17, 18, 19, 20,
 				21, 22, 23, 24,
 			},
-			nil,
+			expectedErr: nil,
 		},
 		{
-			"ValidSourceAndTargetLinkLayerAddressWithPrefixInformationWithUnrecognized",
-			[]byte{
+			name: "ValidSourceAndTargetLinkLayerAddressWithPrefixInformationWithUnrecognized",
+			buf: []byte{
 				// Source Link-Layer Address.
 				1, 1, 1, 2, 3, 4, 5, 6,
 
@@ -775,52 +1258,153 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 				17, 18, 19, 20,
 				21, 22, 23, 24,
 			},
-			nil,
+			expectedErr: nil,
 		},
 		{
-			"InvalidRecursiveDNSServerCutsOffAddress",
-			[]byte{
+			name: "InvalidRecursiveDNSServerCutsOffAddress",
+			buf: []byte{
 				25, 4, 0, 0,
 				0, 0, 0, 0,
 				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
 				0, 1, 2, 3, 4, 5, 6, 7,
 			},
-			ErrNDPOptMalformedBody,
+			expectedErr: ErrNDPOptMalformedBody,
 		},
 		{
-			"InvalidRecursiveDNSServerInvalidLengthField",
-			[]byte{
+			name: "InvalidRecursiveDNSServerInvalidLengthField",
+			buf: []byte{
 				25, 2, 0, 0,
 				0, 0, 0, 0,
 				0, 1, 2, 3, 4, 5, 6, 7, 8,
 			},
-			ErrNDPInvalidLength,
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"RecursiveDNSServerTooSmall",
-			[]byte{
+			name: "RecursiveDNSServerTooSmall",
+			buf: []byte{
 				25, 1, 0, 0,
 				0, 0, 0,
 			},
-			ErrNDPOptBufExhausted,
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 		{
-			"RecursiveDNSServerMulticast",
-			[]byte{
+			name: "RecursiveDNSServerMulticast",
+			buf: []byte{
 				25, 3, 0, 0,
 				0, 0, 0, 0,
 				255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
 			},
-			ErrNDPOptMalformedBody,
+			expectedErr: ErrNDPOptMalformedBody,
 		},
 		{
-			"RecursiveDNSServerUnspecified",
-			[]byte{
+			name: "RecursiveDNSServerUnspecified",
+			buf: []byte{
 				25, 3, 0, 0,
 				0, 0, 0, 0,
 				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 			},
-			ErrNDPOptMalformedBody,
+			expectedErr: ErrNDPOptMalformedBody,
+		},
+		{
+			name: "DNSSearchListLargeCompliantRFC1035",
+			buf: []byte{
+				31, 33, 0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				62, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j',
+				0,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "DNSSearchListNonCompliantRFC1035",
+			buf: []byte{
+				31, 33, 0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			expectedErr: ErrNDPOptMalformedBody,
+		},
+		{
+			name: "DNSSearchListValidSmall",
+			buf: []byte{
+				31, 2, 0, 0,
+				0, 0, 0, 0,
+				6, 'a', 'b', 'c', 'd', 'e', 'f',
+				0,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "DNSSearchListTooSmall",
+			buf: []byte{
+				31, 1, 0, 0,
+				0, 0, 0,
+			},
+			expectedErr: io.ErrUnexpectedEOF,
 		},
 	}
 
@@ -828,8 +1412,8 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			opts := NDPOptions(test.buf)
 
-			if _, err := opts.Iter(true); err != test.expected {
-				t.Fatalf("got Iter(true) = (_, %v), want = (_, %v)", err, test.expected)
+			if _, err := opts.Iter(true); !errors.Is(err, test.expectedErr) {
+				t.Fatalf("got Iter(true) = (_, %v), want = (_, %v)", err, test.expectedErr)
 			}
 
 			// test.buf may be malformed but we chose not to check
diff --git a/pkg/tcpip/header/ndpoptionidentifier_string.go b/pkg/tcpip/header/ndpoptionidentifier_string.go
new file mode 100644
index 000000000..6fe9a336b
--- /dev/null
+++ b/pkg/tcpip/header/ndpoptionidentifier_string.go
@@ -0,0 +1,50 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by "stringer -type NDPOptionIdentifier ."; DO NOT EDIT.
+
+package header
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[NDPSourceLinkLayerAddressOptionType-1]
+	_ = x[NDPTargetLinkLayerAddressOptionType-2]
+	_ = x[NDPPrefixInformationType-3]
+	_ = x[NDPRecursiveDNSServerOptionType-25]
+}
+
+const (
+	_NDPOptionIdentifier_name_0 = "NDPSourceLinkLayerAddressOptionTypeNDPTargetLinkLayerAddressOptionTypeNDPPrefixInformationType"
+	_NDPOptionIdentifier_name_1 = "NDPRecursiveDNSServerOptionType"
+)
+
+var (
+	_NDPOptionIdentifier_index_0 = [...]uint8{0, 35, 70, 94}
+)
+
+func (i NDPOptionIdentifier) String() string {
+	switch {
+	case 1 <= i && i <= 3:
+		i -= 1
+		return _NDPOptionIdentifier_name_0[_NDPOptionIdentifier_index_0[i]:_NDPOptionIdentifier_index_0[i+1]]
+	case i == 25:
+		return _NDPOptionIdentifier_name_1
+	default:
+		return "NDPOptionIdentifier(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go
index 82cfe785c..4c6f808e5 100644
--- a/pkg/tcpip/header/tcp.go
+++ b/pkg/tcpip/header/tcp.go
@@ -66,6 +66,14 @@ const (
 	TCPOptionSACK          = 5
 )
 
+// Option Lengths.
+const (
+	TCPOptionMSSLength           = 4
+	TCPOptionTSLength            = 10
+	TCPOptionWSLength            = 3
+	TCPOptionSackPermittedLength = 2
+)
+
 // TCPFields contains the fields of a TCP packet. It is used to describe the
 // fields of a packet that needs to be encoded.
 type TCPFields struct {
@@ -81,7 +89,8 @@ type TCPFields struct {
 	// AckNum is the "acknowledgement number" field of a TCP packet.
 	AckNum uint32
 
-	// DataOffset is the "data offset" field of a TCP packet.
+	// DataOffset is the "data offset" field of a TCP packet. It is the length of
+	// the TCP header in bytes.
 	DataOffset uint8
 
 	// Flags is the "flags" field of a TCP packet.
@@ -213,7 +222,8 @@ func (b TCP) AckNumber() uint32 {
 	return binary.BigEndian.Uint32(b[TCPAckNumOffset:])
 }
 
-// DataOffset returns the "data offset" field of the tcp header.
+// DataOffset returns the "data offset" field of the tcp header. The return
+// value is the length of the TCP header in bytes.
 func (b TCP) DataOffset() uint8 {
 	return (b[TCPDataOffset] >> 4) * 4
 }
@@ -238,6 +248,11 @@ func (b TCP) Checksum() uint16 {
 	return binary.BigEndian.Uint16(b[TCPChecksumOffset:])
 }
 
+// UrgentPointer returns the "urgent pointer" field of the tcp header.
+func (b TCP) UrgentPointer() uint16 {
+	return binary.BigEndian.Uint16(b[TCPUrgentPtrOffset:])
+}
+
 // SetSourcePort sets the "source port" field of the tcp header.
 func (b TCP) SetSourcePort(port uint16) {
 	binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], port)
@@ -253,6 +268,37 @@ func (b TCP) SetChecksum(checksum uint16) {
 	binary.BigEndian.PutUint16(b[TCPChecksumOffset:], checksum)
 }
 
+// SetDataOffset sets the data offset field of the tcp header. headerLen should
+// be the length of the TCP header in bytes.
+func (b TCP) SetDataOffset(headerLen uint8) {
+	b[TCPDataOffset] = (headerLen / 4) << 4
+}
+
+// SetSequenceNumber sets the sequence number field of the tcp header.
+func (b TCP) SetSequenceNumber(seqNum uint32) {
+	binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seqNum)
+}
+
+// SetAckNumber sets the ack number field of the tcp header.
+func (b TCP) SetAckNumber(ackNum uint32) {
+	binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ackNum)
+}
+
+// SetFlags sets the flags field of the tcp header.
+func (b TCP) SetFlags(flags uint8) {
+	b[TCPFlagsOffset] = flags
+}
+
+// SetWindowSize sets the window size field of the tcp header.
+func (b TCP) SetWindowSize(rcvwnd uint16) {
+	binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd)
+}
+
+// SetUrgentPoiner sets the window size field of the tcp header.
+func (b TCP) SetUrgentPoiner(urgentPointer uint16) {
+	binary.BigEndian.PutUint16(b[TCPUrgentPtrOffset:], urgentPointer)
+}
+
 // CalculateChecksum calculates the checksum of the tcp segment.
 // partialChecksum is the checksum of the network-layer pseudo-header
 // and the checksum of the segment data.
@@ -456,14 +502,11 @@ func ParseTCPOptions(b []byte) TCPOptions {
 // returns without encoding anything. It returns the number of bytes written to
 // the provided buffer.
 func EncodeMSSOption(mss uint32, b []byte) int {
-	// mssOptionSize is the number of bytes in a valid MSS option.
-	const mssOptionSize = 4
-
-	if len(b) < mssOptionSize {
+	if len(b) < TCPOptionMSSLength {
 		return 0
 	}
-	b[0], b[1], b[2], b[3] = TCPOptionMSS, mssOptionSize, byte(mss>>8), byte(mss)
-	return mssOptionSize
+	b[0], b[1], b[2], b[3] = TCPOptionMSS, TCPOptionMSSLength, byte(mss>>8), byte(mss)
+	return TCPOptionMSSLength
 }
 
 // EncodeWSOption encodes the WS TCP option with the WS value in the
@@ -471,10 +514,10 @@ func EncodeMSSOption(mss uint32, b []byte) int {
 // returns without encoding anything. It returns the number of bytes written to
 // the provided buffer.
 func EncodeWSOption(ws int, b []byte) int {
-	if len(b) < 3 {
+	if len(b) < TCPOptionWSLength {
 		return 0
 	}
-	b[0], b[1], b[2] = TCPOptionWS, 3, uint8(ws)
+	b[0], b[1], b[2] = TCPOptionWS, TCPOptionWSLength, uint8(ws)
 	return int(b[1])
 }
 
@@ -483,10 +526,10 @@ func EncodeWSOption(ws int, b []byte) int {
 // just returns without encoding anything. It returns the number of bytes
 // written to the provided buffer.
 func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int {
-	if len(b) < 10 {
+	if len(b) < TCPOptionTSLength {
 		return 0
 	}
-	b[0], b[1] = TCPOptionTS, 10
+	b[0], b[1] = TCPOptionTS, TCPOptionTSLength
 	binary.BigEndian.PutUint32(b[2:], tsVal)
 	binary.BigEndian.PutUint32(b[6:], tsEcr)
 	return int(b[1])
@@ -497,11 +540,11 @@ func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int {
 // encoding anything. It returns the number of bytes written to the provided
 // buffer.
 func EncodeSACKPermittedOption(b []byte) int {
-	if len(b) < 2 {
+	if len(b) < TCPOptionSackPermittedLength {
 		return 0
 	}
 
-	b[0], b[1] = TCPOptionSACKPermitted, 2
+	b[0], b[1] = TCPOptionSACKPermitted, TCPOptionSackPermittedLength
 	return int(b[1])
 }
 
@@ -556,3 +599,23 @@ func AddTCPOptionPadding(options []byte, offset int) int {
 	}
 	return paddingToAdd
 }
+
+// Acceptable checks if a segment that starts at segSeq and has length segLen is
+// "acceptable" for arriving in a receive window that starts at rcvNxt and ends
+// before rcvAcc, according to the table on page 26 and 69 of RFC 793.
+func Acceptable(segSeq seqnum.Value, segLen seqnum.Size, rcvNxt, rcvAcc seqnum.Value) bool {
+	if rcvNxt == rcvAcc {
+		return segLen == 0 && segSeq == rcvNxt
+	}
+	if segLen == 0 {
+		// rcvWnd is incremented by 1 because that is Linux's behavior despite the
+		// RFC.
+		return segSeq.InRange(rcvNxt, rcvAcc.Add(1))
+	}
+	// Page 70 of RFC 793 allows packets that can be made "acceptable" by trimming
+	// the payload, so we'll accept any payload that overlaps the receieve window.
+	// segSeq < rcvAcc is more correct according to RFC, however, Linux does it
+	// differently, it uses segSeq <= rcvAcc, we'd want to keep the same behavior
+	// as Linux.
+	return rcvNxt.LessThan(segSeq.Add(segLen)) && segSeq.LessThanEq(rcvAcc)
+}
diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go
index 74412c894..9339d637f 100644
--- a/pkg/tcpip/header/udp.go
+++ b/pkg/tcpip/header/udp.go
@@ -99,6 +99,11 @@ func (b UDP) SetChecksum(checksum uint16) {
 	binary.BigEndian.PutUint16(b[udpChecksum:], checksum)
 }
 
+// SetLength sets the "length" field of the udp header.
+func (b UDP) SetLength(length uint16) {
+	binary.BigEndian.PutUint16(b[udpLength:], length)
+}
+
 // CalculateChecksum calculates the checksum of the udp packet, given the
 // checksum of the network-layer pseudo-header and the checksum of the payload.
 func (b UDP) CalculateChecksum(partialChecksum uint16) uint16 {
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
deleted file mode 100644
index 75a433a3b..000000000
--- a/pkg/tcpip/iptables/iptables.go
+++ /dev/null
@@ -1,236 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package iptables supports packet filtering and manipulation via the iptables
-// tool.
-package iptables
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-)
-
-// Table names.
-const (
-	TablenameNat    = "nat"
-	TablenameMangle = "mangle"
-	TablenameFilter = "filter"
-)
-
-// Chain names as defined by net/ipv4/netfilter/ip_tables.c.
-const (
-	ChainNamePrerouting  = "PREROUTING"
-	ChainNameInput       = "INPUT"
-	ChainNameForward     = "FORWARD"
-	ChainNameOutput      = "OUTPUT"
-	ChainNamePostrouting = "POSTROUTING"
-)
-
-// HookUnset indicates that there is no hook set for an entrypoint or
-// underflow.
-const HookUnset = -1
-
-// DefaultTables returns a default set of tables. Each chain is set to accept
-// all packets.
-func DefaultTables() IPTables {
-	// TODO(gvisor.dev/issue/170): We may be able to swap out some strings for
-	// iotas.
-	return IPTables{
-		Tables: map[string]Table{
-			TablenameNat: Table{
-				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
-				},
-				BuiltinChains: map[Hook]int{
-					Prerouting:  0,
-					Input:       1,
-					Output:      2,
-					Postrouting: 3,
-				},
-				Underflows: map[Hook]int{
-					Prerouting:  0,
-					Input:       1,
-					Output:      2,
-					Postrouting: 3,
-				},
-				UserChains: map[string]int{},
-			},
-			TablenameMangle: Table{
-				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
-				},
-				BuiltinChains: map[Hook]int{
-					Prerouting: 0,
-					Output:     1,
-				},
-				Underflows: map[Hook]int{
-					Prerouting: 0,
-					Output:     1,
-				},
-				UserChains: map[string]int{},
-			},
-			TablenameFilter: Table{
-				Rules: []Rule{
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: AcceptTarget{}},
-					Rule{Target: ErrorTarget{}},
-				},
-				BuiltinChains: map[Hook]int{
-					Input:   0,
-					Forward: 1,
-					Output:  2,
-				},
-				Underflows: map[Hook]int{
-					Input:   0,
-					Forward: 1,
-					Output:  2,
-				},
-				UserChains: map[string]int{},
-			},
-		},
-		Priorities: map[Hook][]string{
-			Input:      []string{TablenameNat, TablenameFilter},
-			Prerouting: []string{TablenameMangle, TablenameNat},
-			Output:     []string{TablenameMangle, TablenameNat, TablenameFilter},
-		},
-	}
-}
-
-// EmptyFilterTable returns a Table with no rules and the filter table chains
-// mapped to HookUnset.
-func EmptyFilterTable() Table {
-	return Table{
-		Rules: []Rule{},
-		BuiltinChains: map[Hook]int{
-			Input:   HookUnset,
-			Forward: HookUnset,
-			Output:  HookUnset,
-		},
-		Underflows: map[Hook]int{
-			Input:   HookUnset,
-			Forward: HookUnset,
-			Output:  HookUnset,
-		},
-		UserChains: map[string]int{},
-	}
-}
-
-// Check runs pkt through the rules for hook. It returns true when the packet
-// should continue traversing the network stack and false when it should be
-// dropped.
-//
-// Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) Check(hook Hook, pkt tcpip.PacketBuffer) bool {
-	// TODO(gvisor.dev/issue/170): A lot of this is uncomplicated because
-	// we're missing features. Jumps, the call stack, etc. aren't checked
-	// for yet because we're yet to support them.
-
-	// Go through each table containing the hook.
-	for _, tablename := range it.Priorities[hook] {
-		switch verdict := it.checkTable(hook, pkt, tablename); verdict {
-		// If the table returns Accept, move on to the next table.
-		case TableAccept:
-			continue
-		// The Drop verdict is final.
-		case TableDrop:
-			return false
-		default:
-			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
-		}
-	}
-
-	// Every table returned Accept.
-	return true
-}
-
-// Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) checkTable(hook Hook, pkt tcpip.PacketBuffer, tablename string) TableVerdict {
-	// Start from ruleIdx and walk the list of rules until a rule gives us
-	// a verdict.
-	table := it.Tables[tablename]
-	for ruleIdx := table.BuiltinChains[hook]; ruleIdx < len(table.Rules); ruleIdx++ {
-		switch verdict := it.checkRule(hook, pkt, table, ruleIdx); verdict {
-		case RuleAccept:
-			return TableAccept
-
-		case RuleDrop:
-			return TableDrop
-
-		case RuleContinue:
-			continue
-
-		case RuleReturn:
-			// TODO(gvisor.dev/issue/170): We don't implement jump
-			// yet, so any Return is from a built-in chain. That
-			// means we have to to call the underflow.
-			underflow := table.Rules[table.Underflows[hook]]
-			// Underflow is guaranteed to be an unconditional
-			// ACCEPT or DROP.
-			switch v, _ := underflow.Target.Action(pkt); v {
-			case RuleAccept:
-				return TableAccept
-			case RuleDrop:
-				return TableDrop
-			case RuleContinue, RuleReturn:
-				panic("Underflows should only return RuleAccept or RuleDrop.")
-			default:
-				panic(fmt.Sprintf("Unknown verdict: %d", v))
-			}
-
-		default:
-			panic(fmt.Sprintf("Unknown verdict: %d", verdict))
-		}
-
-	}
-
-	// We got through the entire table without a decision. Default to DROP
-	// for safety.
-	return TableDrop
-}
-
-// Precondition: pk.NetworkHeader is set.
-func (it *IPTables) checkRule(hook Hook, pkt tcpip.PacketBuffer, table Table, ruleIdx int) RuleVerdict {
-	rule := table.Rules[ruleIdx]
-
-	// First check whether the packet matches the IP header filter.
-	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
-	if rule.Filter.Protocol != 0 && rule.Filter.Protocol != header.IPv4(pkt.NetworkHeader).TransportProtocol() {
-		return RuleContinue
-	}
-
-	// Go through each rule matcher. If they all match, run
-	// the rule target.
-	for _, matcher := range rule.Matchers {
-		matches, hotdrop := matcher.Match(hook, pkt, "")
-		if hotdrop {
-			return RuleDrop
-		}
-		if !matches {
-			return RuleContinue
-		}
-	}
-
-	// All the matchers matched, so run the target.
-	verdict, _ := rule.Target.Action(pkt)
-	return verdict
-}
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
deleted file mode 100644
index 9fc60cfad..000000000
--- a/pkg/tcpip/iptables/targets.go
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file contains various Targets.
-
-package iptables
-
-import (
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/tcpip"
-)
-
-// AcceptTarget accepts packets.
-type AcceptTarget struct{}
-
-// Action implements Target.Action.
-func (AcceptTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleAccept, ""
-}
-
-// DropTarget drops packets.
-type DropTarget struct{}
-
-// Action implements Target.Action.
-func (DropTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleDrop, ""
-}
-
-// ErrorTarget logs an error and drops the packet. It represents a target that
-// should be unreachable.
-type ErrorTarget struct{}
-
-// Action implements Target.Action.
-func (ErrorTarget) Action(packet tcpip.PacketBuffer) (RuleVerdict, string) {
-	log.Debugf("ErrorTarget triggered.")
-	return RuleDrop, ""
-}
-
-// UserChainTarget marks a rule as the beginning of a user chain.
-type UserChainTarget struct {
-	Name string
-}
-
-// Action implements Target.Action.
-func (UserChainTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
-	panic("UserChainTarget should never be called.")
-}
-
-// ReturnTarget returns from the current chain. If the chain is a built-in, the
-// hook's underflow should be called.
-type ReturnTarget struct{}
-
-// Action implements Target.Action.
-func (ReturnTarget) Action(tcpip.PacketBuffer) (RuleVerdict, string) {
-	return RuleReturn, ""
-}
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 3974c464e..b8b93e78e 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = ["channel.go"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 78d447acd..20b183da0 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -20,6 +20,7 @@ package channel
 import (
 	"context"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -27,12 +28,102 @@ import (
 
 // PacketInfo holds all the information about an outbound packet.
 type PacketInfo struct {
-	Pkt   tcpip.PacketBuffer
+	Pkt   *stack.PacketBuffer
 	Proto tcpip.NetworkProtocolNumber
 	GSO   *stack.GSO
 	Route stack.Route
 }
 
+// Notification is the interface for receiving notification from the packet
+// queue.
+type Notification interface {
+	// WriteNotify will be called when a write happens to the queue.
+	WriteNotify()
+}
+
+// NotificationHandle is an opaque handle to the registered notification target.
+// It can be used to unregister the notification when no longer interested.
+//
+// +stateify savable
+type NotificationHandle struct {
+	n Notification
+}
+
+type queue struct {
+	// c is the outbound packet channel.
+	c chan PacketInfo
+	// mu protects fields below.
+	mu     sync.RWMutex
+	notify []*NotificationHandle
+}
+
+func (q *queue) Close() {
+	close(q.c)
+}
+
+func (q *queue) Read() (PacketInfo, bool) {
+	select {
+	case p := <-q.c:
+		return p, true
+	default:
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
+	select {
+	case pkt := <-q.c:
+		return pkt, true
+	case <-ctx.Done():
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) Write(p PacketInfo) bool {
+	wrote := false
+	select {
+	case q.c <- p:
+		wrote = true
+	default:
+	}
+	q.mu.Lock()
+	notify := q.notify
+	q.mu.Unlock()
+
+	if wrote {
+		// Send notification outside of lock.
+		for _, h := range notify {
+			h.n.WriteNotify()
+		}
+	}
+	return wrote
+}
+
+func (q *queue) Num() int {
+	return len(q.c)
+}
+
+func (q *queue) AddNotify(notify Notification) *NotificationHandle {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	h := &NotificationHandle{n: notify}
+	q.notify = append(q.notify, h)
+	return h
+}
+
+func (q *queue) RemoveNotify(handle *NotificationHandle) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	// Make a copy, since we reads the array outside of lock when notifying.
+	notify := make([]*NotificationHandle, 0, len(q.notify))
+	for _, h := range q.notify {
+		if h != handle {
+			notify = append(notify, h)
+		}
+	}
+	q.notify = notify
+}
+
 // Endpoint is link layer endpoint that stores outbound packets in a channel
 // and allows injection of inbound packets.
 type Endpoint struct {
@@ -41,14 +132,16 @@ type Endpoint struct {
 	linkAddr           tcpip.LinkAddress
 	LinkEPCapabilities stack.LinkEndpointCapabilities
 
-	// c is where outbound packets are queued.
-	c chan PacketInfo
+	// Outbound packet queue.
+	q *queue
 }
 
 // New creates a new channel endpoint.
 func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 	return &Endpoint{
-		c:        make(chan PacketInfo, size),
+		q: &queue{
+			c: make(chan PacketInfo, size),
+		},
 		mtu:      mtu,
 		linkAddr: linkAddr,
 	}
@@ -57,51 +150,44 @@ func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 // Close closes e. Further packet injections will panic. Reads continue to
 // succeed until all packets are read.
 func (e *Endpoint) Close() {
-	close(e.c)
+	e.q.Close()
 }
 
-// Read does non-blocking read for one packet from the outbound packet queue.
+// Read does non-blocking read one packet from the outbound packet queue.
 func (e *Endpoint) Read() (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	default:
-		return PacketInfo{}, false
-	}
+	return e.q.Read()
 }
 
 // ReadContext does blocking read for one packet from the outbound packet queue.
 // It can be cancelled by ctx, and in this case, it returns false.
 func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	case <-ctx.Done():
-		return PacketInfo{}, false
-	}
+	return e.q.ReadContext(ctx)
 }
 
 // Drain removes all outbound packets from the channel and counts them.
 func (e *Endpoint) Drain() int {
 	c := 0
 	for {
-		select {
-		case <-e.c:
-			c++
-		default:
+		if _, ok := e.Read(); !ok {
 			return c
 		}
+		c++
 	}
 }
 
+// NumQueued returns the number of packet queued for outbound.
+func (e *Endpoint) NumQueued() int {
+	return e.q.Num()
+}
+
 // InjectInbound injects an inbound packet.
-func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt tcpip.PacketBuffer) {
-	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
+func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
 }
 
 // Attach saves the stack network-layer dispatcher for use later when packets
@@ -143,7 +229,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	// Clone r then release its resource so we only get the relevant fields from
 	// stack.Route without holding a reference to a NIC's endpoint.
 	route := r.Clone()
@@ -155,42 +241,30 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		Route: route,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
 
 // WritePackets stores outbound packets into the channel.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	// Clone r then release its resource so we only get the relevant fields from
 	// stack.Route without holding a reference to a NIC's endpoint.
 	route := r.Clone()
 	route.Release()
-	payloadView := pkts[0].Data.ToView()
 	n := 0
-packetLoop:
-	for _, pkt := range pkts {
-		off := pkt.DataOffset
-		size := pkt.DataSize
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
 		p := PacketInfo{
-			Pkt: tcpip.PacketBuffer{
-				Header: pkt.Header,
-				Data:   buffer.NewViewFromBytes(payloadView[off : off+size]).ToVectorisedView(),
-			},
+			Pkt:   pkt,
 			Proto: protocol,
 			GSO:   gso,
 			Route: route,
 		}
 
-		select {
-		case e.c <- p:
-			n++
-		default:
-			break packetLoop
+		if !e.q.Write(p) {
+			break
 		}
+		n++
 	}
 
 	return n, nil
@@ -199,18 +273,26 @@ packetLoop:
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	p := PacketInfo{
-		Pkt:   tcpip.PacketBuffer{Data: vv},
+		Pkt:   &stack.PacketBuffer{Data: vv},
 		Proto: 0,
 		GSO:   nil,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
 
 // Wait implements stack.LinkEndpoint.Wait.
 func (*Endpoint) Wait() {}
+
+// AddNotify adds a notification target for receiving event about outgoing
+// packets.
+func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle {
+	return e.q.AddNotify(notify)
+}
+
+// RemoveNotify removes handle from the list of notification targets.
+func (e *Endpoint) RemoveNotify(handle *NotificationHandle) {
+	e.q.RemoveNotify(handle)
+}
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index abe725548..aa6db9aea 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -14,6 +14,7 @@ go_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/binary",
         "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index b7f60178e..f34082e1a 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -44,6 +44,7 @@ import (
 	"syscall"
 
 	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -91,7 +92,7 @@ func (p PacketDispatchMode) String() string {
 	case PacketMMap:
 		return "PacketMMap"
 	default:
-		return fmt.Sprintf("unknown packet dispatch mode %v", p)
+		return fmt.Sprintf("unknown packet dispatch mode '%d'", p)
 	}
 }
 
@@ -386,7 +387,7 @@ const (
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	if e.hdrSize > 0 {
 		// Add ethernet header if needed.
 		eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
@@ -405,9 +406,9 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		eth.Encode(ethHdr)
 	}
 
+	fd := e.fds[pkt.Hash%uint32(len(e.fds))]
 	if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
 		vnetHdr := virtioNetHdr{}
-		vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
 		if gso != nil {
 			vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
 			if gso.NeedsCsum {
@@ -428,139 +429,169 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 			}
 		}
 
-		return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
+		vnetHdrBuf := binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr)
+		return rawfile.NonBlockingWrite3(fd, vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
 	}
 
 	if pkt.Data.Size() == 0 {
-		return rawfile.NonBlockingWrite(e.fds[0], pkt.Header.View())
+		return rawfile.NonBlockingWrite(fd, pkt.Header.View())
+	}
+	if pkt.Header.UsedLength() == 0 {
+		return rawfile.NonBlockingWrite(fd, pkt.Data.ToView())
 	}
 
-	return rawfile.NonBlockingWrite3(e.fds[0], pkt.Header.View(), pkt.Data.ToView(), nil)
+	return rawfile.NonBlockingWrite3(fd, pkt.Header.View(), pkt.Data.ToView(), nil)
 }
 
-// WritePackets writes outbound packets to the file descriptor. If it is not
-// currently writable, the packet is dropped.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	var ethHdrBuf []byte
-	// hdr + data
-	iovLen := 2
-	if e.hdrSize > 0 {
-		// Add ethernet header if needed.
-		ethHdrBuf = make([]byte, header.EthernetMinimumSize)
-		eth := header.Ethernet(ethHdrBuf)
-		ethHdr := &header.EthernetFields{
-			DstAddr: r.RemoteLinkAddress,
-			Type:    protocol,
-		}
-
-		// Preserve the src address if it's set in the route.
-		if r.LocalLinkAddress != "" {
-			ethHdr.SrcAddr = r.LocalLinkAddress
-		} else {
-			ethHdr.SrcAddr = e.addr
-		}
-		eth.Encode(ethHdr)
-		iovLen++
-	}
+func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tcpip.Error) {
+	// Send a batch of packets through batchFD.
+	mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch))
+	for _, pkt := range batch {
+		var ethHdrBuf []byte
+		iovLen := 0
+		if e.hdrSize > 0 {
+			// Add ethernet header if needed.
+			ethHdrBuf = make([]byte, header.EthernetMinimumSize)
+			eth := header.Ethernet(ethHdrBuf)
+			ethHdr := &header.EthernetFields{
+				DstAddr: pkt.EgressRoute.RemoteLinkAddress,
+				Type:    pkt.NetworkProtocolNumber,
+			}
 
-	n := len(pkts)
-
-	views := pkts[0].Data.Views()
-	/*
-	 * Each bondary in views can add one more iovec.
-	 *
-	 * payload |      |          |         |
-	 *         -----------------------------
-	 * packets |    |    |    |    |    |  |
-	 *         -----------------------------
-	 * iovecs  |    | |  |    |  | |    |  |
-	 */
-	iovec := make([]syscall.Iovec, n*iovLen+len(views)-1)
-	mmsgHdrs := make([]rawfile.MMsgHdr, n)
-
-	iovecIdx := 0
-	viewIdx := 0
-	viewOff := 0
-	off := 0
-	nextOff := 0
-	for i := range pkts {
-		// TODO(b/134618279): Different packets may have different data
-		// in the future. We should handle this.
-		if !viewsEqual(pkts[i].Data.Views(), views) {
-			panic("All packets in pkts should have the same Data.")
+			// Preserve the src address if it's set in the route.
+			if pkt.EgressRoute.LocalLinkAddress != "" {
+				ethHdr.SrcAddr = pkt.EgressRoute.LocalLinkAddress
+			} else {
+				ethHdr.SrcAddr = e.addr
+			}
+			eth.Encode(ethHdr)
+			iovLen++
 		}
 
-		prevIovecIdx := iovecIdx
-		mmsgHdr := &mmsgHdrs[i]
-		mmsgHdr.Msg.Iov = &iovec[iovecIdx]
-		packetSize := pkts[i].DataSize
-		hdr := &pkts[i].Header
-
-		off = pkts[i].DataOffset
-		if off != nextOff {
-			// We stop in a different point last time.
-			size := packetSize
-			viewIdx = 0
-			viewOff = 0
-			for size > 0 {
-				if size >= len(views[viewIdx]) {
-					viewIdx++
-					viewOff = 0
-					size -= len(views[viewIdx])
-				} else {
-					viewOff = size
-					size = 0
+		vnetHdr := virtioNetHdr{}
+		var vnetHdrBuf []byte
+		if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+			if pkt.GSOOptions != nil {
+				vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
+				if pkt.GSOOptions.NeedsCsum {
+					vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
+					vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
+					vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
+				}
+				if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data.Size()) > pkt.GSOOptions.MSS {
+					switch pkt.GSOOptions.Type {
+					case stack.GSOTCPv4:
+						vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
+					case stack.GSOTCPv6:
+						vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
+					default:
+						panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
+					}
+					vnetHdr.gsoSize = pkt.GSOOptions.MSS
 				}
 			}
+			vnetHdrBuf = binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr)
+			iovLen++
 		}
-		nextOff = off + packetSize
 
+		iovecs := make([]syscall.Iovec, iovLen+1+len(pkt.Data.Views()))
+		var mmsgHdr rawfile.MMsgHdr
+		mmsgHdr.Msg.Iov = &iovecs[0]
+		iovecIdx := 0
+		if vnetHdrBuf != nil {
+			v := &iovecs[iovecIdx]
+			v.Base = &vnetHdrBuf[0]
+			v.Len = uint64(len(vnetHdrBuf))
+			iovecIdx++
+		}
 		if ethHdrBuf != nil {
-			v := &iovec[iovecIdx]
+			v := &iovecs[iovecIdx]
 			v.Base = &ethHdrBuf[0]
 			v.Len = uint64(len(ethHdrBuf))
 			iovecIdx++
 		}
-
-		v := &iovec[iovecIdx]
+		pktSize := uint64(0)
+		// Encode L3 Header
+		v := &iovecs[iovecIdx]
+		hdr := &pkt.Header
 		hdrView := hdr.View()
 		v.Base = &hdrView[0]
 		v.Len = uint64(len(hdrView))
+		pktSize += v.Len
 		iovecIdx++
 
-		for packetSize > 0 {
-			vec := &iovec[iovecIdx]
+		// Now encode the Transport Payload.
+		pktViews := pkt.Data.Views()
+		for i := range pktViews {
+			vec := &iovecs[iovecIdx]
 			iovecIdx++
-
-			v := views[viewIdx]
-			vec.Base = &v[viewOff]
-			s := len(v) - viewOff
-			if s <= packetSize {
-				viewIdx++
-				viewOff = 0
-			} else {
-				s = packetSize
-				viewOff += s
-			}
-			vec.Len = uint64(s)
-			packetSize -= s
+			vec.Base = &pktViews[i][0]
+			vec.Len = uint64(len(pktViews[i]))
+			pktSize += vec.Len
 		}
-
-		mmsgHdr.Msg.Iovlen = uint64(iovecIdx - prevIovecIdx)
+		mmsgHdr.Msg.Iovlen = uint64(iovecIdx)
+		mmsgHdrs = append(mmsgHdrs, mmsgHdr)
 	}
 
 	packets := 0
-	for packets < n {
-		sent, err := rawfile.NonBlockingSendMMsg(e.fds[0], mmsgHdrs)
+	for len(mmsgHdrs) > 0 {
+		sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs)
 		if err != nil {
 			return packets, err
 		}
 		packets += sent
 		mmsgHdrs = mmsgHdrs[sent:]
 	}
+
 	return packets, nil
 }
 
+// WritePackets writes outbound packets to the underlying file descriptors. If
+// one is not currently writable, the packet is dropped.
+//
+// Being a batch API, each packet in pkts should have the following
+// fields populated:
+//  - pkt.EgressRoute
+//  - pkt.GSOOptions
+//  - pkt.NetworkProtocolNumber
+func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// Preallocate to avoid repeated reallocation as we append to batch.
+	// batchSz is 47 because when SWGSO is in use then a single 65KB TCP
+	// segment can get split into 46 segments of 1420 bytes and a single 216
+	// byte segment.
+	const batchSz = 47
+	batch := make([]*stack.PacketBuffer, 0, batchSz)
+	batchFD := -1
+	sentPackets := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if len(batch) == 0 {
+			batchFD = e.fds[pkt.Hash%uint32(len(e.fds))]
+		}
+		pktFD := e.fds[pkt.Hash%uint32(len(e.fds))]
+		if sendNow := pktFD != batchFD; !sendNow {
+			batch = append(batch, pkt)
+			continue
+		}
+		n, err := e.sendBatch(batchFD, batch)
+		sentPackets += n
+		if err != nil {
+			return sentPackets, err
+		}
+		batch = batch[:0]
+		batch = append(batch, pkt)
+		batchFD = pktFD
+	}
+
+	if len(batch) != 0 {
+		n, err := e.sendBatch(batchFD, batch)
+		sentPackets += n
+		if err != nil {
+			return sentPackets, err
+		}
+	}
+	return sentPackets, nil
+}
+
 // viewsEqual tests whether v1 and v2 refer to the same backing bytes.
 func viewsEqual(vs1, vs2 []buffer.View) bool {
 	return len(vs1) == len(vs2) && (len(vs1) == 0 || &vs1[0] == &vs2[0])
@@ -610,8 +641,8 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt)
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // NewInjectable creates a new fd-based InjectableEndpoint.
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 2066987eb..eaee7e5d7 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -45,40 +45,46 @@ const (
 type packetInfo struct {
 	raddr    tcpip.LinkAddress
 	proto    tcpip.NetworkProtocolNumber
-	contents tcpip.PacketBuffer
+	contents *stack.PacketBuffer
 }
 
 type context struct {
-	t    *testing.T
-	fds  [2]int
-	ep   stack.LinkEndpoint
-	ch   chan packetInfo
-	done chan struct{}
+	t        *testing.T
+	readFDs  []int
+	writeFDs []int
+	ep       stack.LinkEndpoint
+	ch       chan packetInfo
+	done     chan struct{}
 }
 
 func newContext(t *testing.T, opt *Options) *context {
-	fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+	firstFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+	if err != nil {
+		t.Fatalf("Socketpair failed: %v", err)
+	}
+	secondFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
 	if err != nil {
 		t.Fatalf("Socketpair failed: %v", err)
 	}
 
-	done := make(chan struct{}, 1)
+	done := make(chan struct{}, 2)
 	opt.ClosedFunc = func(*tcpip.Error) {
 		done <- struct{}{}
 	}
 
-	opt.FDs = []int{fds[1]}
+	opt.FDs = []int{firstFDPair[1], secondFDPair[1]}
 	ep, err := New(opt)
 	if err != nil {
 		t.Fatalf("Failed to create FD endpoint: %v", err)
 	}
 
 	c := &context{
-		t:    t,
-		fds:  fds,
-		ep:   ep,
-		ch:   make(chan packetInfo, 100),
-		done: done,
+		t:        t,
+		readFDs:  []int{firstFDPair[0], secondFDPair[0]},
+		writeFDs: opt.FDs,
+		ep:       ep,
+		ch:       make(chan packetInfo, 100),
+		done:     done,
 	}
 
 	ep.Attach(c)
@@ -87,12 +93,17 @@ func newContext(t *testing.T, opt *Options) *context {
 }
 
 func (c *context) cleanup() {
-	syscall.Close(c.fds[0])
+	for _, fd := range c.readFDs {
+		syscall.Close(fd)
+	}
+	<-c.done
 	<-c.done
-	syscall.Close(c.fds[1])
+	for _, fd := range c.writeFDs {
+		syscall.Close(fd)
+	}
 }
 
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (c *context) DeliverNetworkPacket(remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	c.ch <- packetInfo{remote, protocol, pkt}
 }
 
@@ -136,7 +147,7 @@ func TestAddress(t *testing.T) {
 	}
 }
 
-func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) {
+func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash uint32) {
 	c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth, GSOMaxSize: gsoMaxSize})
 	defer c.cleanup()
 
@@ -168,16 +179,18 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) {
 			L3HdrLen:   header.IPv4MaximumHeaderSize,
 		}
 	}
-	if err := c.ep.WritePacket(r, gso, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(r, gso, proto, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
+		Hash:   hash,
 	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
 	}
 
-	// Read from fd, then compare with what we wrote.
+	// Read from the corresponding FD, then compare with what we wrote.
 	b = make([]byte, mtu)
-	n, err := syscall.Read(c.fds[0], b)
+	fd := c.readFDs[hash%uint32(len(c.readFDs))]
+	n, err := syscall.Read(fd, b)
 	if err != nil {
 		t.Fatalf("Read failed: %v", err)
 	}
@@ -238,7 +251,7 @@ func TestWritePacket(t *testing.T) {
 				t.Run(
 					fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v", eth, plen, gso),
 					func(t *testing.T) {
-						testWritePacket(t, plen, eth, gso)
+						testWritePacket(t, plen, eth, gso, 0)
 					},
 				)
 			}
@@ -246,6 +259,27 @@ func TestWritePacket(t *testing.T) {
 	}
 }
 
+func TestHashedWritePacket(t *testing.T) {
+	lengths := []int{0, 100, 1000}
+	eths := []bool{true, false}
+	gsos := []uint32{0, 32768}
+	hashes := []uint32{0, 1}
+	for _, eth := range eths {
+		for _, plen := range lengths {
+			for _, gso := range gsos {
+				for _, hash := range hashes {
+					t.Run(
+						fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v,Hash=%d", eth, plen, gso, hash),
+						func(t *testing.T) {
+							testWritePacket(t, plen, eth, gso, hash)
+						},
+					)
+				}
+			}
+		}
+	}
+}
+
 func TestPreserveSrcAddress(t *testing.T) {
 	baddr := tcpip.LinkAddress("\xcc\xbb\xaa\x77\x88\x99")
 
@@ -261,7 +295,7 @@ func TestPreserveSrcAddress(t *testing.T) {
 	// WritePacket panics given a prependable with anything less than
 	// the minimum size of the ethernet header.
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
-	if err := c.ep.WritePacket(r, nil /* gso */, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(r, nil /* gso */, proto, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.VectorisedView{},
 	}); err != nil {
@@ -270,7 +304,7 @@ func TestPreserveSrcAddress(t *testing.T) {
 
 	// Read from the FD, then compare with what we wrote.
 	b := make([]byte, mtu)
-	n, err := syscall.Read(c.fds[0], b)
+	n, err := syscall.Read(c.readFDs[0], b)
 	if err != nil {
 		t.Fatalf("Read failed: %v", err)
 	}
@@ -314,7 +348,7 @@ func TestDeliverPacket(t *testing.T) {
 				}
 
 				// Write packet via the file descriptor.
-				if _, err := syscall.Write(c.fds[0], all); err != nil {
+				if _, err := syscall.Write(c.readFDs[0], all); err != nil {
 					t.Fatalf("Write failed: %v", err)
 				}
 
@@ -324,7 +358,7 @@ func TestDeliverPacket(t *testing.T) {
 					want := packetInfo{
 						raddr: raddr,
 						proto: proto,
-						contents: tcpip.PacketBuffer{
+						contents: &stack.PacketBuffer{
 							Data:       buffer.View(b).ToVectorisedView(),
 							LinkHeader: buffer.View(hdr),
 						},
diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go
index 97a477b61..df14eaad1 100644
--- a/pkg/tcpip/link/fdbased/endpoint_unsafe.go
+++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go
@@ -17,16 +17,7 @@
 package fdbased
 
 import (
-	"reflect"
 	"unsafe"
 )
 
 const virtioNetHdrSize = int(unsafe.Sizeof(virtioNetHdr{}))
-
-func vnetHdrToByteSlice(hdr *virtioNetHdr) (slice []byte) {
-	sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
-	sh.Data = uintptr(unsafe.Pointer(hdr))
-	sh.Len = virtioNetHdrSize
-	sh.Cap = virtioNetHdrSize
-	return
-}
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index 62ed1e569..2dfd29aa9 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
 const (
@@ -190,7 +191,7 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	pkt = pkt[d.e.hdrSize:]
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, tcpip.PacketBuffer{
+	d.e.dispatcher.DeliverNetworkPacket(remote, local, p, &stack.PacketBuffer{
 		Data:       buffer.View(pkt).ToVectorisedView(),
 		LinkHeader: buffer.View(eth),
 	})
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index c67d684ce..f04738cfb 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -139,13 +139,13 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	used := d.capViews(n, BufConfig)
-	pkt := tcpip.PacketBuffer{
+	pkt := &stack.PacketBuffer{
 		Data:       buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)),
 		LinkHeader: buffer.View(eth),
 	}
 	pkt.Data.TrimFront(d.e.hdrSize)
 
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
+	d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
 
 	// Prepare e.views for another packet: release used views.
 	for i := 0; i < used; i++ {
@@ -169,7 +169,7 @@ type recvMMsgDispatcher struct {
 
 	// iovecs is an array of array of iovec records where each iovec base
 	// pointer and length are initialzed to the corresponding view above,
-	// except when GSO is neabled then the first iovec in each array of
+	// except when GSO is enabled then the first iovec in each array of
 	// iovecs points to a buffer for the vnet header which is stripped
 	// before the views are passed up the stack for further processing.
 	iovecs [][]syscall.Iovec
@@ -296,12 +296,12 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		}
 
 		used := d.capViews(k, int(n), BufConfig)
-		pkt := tcpip.PacketBuffer{
+		pkt := &stack.PacketBuffer{
 			Data:       buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)),
 			LinkHeader: buffer.View(eth),
 		}
 		pkt.Data.TrimFront(d.e.hdrSize)
-		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
+		d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
 
 		// Prepare e.views for another packet: release used views.
 		for i := 0; i < used; i++ {
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 499cc608f..568c6874f 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -76,7 +76,7 @@ func (*endpoint) Wait() {}
 
 // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
 // packets to the network-layer dispatcher.
-func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 	views[0] = pkt.Header.View()
 	views = append(views, pkt.Data.Views()...)
@@ -84,7 +84,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 	// Because we're immediately turning around and writing the packet back
 	// to the rx path, we intentionally don't preserve the remote and local
 	// link addresses from the stack.Route we're passed.
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, tcpip.PacketBuffer{
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, &stack.PacketBuffer{
 		Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 	})
 
@@ -92,21 +92,21 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	// Reject the packet if it's shorter than an ethernet header.
-	if vv.Size() < header.EthernetMinimumSize {
+	// There should be an ethernet header at the beginning of vv.
+	hdr, ok := vv.PullUp(header.EthernetMinimumSize)
+	if !ok {
+		// Reject the packet if it's shorter than an ethernet header.
 		return tcpip.ErrBadAddress
 	}
-
-	// There should be an ethernet header at the beginning of vv.
-	linkHeader := header.Ethernet(vv.First()[:header.EthernetMinimumSize])
+	linkHeader := header.Ethernet(hdr)
 	vv.TrimFront(len(linkHeader))
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), tcpip.PacketBuffer{
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, linkHeader.Type(), &stack.PacketBuffer{
 		Data:       vv,
 		LinkHeader: buffer.View(linkHeader),
 	})
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 445b22c17..c69d6b7e9 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -80,14 +80,14 @@ func (m *InjectableEndpoint) IsAttached() bool {
 }
 
 // InjectInbound implements stack.InjectableLinkEndpoint.
-func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
-	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, pkt)
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	m.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // WritePackets writes outbound packets to the appropriate
 // LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if
 // r.RemoteAddress has a route registered in this endpoint.
-func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	endpoint, ok := m.routes[r.RemoteAddress]
 	if !ok {
 		return 0, tcpip.ErrNoRoute
@@ -98,7 +98,7 @@ func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts [
 // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
 // based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
 // route registered in this endpoint.
-func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	if endpoint, ok := m.routes[r.RemoteAddress]; ok {
 		return endpoint.WritePacket(r, gso, protocol, pkt)
 	}
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
index 63b249837..0744f66d6 100644
--- a/pkg/tcpip/link/muxed/injectable_test.go
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -50,7 +50,7 @@ func TestInjectableEndpointDispatch(t *testing.T) {
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
 
-	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(),
 	})
@@ -70,7 +70,7 @@ func TestInjectableEndpointDispatchHdrOnly(t *testing.T) {
 	hdr := buffer.NewPrependable(1)
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
-	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.NewView(0).ToVectorisedView(),
 	})
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/link/qdisc/fifo/BUILD
index d1b73cfdf..054c213bc 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/link/qdisc/fifo/BUILD
@@ -3,16 +3,17 @@ load("//tools:defs.bzl", "go_library")
 package(licenses = ["notice"])
 
 go_library(
-    name = "iptables",
+    name = "fifo",
     srcs = [
-        "iptables.go",
-        "targets.go",
-        "types.go",
+        "endpoint.go",
+        "packet_buffer_queue.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "//pkg/log",
+        "//pkg/sleep",
+        "//pkg/sync",
         "//pkg/tcpip",
-        "//pkg/tcpip/header",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
     ],
 )
diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go
new file mode 100644
index 000000000..b5dfb7850
--- /dev/null
+++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go
@@ -0,0 +1,209 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fifo provides the implementation of data-link layer endpoints that
+// wrap another endpoint and queues all outbound packets and asynchronously
+// dispatches them to the lower endpoint.
+package fifo
+
+import (
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// endpoint represents a LinkEndpoint which implements a FIFO queue for all
+// outgoing packets. endpoint can have 1 or more underlying queueDispatchers.
+// All outgoing packets are consistenly hashed to a single underlying queue
+// using the PacketBuffer.Hash if set, otherwise all packets are queued to the
+// first queue to avoid reordering in case of missing hash.
+type endpoint struct {
+	dispatcher  stack.NetworkDispatcher
+	lower       stack.LinkEndpoint
+	wg          sync.WaitGroup
+	dispatchers []*queueDispatcher
+}
+
+// queueDispatcher is responsible for dispatching all outbound packets in its
+// queue. It will also smartly batch packets when possible and write them
+// through the lower LinkEndpoint.
+type queueDispatcher struct {
+	lower          stack.LinkEndpoint
+	q              *packetBufferQueue
+	newPacketWaker sleep.Waker
+	closeWaker     sleep.Waker
+}
+
+// New creates a new fifo link endpoint with the n queues with maximum
+// capacity of queueLen.
+func New(lower stack.LinkEndpoint, n int, queueLen int) stack.LinkEndpoint {
+	e := &endpoint{
+		lower: lower,
+	}
+	// Create the required dispatchers
+	for i := 0; i < n; i++ {
+		qd := &queueDispatcher{
+			q:     &packetBufferQueue{limit: queueLen},
+			lower: lower,
+		}
+		e.dispatchers = append(e.dispatchers, qd)
+		e.wg.Add(1)
+		go func() {
+			defer e.wg.Done()
+			qd.dispatchLoop()
+		}()
+	}
+	return e
+}
+
+func (q *queueDispatcher) dispatchLoop() {
+	const newPacketWakerID = 1
+	const closeWakerID = 2
+	s := sleep.Sleeper{}
+	s.AddWaker(&q.newPacketWaker, newPacketWakerID)
+	s.AddWaker(&q.closeWaker, closeWakerID)
+	defer s.Done()
+
+	const batchSize = 32
+	var batch stack.PacketBufferList
+	for {
+		id, ok := s.Fetch(true)
+		if ok && id == closeWakerID {
+			return
+		}
+		for pkt := q.q.dequeue(); pkt != nil; pkt = q.q.dequeue() {
+			batch.PushBack(pkt)
+			if batch.Len() < batchSize && !q.q.empty() {
+				continue
+			}
+			// We pass a protocol of zero here because each packet carries its
+			// NetworkProtocol.
+			q.lower.WritePackets(nil /* route */, nil /* gso */, batch, 0 /* protocol */)
+			for pkt := batch.Front(); pkt != nil; pkt = pkt.Next() {
+				pkt.EgressRoute.Release()
+				batch.Remove(pkt)
+			}
+			batch.Reset()
+		}
+	}
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket.
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
+}
+
+// Attach implements stack.LinkEndpoint.Attach.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.dispatcher = dispatcher
+	e.lower.Attach(e)
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *endpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU.
+func (e *endpoint) MTU() uint32 {
+	return e.lower.MTU()
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return e.lower.Capabilities()
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength.
+func (e *endpoint) MaxHeaderLength() uint16 {
+	return e.lower.MaxHeaderLength()
+}
+
+// LinkAddress implements stack.LinkEndpoint.LinkAddress.
+func (e *endpoint) LinkAddress() tcpip.LinkAddress {
+	return e.lower.LinkAddress()
+}
+
+// GSOMaxSize returns the maximum GSO packet size.
+func (e *endpoint) GSOMaxSize() uint32 {
+	if gso, ok := e.lower.(stack.GSOEndpoint); ok {
+		return gso.GSOMaxSize()
+	}
+	return 0
+}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	// WritePacket caller's do not set the following fields in PacketBuffer
+	// so we populate them here.
+	newRoute := r.Clone()
+	pkt.EgressRoute = &newRoute
+	pkt.GSOOptions = gso
+	pkt.NetworkProtocolNumber = protocol
+	d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)]
+	if !d.q.enqueue(pkt) {
+		return tcpip.ErrNoBufferSpace
+	}
+	d.newPacketWaker.Assert()
+	return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+//
+// Being a batch API, each packet in pkts should have the following fields
+// populated:
+//   - pkt.EgressRoute
+//   - pkt.GSOOptions
+//   - pkt.NetworkProtocolNumber
+func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	enqueued := 0
+	for pkt := pkts.Front(); pkt != nil; {
+		d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)]
+		nxt := pkt.Next()
+		// Since qdisc can hold onto a packet for long we should Clone
+		// the route here to ensure it doesn't get released while the
+		// packet is still in our queue.
+		newRoute := pkt.EgressRoute.Clone()
+		pkt.EgressRoute = &newRoute
+		if !d.q.enqueue(pkt) {
+			if enqueued > 0 {
+				d.newPacketWaker.Assert()
+			}
+			return enqueued, tcpip.ErrNoBufferSpace
+		}
+		pkt = nxt
+		enqueued++
+		d.newPacketWaker.Assert()
+	}
+	return enqueued, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	return e.lower.WriteRawPacket(vv)
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (e *endpoint) Wait() {
+	e.lower.Wait()
+
+	// The linkEP is gone. Teardown the outbound dispatcher goroutines.
+	for i := range e.dispatchers {
+		e.dispatchers[i].closeWaker.Assert()
+	}
+
+	e.wg.Wait()
+}
diff --git a/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go b/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go
new file mode 100644
index 000000000..eb5abb906
--- /dev/null
+++ b/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go
@@ -0,0 +1,84 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fifo
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// packetBufferQueue is a bounded, thread-safe queue of PacketBuffers.
+//
+type packetBufferQueue struct {
+	mu    sync.Mutex
+	list  stack.PacketBufferList
+	limit int
+	used  int
+}
+
+// emptyLocked determines if the queue is empty.
+// Preconditions: q.mu must be held.
+func (q *packetBufferQueue) emptyLocked() bool {
+	return q.used == 0
+}
+
+// empty determines if the queue is empty.
+func (q *packetBufferQueue) empty() bool {
+	q.mu.Lock()
+	r := q.emptyLocked()
+	q.mu.Unlock()
+
+	return r
+}
+
+// setLimit updates the limit. No PacketBuffers are immediately dropped in case
+// the queue becomes full due to the new limit.
+func (q *packetBufferQueue) setLimit(limit int) {
+	q.mu.Lock()
+	q.limit = limit
+	q.mu.Unlock()
+}
+
+// enqueue adds the given packet to the queue.
+//
+// Returns true when the PacketBuffer is successfully added to the queue, in
+// which case ownership of the reference is transferred to the queue. And
+// returns false if the queue is full, in which case ownership is retained by
+// the caller.
+func (q *packetBufferQueue) enqueue(s *stack.PacketBuffer) bool {
+	q.mu.Lock()
+	r := q.used < q.limit
+	if r {
+		q.list.PushBack(s)
+		q.used++
+	}
+	q.mu.Unlock()
+
+	return r
+}
+
+// dequeue removes and returns the next PacketBuffer from queue, if one exists.
+// Ownership is transferred to the caller.
+func (q *packetBufferQueue) dequeue() *stack.PacketBuffer {
+	q.mu.Lock()
+	s := q.list.Front()
+	if s != nil {
+		q.list.Remove(s)
+		q.used--
+	}
+	q.mu.Unlock()
+
+	return s
+}
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
index 0b5a6cf49..99313ee25 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
@@ -14,7 +14,7 @@
 
 // +build linux,amd64 linux,arm64
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 655e537c4..0374a2441 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -185,7 +185,7 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress {
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	// Add the ethernet header here.
 	eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
 	pkt.LinkHeader = buffer.View(eth)
@@ -214,7 +214,7 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -275,7 +275,7 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
 
 		// Send packet up the stack.
 		eth := header.Ethernet(b[:header.EthernetMinimumSize])
-		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), tcpip.PacketBuffer{
+		d.DeliverNetworkPacket(eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), &stack.PacketBuffer{
 			Data:       buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
 			LinkHeader: buffer.View(eth),
 		})
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 5c729a439..28a2e88ba 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -131,7 +131,7 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
 	return c
 }
 
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (c *testContext) DeliverNetworkPacket(remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	c.mu.Lock()
 	c.packets = append(c.packets, packetInfo{
 		addr:  remoteLinkAddr,
@@ -273,7 +273,7 @@ func TestSimpleSend(t *testing.T) {
 			randomFill(buf)
 
 			proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-			if err := c.ep.WritePacket(&r, nil /* gso */, proto, tcpip.PacketBuffer{
+			if err := c.ep.WritePacket(&r, nil /* gso */, proto, &stack.PacketBuffer{
 				Header: hdr,
 				Data:   buf.ToVectorisedView(),
 			}); err != nil {
@@ -345,7 +345,7 @@ func TestPreserveSrcAddressInSend(t *testing.T) {
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
 
 	proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-	if err := c.ep.WritePacket(&r, nil /* gso */, proto, tcpip.PacketBuffer{
+	if err := c.ep.WritePacket(&r, nil /* gso */, proto, &stack.PacketBuffer{
 		Header: hdr,
 	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
@@ -401,7 +401,7 @@ func TestFillTxQueue(t *testing.T) {
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -419,7 +419,7 @@ func TestFillTxQueue(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	}); err != want {
@@ -447,7 +447,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	// Send two packets so that the id slice has at least two slots.
 	for i := 2; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -470,7 +470,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -488,7 +488,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	}); err != want {
@@ -514,7 +514,7 @@ func TestFillTxMemory(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queueDataSize / bufferSize; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -533,7 +533,7 @@ func TestFillTxMemory(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	})
@@ -561,7 +561,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// until there is only one buffer left.
 	for i := queueDataSize/bufferSize - 1; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -577,7 +577,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 		uu := buffer.NewView(bufferSize).ToVectorisedView()
-		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   uu,
 		}); err != want {
@@ -588,7 +588,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// Attempt to write the one-buffer packet again. It must succeed.
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, tcpip.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -674,7 +674,7 @@ func TestSimpleReceive(t *testing.T) {
 		// Wait for packet to be received, then check it.
 		c.waitForPackets(1, time.After(5*time.Second), "Timeout waiting for packet")
 		c.mu.Lock()
-		rcvd := []byte(c.packets[0].vv.First())
+		rcvd := []byte(c.packets[0].vv.ToView())
 		c.packets = c.packets[:0]
 		c.mu.Unlock()
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 3392b7edd..ae3186314 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -21,11 +21,9 @@
 package sniffer
 
 import (
-	"bytes"
 	"encoding/binary"
 	"fmt"
 	"io"
-	"os"
 	"sync/atomic"
 	"time"
 
@@ -42,12 +40,12 @@ import (
 // LogPackets must be accessed atomically.
 var LogPackets uint32 = 1
 
-// LogPacketsToFile is a flag used to enable or disable logging packets to a
-// pcap file. Valid values are 0 or 1. A file must have been specified when the
+// LogPacketsToPCAP is a flag used to enable or disable logging packets to a
+// pcap writer. Valid values are 0 or 1. A writer must have been specified when the
 // sniffer was created for this flag to have effect.
 //
-// LogPacketsToFile must be accessed atomically.
-var LogPacketsToFile uint32 = 1
+// LogPacketsToPCAP must be accessed atomically.
+var LogPacketsToPCAP uint32 = 1
 
 var transportProtocolMinSizes map[tcpip.TransportProtocolNumber]int = map[tcpip.TransportProtocolNumber]int{
 	header.ICMPv4ProtocolNumber: header.IPv4MinimumSize,
@@ -59,7 +57,7 @@ var transportProtocolMinSizes map[tcpip.TransportProtocolNumber]int = map[tcpip.
 type endpoint struct {
 	dispatcher stack.NetworkDispatcher
 	lower      stack.LinkEndpoint
-	file       *os.File
+	writer     io.Writer
 	maxPCAPLen uint32
 }
 
@@ -99,23 +97,22 @@ func writePCAPHeader(w io.Writer, maxLen uint32) error {
 	})
 }
 
-// NewWithFile creates a new sniffer link-layer endpoint. It wraps around
-// another endpoint and logs packets and they traverse the endpoint.
+// NewWithWriter creates a new sniffer link-layer endpoint. It wraps around
+// another endpoint and logs packets as they traverse the endpoint.
 //
-// Packets can be logged to file in the pcap format. A sniffer created
-// with this function will not emit packets using the standard log
-// package.
+// Packets are logged to writer in the pcap format. A sniffer created with this
+// function will not emit packets using the standard log package.
 //
 // snapLen is the maximum amount of a packet to be saved. Packets with a length
-// less than or equal too snapLen will be saved in their entirety. Longer
+// less than or equal to snapLen will be saved in their entirety. Longer
 // packets will be truncated to snapLen.
-func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack.LinkEndpoint, error) {
-	if err := writePCAPHeader(file, snapLen); err != nil {
+func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (stack.LinkEndpoint, error) {
+	if err := writePCAPHeader(writer, snapLen); err != nil {
 		return nil, err
 	}
 	return &endpoint{
 		lower:      lower,
-		file:       file,
+		writer:     writer,
 		maxPCAPLen: snapLen,
 	}, nil
 }
@@ -123,38 +120,9 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
-	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("recv", protocol, pkt.Data.First(), nil)
-	}
-	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		vs := pkt.Data.Views()
-		length := pkt.Data.Size()
-		if length > int(e.maxPCAPLen) {
-			length = int(e.maxPCAPLen)
-		}
-
-		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(pkt.Data.Size()))); err != nil {
-			panic(err)
-		}
-		for _, v := range vs {
-			if length == 0 {
-				break
-			}
-			if len(v) > length {
-				v = v[:length]
-			}
-			if _, err := buf.Write([]byte(v)); err != nil {
-				panic(err)
-			}
-			length -= len(v)
-		}
-		if _, err := e.file.Write(buf.Bytes()); err != nil {
-			panic(err)
-		}
-	}
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dumpPacket("recv", nil, protocol, pkt)
+	e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
 }
 
 // Attach implements the stack.LinkEndpoint interface. It saves the dispatcher
@@ -200,31 +168,39 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
-	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send", protocol, pkt.Header.View(), gso)
+func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	writer := e.writer
+	if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
+		logPacket(prefix, protocol, pkt, gso)
 	}
-	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		hdrBuf := pkt.Header.View()
-		length := len(hdrBuf) + pkt.Data.Size()
-		if length > int(e.maxPCAPLen) {
-			length = int(e.maxPCAPLen)
+	if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
+		totalLength := pkt.Header.UsedLength() + pkt.Data.Size()
+		length := totalLength
+		if max := int(e.maxPCAPLen); length > max {
+			length = max
 		}
-
-		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(len(hdrBuf)+pkt.Data.Size()))); err != nil {
+		if err := binary.Write(writer, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(totalLength))); err != nil {
 			panic(err)
 		}
-		if len(hdrBuf) > length {
-			hdrBuf = hdrBuf[:length]
-		}
-		if _, err := buf.Write(hdrBuf); err != nil {
-			panic(err)
+		write := func(b []byte) {
+			if len(b) > length {
+				b = b[:length]
+			}
+			for len(b) != 0 {
+				n, err := writer.Write(b)
+				if err != nil {
+					panic(err)
+				}
+				b = b[n:]
+				length -= n
+			}
 		}
-		length -= len(hdrBuf)
-		logVectorisedView(pkt.Data, length, buf)
-		if _, err := e.file.Write(buf.Bytes()); err != nil {
-			panic(err)
+		write(pkt.Header.View())
+		for _, view := range pkt.Data.Views() {
+			if length == 0 {
+				break
+			}
+			write(view)
 		}
 	}
 }
@@ -232,71 +208,33 @@ func (e *endpoint) dumpPacket(gso *stack.GSO, protocol tcpip.NetworkProtocolNumb
 // WritePacket implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
-	e.dumpPacket(gso, protocol, pkt)
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	e.dumpPacket("send", gso, protocol, pkt)
 	return e.lower.WritePacket(r, gso, protocol, pkt)
 }
 
 // WritePackets implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	view := pkts[0].Data.ToView()
-	for _, pkt := range pkts {
-		e.dumpPacket(gso, protocol, tcpip.PacketBuffer{
-			Header: pkt.Header,
-			Data:   view[pkt.DataOffset:][:pkt.DataSize].ToVectorisedView(),
-		})
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		e.dumpPacket("send", gso, protocol, pkt)
 	}
 	return e.lower.WritePackets(r, gso, pkts, protocol)
 }
 
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
-	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("send", 0, buffer.View("[raw packet, no header available]"), nil /* gso */)
-	}
-	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		length := vv.Size()
-		if length > int(e.maxPCAPLen) {
-			length = int(e.maxPCAPLen)
-		}
-
-		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(vv.Size()))); err != nil {
-			panic(err)
-		}
-		logVectorisedView(vv, length, buf)
-		if _, err := e.file.Write(buf.Bytes()); err != nil {
-			panic(err)
-		}
-	}
+	e.dumpPacket("send", nil, 0, &stack.PacketBuffer{
+		Data: vv,
+	})
 	return e.lower.WriteRawPacket(vv)
 }
 
-func logVectorisedView(vv buffer.VectorisedView, length int, buf *bytes.Buffer) {
-	if length <= 0 {
-		return
-	}
-	for _, v := range vv.Views() {
-		if len(v) > length {
-			v = v[:length]
-		}
-		n, err := buf.Write(v)
-		if err != nil {
-			panic(err)
-		}
-		length -= n
-		if length == 0 {
-			return
-		}
-	}
-}
-
 // Wait implements stack.LinkEndpoint.Wait.
-func (*endpoint) Wait() {}
+func (e *endpoint) Wait() { e.lower.Wait() }
 
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
 	// Figure out the network layer info.
 	var transProto uint8
 	src := tcpip.Address("unknown")
@@ -305,28 +243,49 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 	size := uint16(0)
 	var fragmentOffset uint16
 	var moreFragments bool
+
+	// Create a clone of pkt, including any headers if present. Avoid allocating
+	// backing memory for the clone.
+	views := [8]buffer.View{}
+	vv := buffer.NewVectorisedView(0, views[:0])
+	vv.AppendView(pkt.Header.View())
+	vv.Append(pkt.Data)
+
 	switch protocol {
 	case header.IPv4ProtocolNumber:
-		ipv4 := header.IPv4(b)
+		hdr, ok := vv.PullUp(header.IPv4MinimumSize)
+		if !ok {
+			return
+		}
+		ipv4 := header.IPv4(hdr)
 		fragmentOffset = ipv4.FragmentOffset()
 		moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
 		src = ipv4.SourceAddress()
 		dst = ipv4.DestinationAddress()
 		transProto = ipv4.Protocol()
 		size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
-		b = b[ipv4.HeaderLength():]
+		vv.TrimFront(int(ipv4.HeaderLength()))
 		id = int(ipv4.ID())
 
 	case header.IPv6ProtocolNumber:
-		ipv6 := header.IPv6(b)
+		hdr, ok := vv.PullUp(header.IPv6MinimumSize)
+		if !ok {
+			return
+		}
+		ipv6 := header.IPv6(hdr)
 		src = ipv6.SourceAddress()
 		dst = ipv6.DestinationAddress()
 		transProto = ipv6.NextHeader()
 		size = ipv6.PayloadLength()
-		b = b[header.IPv6MinimumSize:]
+		vv.TrimFront(header.IPv6MinimumSize)
 
 	case header.ARPProtocolNumber:
-		arp := header.ARP(b)
+		hdr, ok := vv.PullUp(header.ARPSize)
+		if !ok {
+			return
+		}
+		vv.TrimFront(header.ARPSize)
+		arp := header.ARP(hdr)
 		log.Infof(
 			"%s arp %v (%v) -> %v (%v) valid:%v",
 			prefix,
@@ -342,7 +301,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	// We aren't guaranteed to have a transport header - it's possible for
 	// writes via raw endpoints to contain only network headers.
-	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && len(b) < minSize {
+	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && vv.Size() < minSize {
 		log.Infof("%s %v -> %v transport protocol: %d, but no transport header found (possible raw packet)", prefix, src, dst, transProto)
 		return
 	}
@@ -355,7 +314,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 	switch tcpip.TransportProtocolNumber(transProto) {
 	case header.ICMPv4ProtocolNumber:
 		transName = "icmp"
-		icmp := header.ICMPv4(b)
+		hdr, ok := vv.PullUp(header.ICMPv4MinimumSize)
+		if !ok {
+			break
+		}
+		icmp := header.ICMPv4(hdr)
 		icmpType := "unknown"
 		if fragmentOffset == 0 {
 			switch icmp.Type() {
@@ -388,7 +351,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.ICMPv6ProtocolNumber:
 		transName = "icmp"
-		icmp := header.ICMPv6(b)
+		hdr, ok := vv.PullUp(header.ICMPv6MinimumSize)
+		if !ok {
+			break
+		}
+		icmp := header.ICMPv6(hdr)
 		icmpType := "unknown"
 		switch icmp.Type() {
 		case header.ICMPv6DstUnreachable:
@@ -419,8 +386,12 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.UDPProtocolNumber:
 		transName = "udp"
-		udp := header.UDP(b)
-		if fragmentOffset == 0 && len(udp) >= header.UDPMinimumSize {
+		hdr, ok := vv.PullUp(header.UDPMinimumSize)
+		if !ok {
+			break
+		}
+		udp := header.UDP(hdr)
+		if fragmentOffset == 0 {
 			srcPort = udp.SourcePort()
 			dstPort = udp.DestinationPort()
 			details = fmt.Sprintf("xsum: 0x%x", udp.Checksum())
@@ -429,15 +400,19 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
 
 	case header.TCPProtocolNumber:
 		transName = "tcp"
-		tcp := header.TCP(b)
-		if fragmentOffset == 0 && len(tcp) >= header.TCPMinimumSize {
+		hdr, ok := vv.PullUp(header.TCPMinimumSize)
+		if !ok {
+			break
+		}
+		tcp := header.TCP(hdr)
+		if fragmentOffset == 0 {
 			offset := int(tcp.DataOffset())
 			if offset < header.TCPMinimumSize {
 				details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
 				break
 			}
-			if offset > len(tcp) && !moreFragments {
-				details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, len(tcp))
+			if offset > vv.Size() && !moreFragments {
+				details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, vv.Size())
 				break
 			}
 
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index e5096ea38..e0db6cf54 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -4,6 +4,22 @@ package(licenses = ["notice"])
 
 go_library(
     name = "tun",
-    srcs = ["tun_unsafe.go"],
+    srcs = [
+        "device.go",
+        "protocol.go",
+        "tun_unsafe.go",
+    ],
     visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/refs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
 )
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
new file mode 100644
index 000000000..6bc9033d0
--- /dev/null
+++ b/pkg/tcpip/link/tun/device.go
@@ -0,0 +1,358 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// drivers/net/tun.c:tun_net_init()
+	defaultDevMtu = 1500
+
+	// Queue length for outbound packet, arriving at fd side for read. Overflow
+	// causes packet drops. gVisor implementation-specific.
+	defaultDevOutQueueLen = 1024
+)
+
+var zeroMAC [6]byte
+
+// Device is an opened /dev/net/tun device.
+//
+// +stateify savable
+type Device struct {
+	waiter.Queue
+
+	mu           sync.RWMutex `state:"nosave"`
+	endpoint     *tunEndpoint
+	notifyHandle *channel.NotificationHandle
+	flags        uint16
+}
+
+// beforeSave is invoked by stateify.
+func (d *Device) beforeSave() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	// TODO(b/110961832): Restore the device to stack. At this moment, the stack
+	// is not savable.
+	if d.endpoint != nil {
+		panic("/dev/net/tun does not support save/restore when a device is associated with it.")
+	}
+}
+
+// Release implements fs.FileOperations.Release.
+func (d *Device) Release() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	// Decrease refcount if there is an endpoint associated with this file.
+	if d.endpoint != nil {
+		d.endpoint.RemoveNotify(d.notifyHandle)
+		d.endpoint.DecRef()
+		d.endpoint = nil
+	}
+}
+
+// SetIff services TUNSETIFF ioctl(2) request.
+func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	if d.endpoint != nil {
+		return syserror.EINVAL
+	}
+
+	// Input validations.
+	isTun := flags&linux.IFF_TUN != 0
+	isTap := flags&linux.IFF_TAP != 0
+	supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI)
+	if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 {
+		return syserror.EINVAL
+	}
+
+	prefix := "tun"
+	if isTap {
+		prefix = "tap"
+	}
+
+	linkCaps := stack.CapabilityNone
+	if isTap {
+		linkCaps |= stack.CapabilityResolutionRequired
+	}
+
+	endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps)
+	if err != nil {
+		return syserror.EINVAL
+	}
+
+	d.endpoint = endpoint
+	d.notifyHandle = d.endpoint.AddNotify(d)
+	d.flags = flags
+	return nil
+}
+
+func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) {
+	for {
+		// 1. Try to attach to an existing NIC.
+		if name != "" {
+			if nic, found := s.GetNICByName(name); found {
+				endpoint, ok := nic.LinkEndpoint().(*tunEndpoint)
+				if !ok {
+					// Not a NIC created by tun device.
+					return nil, syserror.EOPNOTSUPP
+				}
+				if !endpoint.TryIncRef() {
+					// Race detected: NIC got deleted in between.
+					continue
+				}
+				return endpoint, nil
+			}
+		}
+
+		// 2. Creating a new NIC.
+		id := tcpip.NICID(s.UniqueID())
+		endpoint := &tunEndpoint{
+			Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""),
+			stack:    s,
+			nicID:    id,
+			name:     name,
+		}
+		endpoint.Endpoint.LinkEPCapabilities = linkCaps
+		if endpoint.name == "" {
+			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
+		}
+		err := s.CreateNICWithOptions(endpoint.nicID, endpoint, stack.NICOptions{
+			Name: endpoint.name,
+		})
+		switch err {
+		case nil:
+			return endpoint, nil
+		case tcpip.ErrDuplicateNICID:
+			// Race detected: A NIC has been created in between.
+			continue
+		default:
+			return nil, syserror.EINVAL
+		}
+	}
+}
+
+// Write inject one inbound packet to the network interface.
+func (d *Device) Write(data []byte) (int64, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return 0, syserror.EBADFD
+	}
+	if !endpoint.IsAttached() {
+		return 0, syserror.EIO
+	}
+
+	dataLen := int64(len(data))
+
+	// Packet information.
+	var pktInfoHdr PacketInfoHeader
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		if len(data) < PacketInfoHeaderSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		pktInfoHdr = PacketInfoHeader(data[:PacketInfoHeaderSize])
+		data = data[PacketInfoHeaderSize:]
+	}
+
+	// Ethernet header (TAP only).
+	var ethHdr header.Ethernet
+	if d.hasFlags(linux.IFF_TAP) {
+		if len(data) < header.EthernetMinimumSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		ethHdr = header.Ethernet(data[:header.EthernetMinimumSize])
+		data = data[header.EthernetMinimumSize:]
+	}
+
+	// Try to determine network protocol number, default zero.
+	var protocol tcpip.NetworkProtocolNumber
+	switch {
+	case pktInfoHdr != nil:
+		protocol = pktInfoHdr.Protocol()
+	case ethHdr != nil:
+		protocol = ethHdr.Type()
+	}
+
+	// Try to determine remote link address, default zero.
+	var remote tcpip.LinkAddress
+	switch {
+	case ethHdr != nil:
+		remote = ethHdr.SourceAddress()
+	default:
+		remote = tcpip.LinkAddress(zeroMAC[:])
+	}
+
+	pkt := &stack.PacketBuffer{
+		Data: buffer.View(data).ToVectorisedView(),
+	}
+	if ethHdr != nil {
+		pkt.LinkHeader = buffer.View(ethHdr)
+	}
+	endpoint.InjectLinkAddr(protocol, remote, pkt)
+	return dataLen, nil
+}
+
+// Read reads one outgoing packet from the network interface.
+func (d *Device) Read() ([]byte, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return nil, syserror.EBADFD
+	}
+
+	for {
+		info, ok := endpoint.Read()
+		if !ok {
+			return nil, syserror.ErrWouldBlock
+		}
+
+		v, ok := d.encodePkt(&info)
+		if !ok {
+			// Ignore unsupported packet.
+			continue
+		}
+		return v, nil
+	}
+}
+
+// encodePkt encodes packet for fd side.
+func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
+	var vv buffer.VectorisedView
+
+	// Packet information.
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		hdr := make(PacketInfoHeader, PacketInfoHeaderSize)
+		hdr.Encode(&PacketInfoFields{
+			Protocol: info.Proto,
+		})
+		vv.AppendView(buffer.View(hdr))
+	}
+
+	// If the packet does not already have link layer header, and the route
+	// does not exist, we can't compute it. This is possibly a raw packet, tun
+	// device doesn't support this at the moment.
+	if info.Pkt.LinkHeader == nil && info.Route.RemoteLinkAddress == "" {
+		return nil, false
+	}
+
+	// Ethernet header (TAP only).
+	if d.hasFlags(linux.IFF_TAP) {
+		// Add ethernet header if not provided.
+		if info.Pkt.LinkHeader == nil {
+			hdr := &header.EthernetFields{
+				SrcAddr: info.Route.LocalLinkAddress,
+				DstAddr: info.Route.RemoteLinkAddress,
+				Type:    info.Proto,
+			}
+			if hdr.SrcAddr == "" {
+				hdr.SrcAddr = d.endpoint.LinkAddress()
+			}
+
+			eth := make(header.Ethernet, header.EthernetMinimumSize)
+			eth.Encode(hdr)
+			vv.AppendView(buffer.View(eth))
+		} else {
+			vv.AppendView(info.Pkt.LinkHeader)
+		}
+	}
+
+	// Append upper headers.
+	vv.AppendView(buffer.View(info.Pkt.Header.View()[len(info.Pkt.LinkHeader):]))
+	// Append data payload.
+	vv.Append(info.Pkt.Data)
+
+	return vv.ToView(), true
+}
+
+// Name returns the name of the attached network interface. Empty string if
+// unattached.
+func (d *Device) Name() string {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	if d.endpoint != nil {
+		return d.endpoint.name
+	}
+	return ""
+}
+
+// Flags returns the flags set for d. Zero value if unset.
+func (d *Device) Flags() uint16 {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	return d.flags
+}
+
+func (d *Device) hasFlags(flags uint16) bool {
+	return d.flags&flags == flags
+}
+
+// Readiness implements watier.Waitable.Readiness.
+func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if mask&waiter.EventIn != 0 {
+		d.mu.RLock()
+		endpoint := d.endpoint
+		d.mu.RUnlock()
+		if endpoint != nil && endpoint.NumQueued() == 0 {
+			mask &= ^waiter.EventIn
+		}
+	}
+	return mask & (waiter.EventIn | waiter.EventOut)
+}
+
+// WriteNotify implements channel.Notification.WriteNotify.
+func (d *Device) WriteNotify() {
+	d.Notify(waiter.EventIn)
+}
+
+// tunEndpoint is the link endpoint for the NIC created by the tun device.
+//
+// It is ref-counted as multiple opening files can attach to the same NIC.
+// The last owner is responsible for deleting the NIC.
+type tunEndpoint struct {
+	*channel.Endpoint
+
+	refs.AtomicRefCount
+
+	stack *stack.Stack
+	nicID tcpip.NICID
+	name  string
+}
+
+// DecRef decrements refcount of e, removes NIC if refcount goes to 0.
+func (e *tunEndpoint) DecRef() {
+	e.DecRefWithDestructor(func() {
+		e.stack.RemoveNIC(e.nicID)
+	})
+}
diff --git a/pkg/tcpip/link/tun/protocol.go b/pkg/tcpip/link/tun/protocol.go
new file mode 100644
index 000000000..89d9d91a9
--- /dev/null
+++ b/pkg/tcpip/link/tun/protocol.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// PacketInfoHeaderSize is the size of the packet information header.
+	PacketInfoHeaderSize = 4
+
+	offsetFlags    = 0
+	offsetProtocol = 2
+)
+
+// PacketInfoFields contains fields sent through the wire if IFF_NO_PI flag is
+// not set.
+type PacketInfoFields struct {
+	Flags    uint16
+	Protocol tcpip.NetworkProtocolNumber
+}
+
+// PacketInfoHeader is the wire representation of the packet information sent if
+// IFF_NO_PI flag is not set.
+type PacketInfoHeader []byte
+
+// Encode encodes f into h.
+func (h PacketInfoHeader) Encode(f *PacketInfoFields) {
+	binary.BigEndian.PutUint16(h[offsetFlags:][:2], f.Flags)
+	binary.BigEndian.PutUint16(h[offsetProtocol:][:2], uint16(f.Protocol))
+}
+
+// Flags returns the flag field in h.
+func (h PacketInfoHeader) Flags() uint16 {
+	return binary.BigEndian.Uint16(h[offsetFlags:])
+}
+
+// Protocol returns the protocol field in h.
+func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber {
+	return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:]))
+}
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index a8de38979..949b3f2b2 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,12 +50,12 @@ func New(lower stack.LinkEndpoint) *Endpoint {
 // It is called by the link-layer endpoint being wrapped when a packet arrives,
 // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
 // been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	if !e.dispatchGate.Enter() {
 		return
 	}
 
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
+	e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
 	e.dispatchGate.Leave()
 }
 
@@ -99,7 +99,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 // WritePacket implements stack.LinkEndpoint.WritePacket. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	if !e.writeGate.Enter() {
 		return nil
 	}
@@ -112,9 +112,9 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 // WritePackets implements stack.LinkEndpoint.WritePackets. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	if !e.writeGate.Enter() {
-		return len(pkts), nil
+		return pkts.Len(), nil
 	}
 
 	n, err := e.lower.WritePackets(r, gso, pkts, protocol)
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 31b11a27a..63bf40562 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
 	dispatcher stack.NetworkDispatcher
 }
 
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (e *countedEndpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	e.dispatchCount++
 }
 
@@ -65,15 +65,15 @@ func (e *countedEndpoint) LinkAddress() tcpip.LinkAddress {
 	return e.linkAddr
 }
 
-func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	e.writeCount++
 	return nil
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
-	e.writeCount += len(pkts)
-	return len(pkts), nil
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	e.writeCount += pkts.Len()
+	return pkts.Len(), nil
 }
 
 func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
@@ -89,21 +89,21 @@ func TestWaitWrite(t *testing.T) {
 	wep := New(ep)
 
 	// Write and check that it goes through.
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
 	if want := 1; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on dispatches, then try to write. It must go through.
 	wep.WaitDispatch()
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on writes, then try to write. It must not go through.
 	wep.WaitWrite()
-	wep.WritePacket(nil, nil /* gso */, 0, tcpip.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
@@ -120,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
 	}
 
 	// Dispatch and check that it goes through.
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
 	if want := 1; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on writes, then try to dispatch. It must go through.
 	wep.WaitWrite()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on dispatches, then try to dispatch. It must not go through.
 	wep.WaitDispatch()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 4da13c5df..7f27a840d 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -42,6 +42,7 @@ const (
 
 // endpoint implements stack.NetworkEndpoint.
 type endpoint struct {
+	protocol      *protocol
 	nicID         tcpip.NICID
 	linkEP        stack.LinkEndpoint
 	linkAddrCache stack.LinkAddressCache
@@ -79,22 +80,26 @@ func (e *endpoint) MaxHeaderLength() uint16 {
 
 func (e *endpoint) Close() {}
 
-func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, *stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
+// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
+func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return e.protocol.Number()
+}
+
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	return 0, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
-	v := pkt.Data.First()
-	h := header.ARP(v)
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	h := header.ARP(pkt.NetworkHeader)
 	if !h.IsValid() {
 		return
 	}
@@ -113,7 +118,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 		copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget())
 		copy(packet.HardwareAddressTarget(), h.HardwareAddressSender())
 		copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender())
-		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 		})
 		fallthrough // also fill the cache from requests
@@ -142,18 +147,19 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWi
 		return nil, tcpip.ErrBadLocalAddress
 	}
 	return &endpoint{
+		protocol:      p,
 		nicID:         nicID,
 		linkEP:        sender,
 		linkAddrCache: linkAddrCache,
 	}, nil
 }
 
-// LinkAddressProtocol implements stack.LinkAddressResolver.
+// LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol.
 func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
 	return header.IPv4ProtocolNumber
 }
 
-// LinkAddressRequest implements stack.LinkAddressResolver.
+// LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest.
 func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error {
 	r := &stack.Route{
 		RemoteLinkAddress: broadcastMAC,
@@ -167,12 +173,12 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	copy(h.ProtocolAddressSender(), localAddr)
 	copy(h.ProtocolAddressTarget(), addr)
 
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 	})
 }
 
-// ResolveStaticAddress implements stack.LinkAddressResolver.
+// ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress.
 func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
 	if addr == header.IPv4Broadcast {
 		return broadcastMAC, true
@@ -183,16 +189,33 @@ func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bo
 	return tcpip.LinkAddress([]byte(nil)), false
 }
 
-// SetOption implements NetworkProtocol.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.NetworkProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements NetworkProtocol.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.NetworkProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
+// Parse implements stack.NetworkProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.ARPSize)
+	if !ok {
+		return 0, false, false
+	}
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(header.ARPSize)
+	return 0, false, true
+}
+
 var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
 
 // NewProtocol returns an ARP network protocol.
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 03cf03b6d..66e67429c 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -103,7 +103,7 @@ func TestDirectRequest(t *testing.T) {
 
 	inject := func(addr tcpip.Address) {
 		copy(h.ProtocolAddressTarget(), addr)
-		c.linkEP.InjectInbound(arp.ProtocolNumber, tcpip.PacketBuffer{
+		c.linkEP.InjectInbound(arp.ProtocolNumber, &stack.PacketBuffer{
 			Data: v.ToVectorisedView(),
 		})
 	}
@@ -138,7 +138,8 @@ func TestDirectRequest(t *testing.T) {
 	// Sleep tests are gross, but this will only potentially flake
 	// if there's a bug. If there is no bug this will reliably
 	// succeed.
-	ctx, _ := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer cancel()
 	if pkt, ok := c.linkEP.ReadContext(ctx); ok {
 		t.Errorf("stackAddrBad: unexpected packet sent, Proto=%v", pkt.Proto)
 	}
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 92f2aa13a..2982450f8 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -81,8 +81,8 @@ func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout t
 	}
 }
 
-// Process processes an incoming fragment belonging to an ID
-// and returns a complete packet when all the packets belonging to that ID have been received.
+// Process processes an incoming fragment belonging to an ID and returns a
+// complete packet when all the packets belonging to that ID have been received.
 func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) {
 	f.mu.Lock()
 	r, ok := f.reassemblers[id]
@@ -115,10 +115,12 @@ func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buf
 	// Evict reassemblers if we are consuming more memory than highLimit until
 	// we reach lowLimit.
 	if f.size > f.highLimit {
-		tail := f.rList.Back()
-		for f.size > f.lowLimit && tail != nil {
+		for f.size > f.lowLimit {
+			tail := f.rList.Back()
+			if tail == nil {
+				break
+			}
 			f.release(tail)
-			tail = tail.Prev()
 		}
 	}
 	f.mu.Unlock()
diff --git a/pkg/tcpip/network/hash/hash.go b/pkg/tcpip/network/hash/hash.go
index 6a215938b..8f65713c5 100644
--- a/pkg/tcpip/network/hash/hash.go
+++ b/pkg/tcpip/network/hash/hash.go
@@ -80,12 +80,12 @@ func IPv4FragmentHash(h header.IPv4) uint32 {
 // RFC 2640 (sec 4.5) is not very sharp on this aspect.
 // As a reference, also Linux ignores the protocol to compute
 // the hash (inet6_hash_frag).
-func IPv6FragmentHash(h header.IPv6, f header.IPv6Fragment) uint32 {
+func IPv6FragmentHash(h header.IPv6, id uint32) uint32 {
 	t := h.SourceAddress()
 	y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
 	t = h.DestinationAddress()
 	z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
-	return Hash3Words(f.ID(), y, z, hashIV)
+	return Hash3Words(id, y, z, hashIV)
 }
 
 func rol32(v, shift uint32) uint32 {
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index f4d78f8c6..7c8fb3e0a 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -96,7 +96,7 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff
 // DeliverTransportPacket is called by network endpoints after parsing incoming
 // packets. This is used by the test object to verify that the results of the
 // parsing are expected.
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
+func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt *stack.PacketBuffer) {
 	t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress)
 	t.dataCalls++
 }
@@ -104,7 +104,7 @@ func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.Trans
 // DeliverTransportControlPacket is called by network endpoints after parsing
 // incoming control (ICMP) packets. This is used by the test object to verify
 // that the results of the parsing are expected.
-func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 	t.checkValues(trans, pkt.Data, remote, local)
 	if typ != t.typ {
 		t.t.Errorf("typ = %v, want %v", typ, t.typ)
@@ -150,7 +150,7 @@ func (*testObject) Wait() {}
 // WritePacket is called by network endpoints after producing a packet and
 // writing it to the link endpoint. This is used by the test object to verify
 // that the produced packet is as expected.
-func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	var prot tcpip.TransportProtocolNumber
 	var srcAddr tcpip.Address
 	var dstAddr tcpip.Address
@@ -172,7 +172,7 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Ne
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
@@ -246,7 +246,11 @@ func TestIPv4Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: 123,
+		TTL:      123,
+		TOS:      stack.DefaultTOS,
+	}, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -289,9 +293,9 @@ func TestIPv4Receive(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
-		Data: view.ToVectorisedView(),
-	})
+	pkt := stack.PacketBuffer{Data: view.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -378,10 +382,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 			o.typ = c.expectedTyp
 			o.extra = c.expectedExtra
 
-			vv := view[:len(view)-c.trunc].ToVectorisedView()
-			ep.HandlePacket(&r, tcpip.PacketBuffer{
-				Data: vv,
-			})
+			ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv4MinimumSize))
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
 			}
@@ -444,17 +445,17 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 	}
 
 	// Send first segment.
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
-		Data: frag1.ToVectorisedView(),
-	})
+	pkt := stack.PacketBuffer{Data: frag1.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
 	if o.dataCalls != 0 {
 		t.Fatalf("Bad number of data calls: got %x, want 0", o.dataCalls)
 	}
 
 	// Send second segment.
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
-		Data: frag2.ToVectorisedView(),
-	})
+	pkt = stack.PacketBuffer{Data: frag2.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -487,7 +488,11 @@ func TestIPv6Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: 123,
+		TTL:      123,
+		TOS:      stack.DefaultTOS,
+	}, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -530,9 +535,9 @@ func TestIPv6Receive(t *testing.T) {
 		t.Fatalf("could not find route: %v", err)
 	}
 
-	ep.HandlePacket(&r, tcpip.PacketBuffer{
-		Data: view.ToVectorisedView(),
-	})
+	pkt := stack.PacketBuffer{Data: view.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -644,12 +649,25 @@ func TestIPv6ReceiveControl(t *testing.T) {
 			// Set ICMPv6 checksum.
 			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
 
-			ep.HandlePacket(&r, tcpip.PacketBuffer{
-				Data: view[:len(view)-c.trunc].ToVectorisedView(),
-			})
+			ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv6MinimumSize))
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
 			}
 		})
 	}
 }
+
+// truncatedPacket returns a PacketBuffer based on a truncated view. If view,
+// after truncation, is large enough to hold a network header, it makes part of
+// view the packet's NetworkHeader and the rest its Data. Otherwise all of view
+// becomes Data.
+func truncatedPacket(view buffer.View, trunc, netHdrLen int) *stack.PacketBuffer {
+	v := view[:len(view)-trunc]
+	if len(v) < netHdrLen {
+		return &stack.PacketBuffer{Data: v.ToVectorisedView()}
+	}
+	return &stack.PacketBuffer{
+		NetworkHeader: v[:netHdrLen],
+		Data:          v[netHdrLen:].ToVectorisedView(),
+	}
+}
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index 0fef2b1f1..78420d6e6 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -13,7 +13,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/network/fragmentation",
         "//pkg/tcpip/network/hash",
         "//pkg/tcpip/stack",
@@ -35,5 +34,6 @@ go_test(
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
+        "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 32bf39e43..1b67aa066 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -15,7 +15,6 @@
 package ipv4
 
 import (
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -25,8 +24,12 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
-	h := header.IPv4(pkt.Data.First())
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return
+	}
+	hdr := header.IPv4(h)
 
 	// We don't use IsValid() here because ICMP only requires that the IP
 	// header plus 8 bytes of the transport header be included. So it's
@@ -35,12 +38,12 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.
 	//
 	// Drop packet if it doesn't have the basic IPv4 header or if the
 	// original source address doesn't match the endpoint's address.
-	if len(h) < header.IPv4MinimumSize || h.SourceAddress() != e.id.LocalAddress {
+	if hdr.SourceAddress() != e.id.LocalAddress {
 		return
 	}
 
-	hlen := int(h.HeaderLength())
-	if pkt.Data.Size() < hlen || h.FragmentOffset() != 0 {
+	hlen := int(hdr.HeaderLength())
+	if pkt.Data.Size() < hlen || hdr.FragmentOffset() != 0 {
 		// We won't be able to handle this if it doesn't contain the
 		// full IPv4 header, or if it's a fragment not at offset 0
 		// (because it won't have the transport header).
@@ -49,15 +52,18 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.
 
 	// Skip the ip header, then deliver control message.
 	pkt.Data.TrimFront(hlen)
-	p := h.TransportProtocol()
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	p := hdr.TransportProtocol()
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
-	v := pkt.Data.First()
-	if len(v) < header.ICMPv4MinimumSize {
+	// TODO(gvisor.dev/issue/170): ICMP packets don't have their
+	// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+	// full explanation.
+	v, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
+	if !ok {
 		received.Invalid.Increment()
 		return
 	}
@@ -85,7 +91,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 
 		// It's possible that a raw socket expects to receive this.
 		h.SetChecksum(wantChecksum)
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, tcpip.PacketBuffer{
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, &stack.PacketBuffer{
 			Data:          pkt.Data.Clone(nil),
 			NetworkHeader: append(buffer.View(nil), pkt.NetworkHeader...),
 		})
@@ -99,7 +105,11 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 		pkt.SetChecksum(0)
 		pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0)))
 		sent := stats.ICMP.V4PacketsSent
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
+			Protocol: header.ICMPv4ProtocolNumber,
+			TTL:      r.DefaultTTL(),
+			TOS:      stack.DefaultTOS,
+		}, &stack.PacketBuffer{
 			Header:          hdr,
 			Data:            vv,
 			TransportHeader: buffer.View(pkt),
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 6597e6781..7e9f16c90 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -21,12 +21,12 @@
 package ipv4
 
 import (
+	"fmt"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
 	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -119,13 +119,18 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
+// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
+func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return e.protocol.Number()
+}
+
 // writePacketFragments calls e.linkEP.WritePacket with each packet fragment to
 // write. It assumes that the IP header is entirely in pkt.Header but does not
 // assume that only the IP header is in pkt.Header. It assumes that the input
 // packet's stated length matches the length of the header+payload. mtu
 // includes the IP header and options. This does not support the DontFragment
 // IP flag.
-func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt *stack.PacketBuffer) *tcpip.Error {
 	// This packet is too big, it needs to be fragmented.
 	ip := header.IPv4(pkt.Header.View())
 	flags := ip.Flags()
@@ -165,7 +170,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 		if i > 0 {
 			newPayload := pkt.Data.Clone(nil)
 			newPayload.CapLength(innerMTU)
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
 				Header:        pkt.Header,
 				Data:          newPayload,
 				NetworkHeader: buffer.View(h),
@@ -184,7 +189,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 			newPayload := pkt.Data.Clone(nil)
 			newPayloadLength := outerMTU - pkt.Header.UsedLength()
 			newPayload.CapLength(newPayloadLength)
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
 				Header:        pkt.Header,
 				Data:          newPayload,
 				NetworkHeader: buffer.View(h),
@@ -198,7 +203,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 			startOfHdr := pkt.Header
 			startOfHdr.TrimBack(pkt.Header.UsedLength() - outerMTU)
 			emptyVV := buffer.NewVectorisedView(0, []buffer.View{})
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, tcpip.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
 				Header:        startOfHdr,
 				Data:          emptyVV,
 				NetworkHeader: buffer.View(h),
@@ -241,22 +246,37 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
-	if r.Loop&stack.PacketLoop != 0 {
-		// The inbound path expects the network header to still be in
-		// the PacketBuffer's Data field.
-		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
-		views[0] = pkt.Header.View()
-		views = append(views, pkt.Data.Views()...)
-		loopedR := r.MakeLoopedRoute()
+	nicName := e.stack.FindNICNameFromID(e.NICID())
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	ipt := e.stack.IPTables()
+	if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok {
+		// iptables is telling us to drop the packet.
+		return nil
+	}
 
-		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
-			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
-		})
+	// If the packet is manipulated as per NAT Ouput rules, handle packet
+	// based on destination address and do not send the packet to link layer.
+	// TODO(gvisor.dev/issue/170): We should do this for every packet, rather than
+	// only NATted packets, but removing this check short circuits broadcasts
+	// before they are sent out to other hosts.
+	if pkt.NatDone {
+		netHeader := header.IPv4(pkt.NetworkHeader)
+		ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress())
+		if err == nil {
+			route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress())
+			ep.HandlePacket(&route, pkt)
+			return nil
+		}
+	}
 
+	if r.Loop&stack.PacketLoop != 0 {
+		loopedR := r.MakeLoopedRoute()
+		e.HandlePacket(&loopedR, pkt)
 		loopedR.Release()
 	}
 	if r.Loop&stack.PacketOut == 0 {
@@ -273,29 +293,71 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("multiple packets in local loop")
 	}
 	if r.Loop&stack.PacketOut == 0 {
-		return len(pkts), nil
+		return pkts.Len(), nil
+	}
+
+	for pkt := pkts.Front(); pkt != nil; {
+		ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
+		pkt.NetworkHeader = buffer.View(ip)
+		pkt = pkt.Next()
 	}
 
-	for i := range pkts {
-		ip := e.addIPHeader(r, &pkts[i].Header, pkts[i].DataSize, params)
-		pkts[i].NetworkHeader = buffer.View(ip)
+	nicName := e.stack.FindNICNameFromID(e.NICID())
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	ipt := e.stack.IPTables()
+	dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName)
+	if len(dropped) == 0 && len(natPkts) == 0 {
+		// Fast path: If no packets are to be dropped then we can just invoke the
+		// faster WritePackets API directly.
+		n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
+		r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+		return n, err
+	}
+
+	// Slow Path as we are dropping some packets in the batch degrade to
+	// emitting one packet at a time.
+	n := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if _, ok := dropped[pkt]; ok {
+			continue
+		}
+		if _, ok := natPkts[pkt]; ok {
+			netHeader := header.IPv4(pkt.NetworkHeader)
+			if ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+				src := netHeader.SourceAddress()
+				dst := netHeader.DestinationAddress()
+				route := r.ReverseRoute(src, dst)
+				ep.HandlePacket(&route, pkt)
+				n++
+				continue
+			}
+		}
+		if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+			r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+			return n, err
+		}
+		n++
 	}
-	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
 	r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
-	return n, err
+	return n, nil
 }
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
-	ip := header.IPv4(pkt.Data.First())
+	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return tcpip.ErrInvalidOptionValue
+	}
+	ip := header.IPv4(h)
 	if !ip.IsValid(pkt.Data.Size()) {
 		return tcpip.ErrInvalidOptionValue
 	}
@@ -344,31 +406,23 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuf
 
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
-	headerView := pkt.Data.First()
-	h := header.IPv4(headerView)
-	if !h.IsValid(pkt.Data.Size()) {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	h := header.IPv4(pkt.NetworkHeader)
+	if !h.IsValid(pkt.Data.Size() + len(pkt.NetworkHeader) + len(pkt.TransportHeader)) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
-	pkt.NetworkHeader = headerView[:h.HeaderLength()]
-
-	hlen := int(h.HeaderLength())
-	tlen := int(h.TotalLength())
-	pkt.Data.TrimFront(hlen)
-	pkt.Data.CapLength(tlen - hlen)
 
 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
 	ipt := e.stack.IPTables()
-	if ok := ipt.Check(iptables.Input, pkt); !ok {
+	if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok {
 		// iptables is telling us to drop the packet.
 		return
 	}
 
-	more := (h.Flags() & header.IPv4FlagMoreFragments) != 0
-	if more || h.FragmentOffset() != 0 {
-		if pkt.Data.Size() == 0 {
+	if h.More() || h.FragmentOffset() != 0 {
+		if pkt.Data.Size()+len(pkt.TransportHeader) == 0 {
 			// Drop the packet as it's marked as a fragment but has
 			// no payload.
 			r.Stats().IP.MalformedPacketsReceived.Increment()
@@ -387,7 +441,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 		}
 		var ready bool
 		var err error
-		pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, pkt.Data)
+		pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, h.More(), pkt.Data)
 		if err != nil {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
 			r.Stats().IP.MalformedFragmentsReceived.Increment()
@@ -399,7 +453,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	}
 	p := h.TransportProtocol()
 	if p == header.ICMPv4ProtocolNumber {
-		headerView.CapLength(hlen)
+		pkt.NetworkHeader.CapLength(int(h.HeaderLength()))
 		e.handleICMP(r, pkt)
 		return
 	}
@@ -473,6 +527,41 @@ func (p *protocol) DefaultTTL() uint8 {
 	return uint8(atomic.LoadUint32(&p.defaultTTL))
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return 0, false, false
+	}
+	ipHdr := header.IPv4(hdr)
+
+	// If there are options, pull those into hdr as well.
+	if headerLen := int(ipHdr.HeaderLength()); headerLen > header.IPv4MinimumSize && headerLen <= pkt.Data.Size() {
+		hdr, ok = pkt.Data.PullUp(headerLen)
+		if !ok {
+			panic(fmt.Sprintf("There are only %d bytes in pkt.Data, but there should be at least %d", pkt.Data.Size(), headerLen))
+		}
+		ipHdr = header.IPv4(hdr)
+	}
+
+	// If this is a fragment, don't bother parsing the transport header.
+	parseTransportHeader := true
+	if ipHdr.More() || ipHdr.FragmentOffset() != 0 {
+		parseTransportHeader = false
+	}
+
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr))
+	return ipHdr.TransportProtocol(), parseTransportHeader, true
+}
+
 // calculateMTU calculates the network-layer payload MTU based on the link-layer
 // payload mtu.
 func calculateMTU(mtu uint32) uint32 {
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index e900f1b45..11e579c4b 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -20,6 +20,7 @@ import (
 	"math/rand"
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -113,7 +114,7 @@ func makeHdrAndPayload(hdrLength int, extraLength int, viewSizes []int) (buffer.
 
 // comparePayloads compared the contents of all the packets against the contents
 // of the source packet.
-func compareFragments(t *testing.T, packets []tcpip.PacketBuffer, sourcePacketInfo tcpip.PacketBuffer, mtu uint32) {
+func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketInfo *stack.PacketBuffer, mtu uint32) {
 	t.Helper()
 	// Make a complete array of the sourcePacketInfo packet.
 	source := header.IPv4(packets[0].Header.View()[:header.IPv4MinimumSize])
@@ -173,7 +174,7 @@ func compareFragments(t *testing.T, packets []tcpip.PacketBuffer, sourcePacketIn
 
 type errorChannel struct {
 	*channel.Endpoint
-	Ch                    chan tcpip.PacketBuffer
+	Ch                    chan *stack.PacketBuffer
 	packetCollectorErrors []*tcpip.Error
 }
 
@@ -183,7 +184,7 @@ type errorChannel struct {
 func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel {
 	return &errorChannel{
 		Endpoint:              channel.New(size, mtu, linkAddr),
-		Ch:                    make(chan tcpip.PacketBuffer, size),
+		Ch:                    make(chan *stack.PacketBuffer, size),
 		packetCollectorErrors: packetCollectorErrors,
 	}
 }
@@ -202,7 +203,7 @@ func (e *errorChannel) Drain() int {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	select {
 	case e.Ch <- pkt:
 	default:
@@ -281,13 +282,17 @@ func TestFragmentation(t *testing.T) {
 	for _, ft := range fragTests {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes)
-			source := tcpip.PacketBuffer{
+			source := &stack.PacketBuffer{
 				Header: hdr,
 				// Save the source payload because WritePacket will modify it.
 				Data: payload.Clone(nil),
 			}
 			c := buildContext(t, nil, ft.mtu)
-			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{
+				Protocol: tcp.ProtocolNumber,
+				TTL:      42,
+				TOS:      stack.DefaultTOS,
+			}, &stack.PacketBuffer{
 				Header: hdr,
 				Data:   payload,
 			})
@@ -295,7 +300,7 @@ func TestFragmentation(t *testing.T) {
 				t.Errorf("err got %v, want %v", err, nil)
 			}
 
-			var results []tcpip.PacketBuffer
+			var results []*stack.PacketBuffer
 		L:
 			for {
 				select {
@@ -337,7 +342,11 @@ func TestFragmentationErrors(t *testing.T) {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes)
 			c := buildContext(t, ft.packetCollectorErrors, ft.mtu)
-			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{
+				Protocol: tcp.ProtocolNumber,
+				TTL:      42,
+				TOS:      stack.DefaultTOS,
+			}, &stack.PacketBuffer{
 				Header: hdr,
 				Data:   payload,
 			})
@@ -459,7 +468,7 @@ func TestInvalidFragments(t *testing.T) {
 			s.CreateNIC(nicID, sniffer.New(ep))
 
 			for _, pkt := range tc.packets {
-				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, tcpip.PacketBuffer{
+				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, &stack.PacketBuffer{
 					Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}),
 				})
 			}
@@ -473,3 +482,264 @@ func TestInvalidFragments(t *testing.T) {
 		})
 	}
 }
+
+// TestReceiveFragments feeds fragments in through the incoming packet path to
+// test reassembly
+func TestReceiveFragments(t *testing.T) {
+	const addr1 = "\x0c\xa8\x00\x01" // 192.168.0.1
+	const addr2 = "\x0c\xa8\x00\x02" // 192.168.0.2
+	const nicID = 1
+
+	// Build and return a UDP header containing payload.
+	udpGen := func(payloadLen int, multiplier uint8) buffer.View {
+		payload := buffer.NewView(payloadLen)
+		for i := 0; i < len(payload); i++ {
+			payload[i] = uint8(i) * multiplier
+		}
+
+		udpLength := header.UDPMinimumSize + len(payload)
+
+		hdr := buffer.NewPrependable(udpLength)
+		u := header.UDP(hdr.Prepend(udpLength))
+		u.Encode(&header.UDPFields{
+			SrcPort: 5555,
+			DstPort: 80,
+			Length:  uint16(udpLength),
+		})
+		copy(u.Payload(), payload)
+		sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+		sum = header.Checksum(payload, sum)
+		u.SetChecksum(^u.CalculateChecksum(sum))
+		return hdr.View()
+	}
+
+	// UDP header plus a payload of 0..256
+	ipv4Payload1 := udpGen(256, 1)
+	udpPayload1 := ipv4Payload1[header.UDPMinimumSize:]
+	// UDP header plus a payload of 0..256 in increments of 2.
+	ipv4Payload2 := udpGen(128, 2)
+	udpPayload2 := ipv4Payload2[header.UDPMinimumSize:]
+
+	type fragmentData struct {
+		id             uint16
+		flags          uint8
+		fragmentOffset uint16
+		payload        buffer.View
+	}
+
+	tests := []struct {
+		name             string
+		fragments        []fragmentData
+		expectedPayloads [][]byte
+	}{
+		{
+			name: "No fragmentation",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          0,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1,
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "More fragments without payload",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1,
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Non-zero fragment offset without payload",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          0,
+					fragmentOffset: 8,
+					payload:        ipv4Payload1,
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+				{
+					id:             1,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1[64:],
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Second fragment has MoreFlags set",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1[64:],
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with different IDs",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+				{
+					id:             2,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1[64:],
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two interleaved fragmented packets",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+				{
+					id:             2,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload2[:64],
+				},
+				{
+					id:             1,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1[64:],
+				},
+				{
+					id:             2,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload2[64:],
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1, udpPayload2},
+		},
+		{
+			name: "Fragment without followup",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+			},
+			expectedPayloads: nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Setup a stack and endpoint.
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, 1280, tcpip.LinkAddress("\xf0\x00"))
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err)
+			}
+
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, header.IPv4ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, header.IPv4ProtocolNumber, err)
+			}
+			defer ep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80}
+			if err := ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%+v): %s", bindAddr, err)
+			}
+
+			// Prepare and send the fragments.
+			for _, frag := range test.fragments {
+				hdr := buffer.NewPrependable(header.IPv4MinimumSize)
+
+				// Serialize IPv4 fixed header.
+				ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+				ip.Encode(&header.IPv4Fields{
+					IHL:            header.IPv4MinimumSize,
+					TotalLength:    header.IPv4MinimumSize + uint16(len(frag.payload)),
+					ID:             frag.id,
+					Flags:          frag.flags,
+					FragmentOffset: frag.fragmentOffset,
+					TTL:            64,
+					Protocol:       uint8(header.UDPProtocolNumber),
+					SrcAddr:        addr1,
+					DstAddr:        addr2,
+				})
+
+				vv := hdr.View().ToVectorisedView()
+				vv.AppendView(frag.payload)
+
+				e.InjectInbound(header.IPv4ProtocolNumber, &stack.PacketBuffer{
+					Data: vv,
+				})
+			}
+
+			if got, want := s.Stats().UDP.PacketsReceived.Value(), uint64(len(test.expectedPayloads)); got != want {
+				t.Errorf("got UDP Rx Packets = %d, want = %d", got, want)
+			}
+
+			for i, expectedPayload := range test.expectedPayloads {
+				gotPayload, _, err := ep.Read(nil)
+				if err != nil {
+					t.Fatalf("(i=%d) Read(nil): %s", i, err)
+				}
+				if diff := cmp.Diff(buffer.View(expectedPayload), gotPayload); diff != "" {
+					t.Errorf("(i=%d) got UDP payload mismatch (-want +got):\n%s", i, diff)
+				}
+			}
+
+			if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock {
+				t.Fatalf("(last) got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index fb11874c6..3f71fc520 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -13,6 +13,8 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/network/fragmentation",
+        "//pkg/tcpip/network/hash",
         "//pkg/tcpip/stack",
     ],
 )
@@ -29,6 +31,7 @@ go_test(
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/sniffer",
@@ -36,5 +39,6 @@ go_test(
         "//pkg/tcpip/transport/icmp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
+        "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 45dc757c7..2ff7eedf4 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -15,7 +15,7 @@
 package ipv6
 
 import (
-	"log"
+	"fmt"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -27,8 +27,12 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
-	h := header.IPv6(pkt.Data.First())
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return
+	}
+	hdr := header.IPv6(h)
 
 	// We don't use IsValid() here because ICMP only requires that up to
 	// 1280 bytes of the original packet be included. So it's likely that it
@@ -36,17 +40,21 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.
 	//
 	// Drop packet if it doesn't have the basic IPv6 header or if the
 	// original source address doesn't match the endpoint's address.
-	if len(h) < header.IPv6MinimumSize || h.SourceAddress() != e.id.LocalAddress {
+	if hdr.SourceAddress() != e.id.LocalAddress {
 		return
 	}
 
 	// Skip the IP header, then handle the fragmentation header if there
 	// is one.
 	pkt.Data.TrimFront(header.IPv6MinimumSize)
-	p := h.TransportProtocol()
+	p := hdr.TransportProtocol()
 	if p == header.IPv6FragmentHeader {
-		f := header.IPv6Fragment(pkt.Data.First())
-		if !f.IsValid() || f.FragmentOffset() != 0 {
+		f, ok := pkt.Data.PullUp(header.IPv6FragmentHeaderSize)
+		if !ok {
+			return
+		}
+		fragHdr := header.IPv6Fragment(f)
+		if !fragHdr.IsValid() || fragHdr.FragmentOffset() != 0 {
 			// We can't handle fragments that aren't at offset 0
 			// because they don't have the transport headers.
 			return
@@ -55,90 +63,87 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.
 		// Skip fragmentation header and find out the actual protocol
 		// number.
 		pkt.Data.TrimFront(header.IPv6FragmentHeaderSize)
-		p = f.TransportProtocol()
+		p = fragHdr.TransportProtocol()
 	}
 
 	// Deliver the control packet to the transport endpoint.
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.PacketBuffer) {
+func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
-	v := pkt.Data.First()
-	if len(v) < header.ICMPv6MinimumSize {
+	// TODO(gvisor.dev/issue/170): ICMP packets don't have their
+	// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+	// full explanation.
+	v, ok := pkt.Data.PullUp(header.ICMPv6HeaderSize)
+	if !ok {
 		received.Invalid.Increment()
 		return
 	}
 	h := header.ICMPv6(v)
-	iph := header.IPv6(netHeader)
+	iph := header.IPv6(pkt.NetworkHeader)
 
 	// Validate ICMPv6 checksum before processing the packet.
 	//
-	// Only the first view in vv is accounted for by h. To account for the
-	// rest of vv, a shallow copy is made and the first view is removed.
 	// This copy is used as extra payload during the checksum calculation.
-	payload := pkt.Data
-	payload.RemoveFirst()
+	payload := pkt.Data.Clone(nil)
+	payload.TrimFront(len(h))
 	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
 		received.Invalid.Increment()
 		return
 	}
 
-	// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
-	// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
-	// in the IPv6 header is not set to 255, or the ICMPv6 Code field is not
-	// set to 0.
-	switch h.Type() {
-	case header.ICMPv6NeighborSolicit,
-		header.ICMPv6NeighborAdvert,
-		header.ICMPv6RouterSolicit,
-		header.ICMPv6RouterAdvert,
-		header.ICMPv6RedirectMsg:
-		if iph.HopLimit() != header.NDPHopLimit {
-			received.Invalid.Increment()
-			return
-		}
-
-		if h.Code() != 0 {
-			received.Invalid.Increment()
-			return
-		}
+	isNDPValid := func() bool {
+		// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
+		// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
+		// in the IPv6 header is not set to 255, or the ICMPv6 Code field is not
+		// set to 0.
+		//
+		// As per RFC 6980 section 5, nodes MUST silently drop NDP messages if the
+		// packet includes a fragmentation header.
+		return !hasFragmentHeader && iph.HopLimit() == header.NDPHopLimit && h.Code() == 0
 	}
 
 	// TODO(b/112892170): Meaningfully handle all ICMP types.
 	switch h.Type() {
 	case header.ICMPv6PacketTooBig:
 		received.PacketTooBig.Increment()
-		if len(v) < header.ICMPv6PacketTooBigMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6PacketTooBigMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
-		mtu := h.MTU()
+		mtu := header.ICMPv6(hdr).MTU()
 		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
 
 	case header.ICMPv6DstUnreachable:
 		received.DstUnreachable.Increment()
-		if len(v) < header.ICMPv6DstUnreachableMinimumSize {
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6DstUnreachableMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
-		switch h.Code() {
+		switch header.ICMPv6(hdr).Code() {
 		case header.ICMPv6PortUnreachable:
 			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 		}
 
 	case header.ICMPv6NeighborSolicit:
 		received.NeighborSolicit.Increment()
-		if len(v) < header.ICMPv6NeighborSolicitMinimumSize {
+		if pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
 
-		ns := header.NDPNeighborSolicit(h.NDPPayload())
+		// The remainder of payload must be only the neighbor solicitation, so
+		// payload.ToView() always returns the solicitation. Per RFC 6980 section 5,
+		// NDP messages cannot be fragmented. Also note that in the common case NDP
+		// datagrams are very small and ToView() will not incur allocations.
+		ns := header.NDPNeighborSolicit(payload.ToView())
 		it, err := ns.Options().Iter(true)
 		if err != nil {
 			// If we have a malformed NDP NS option, drop the packet.
@@ -148,58 +153,53 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 
 		targetAddr := ns.TargetAddress()
 		s := r.Stack()
-		rxNICID := r.NICID()
-		if isTentative, err := s.IsAddrTentative(rxNICID, targetAddr); err != nil {
-			// We will only get an error if rxNICID is unrecognized,
-			// which should not happen. For now short-circuit this
-			// packet.
+		if isTentative, err := s.IsAddrTentative(e.nicID, targetAddr); err != nil {
+			// We will only get an error if the NIC is unrecognized, which should not
+			// happen. For now, drop this packet.
 			//
 			// TODO(b/141002840): Handle this better?
 			return
 		} else if isTentative {
-			// If the target address is tentative and the source
-			// of the packet is a unicast (specified) address, then
-			// the source of the packet is attempting to perform
-			// address resolution on the target. In this case, the
-			// solicitation is silently ignored, as per RFC 4862
-			// section 5.4.3.
+			// If the target address is tentative and the source of the packet is a
+			// unicast (specified) address, then the source of the packet is
+			// attempting to perform address resolution on the target. In this case,
+			// the solicitation is silently ignored, as per RFC 4862 section 5.4.3.
 			//
-			// If the target address is tentative and the source of
-			// the packet is the unspecified address (::), then we
-			// know another node is also performing DAD for the
-			// same address (since targetAddr is tentative for us,
-			// we know we are also performing DAD on it). In this
-			// case we let the stack know so it can handle such a
-			// scenario and do nothing further with the NDP NS.
-			if iph.SourceAddress() == header.IPv6Any {
-				s.DupTentativeAddrDetected(rxNICID, targetAddr)
+			// If the target address is tentative and the source of the packet is the
+			// unspecified address (::), then we know another node is also performing
+			// DAD for the same address (since the target address is tentative for us,
+			// we know we are also performing DAD on it). In this case we let the
+			// stack know so it can handle such a scenario and do nothing further with
+			// the NS.
+			if r.RemoteAddress == header.IPv6Any {
+				s.DupTentativeAddrDetected(e.nicID, targetAddr)
 			}
 
-			// Do not handle neighbor solicitations targeted
-			// to an address that is tentative on the received
-			// NIC any further.
+			// Do not handle neighbor solicitations targeted to an address that is
+			// tentative on the NIC any further.
 			return
 		}
 
-		// At this point we know that targetAddr is not tentative on
-		// rxNICID so the packet is processed as defined in RFC 4861,
-		// as per RFC 4862 section 5.4.3.
+		// At this point we know that the target address is not tentative on the NIC
+		// so the packet is processed as defined in RFC 4861, as per RFC 4862
+		// section 5.4.3.
 
+		// Is the NS targetting us?
 		if e.linkAddrCache.CheckLocalAddress(e.nicID, ProtocolNumber, targetAddr) == 0 {
-			// We don't have a useful answer; the best we can do is ignore the request.
 			return
 		}
 
-		// If the NS message has the source link layer option, update the link
-		// address cache with the link address for the sender of the message.
+		// If the NS message contains the Source Link-Layer Address option, update
+		// the link address cache with the value of the option.
 		//
 		// TODO(b/148429853): Properly process the NS message and do Neighbor
 		// Unreachability Detection.
+		var sourceLinkAddr tcpip.LinkAddress
 		for {
 			opt, done, err := it.Next()
 			if err != nil {
 				// This should never happen as Iter(true) above did not return an error.
-				log.Fatalf("unexpected error when iterating over NDP options: %s", err)
+				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
 			}
 			if done {
 				break
@@ -207,22 +207,36 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 
 			switch opt := opt.(type) {
 			case header.NDPSourceLinkLayerAddressOption:
-				e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, opt.EthernetAddress())
+				// No RFCs define what to do when an NS message has multiple Source
+				// Link-Layer Address options. Since no interface can have multiple
+				// link-layer addresses, we consider such messages invalid.
+				if len(sourceLinkAddr) != 0 {
+					received.Invalid.Increment()
+					return
+				}
+
+				sourceLinkAddr = opt.EthernetAddress()
 			}
 		}
 
-		optsSerializer := header.NDPOptionsSerializer{
-			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress[:]),
+		unspecifiedSource := r.RemoteAddress == header.IPv6Any
+
+		// As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST
+		// NOT be included when the source IP address is the unspecified address.
+		// Otherwise, on link layers that have addresses this option MUST be
+		// included in multicast solicitations and SHOULD be included in unicast
+		// solicitations.
+		if len(sourceLinkAddr) == 0 {
+			if header.IsV6MulticastAddress(r.LocalAddress) && !unspecifiedSource {
+				received.Invalid.Increment()
+				return
+			}
+		} else if unspecifiedSource {
+			received.Invalid.Increment()
+			return
+		} else {
+			e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, sourceLinkAddr)
 		}
-		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
-		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
-		packet.SetType(header.ICMPv6NeighborAdvert)
-		na := header.NDPNeighborAdvert(packet.NDPPayload())
-		na.SetSolicitedFlag(true)
-		na.SetOverrideFlag(true)
-		na.SetTargetAddress(targetAddr)
-		opts := na.Options()
-		opts.Serialize(optsSerializer)
 
 		// ICMPv6 Neighbor Solicit messages are always sent to
 		// specially crafted IPv6 multicast addresses. As a result, the
@@ -235,6 +249,40 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		r := r.Clone()
 		defer r.Release()
 		r.LocalAddress = targetAddr
+
+		// As per RFC 4861 section 7.2.4, if the the source of the solicitation is
+		// the unspecified address, the node MUST set the Solicited flag to zero and
+		// multicast the advertisement to the all-nodes address.
+		solicited := true
+		if unspecifiedSource {
+			solicited = false
+			r.RemoteAddress = header.IPv6AllNodesMulticastAddress
+		}
+
+		// If the NS has a source link-layer option, use the link address it
+		// specifies as the remote link address for the response instead of the
+		// source link address of the packet.
+		//
+		// TODO(#2401): As per RFC 4861 section 7.2.4 we should consult our link
+		// address cache for the right destination link address instead of manually
+		// patching the route with the remote link address if one is specified in a
+		// Source Link-Layer Address option.
+		if len(sourceLinkAddr) != 0 {
+			r.RemoteLinkAddress = sourceLinkAddr
+		}
+
+		optsSerializer := header.NDPOptionsSerializer{
+			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress),
+		}
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
+		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+		packet.SetType(header.ICMPv6NeighborAdvert)
+		na := header.NDPNeighborAdvert(packet.NDPPayload())
+		na.SetSolicitedFlag(solicited)
+		na.SetOverrideFlag(true)
+		na.SetTargetAddress(targetAddr)
+		opts := na.Options()
+		opts.Serialize(optsSerializer)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 		// RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
@@ -243,7 +291,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		//
 		// The IP Hop Limit field has a value of 255, i.e., the packet
 		// could not possibly have been forwarded by a router.
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 			Header: hdr,
 		}); err != nil {
 			sent.Dropped.Increment()
@@ -253,12 +301,16 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 
 	case header.ICMPv6NeighborAdvert:
 		received.NeighborAdvert.Increment()
-		if len(v) < header.ICMPv6NeighborAdvertSize {
+		if pkt.Data.Size() < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
 			received.Invalid.Increment()
 			return
 		}
 
-		na := header.NDPNeighborAdvert(h.NDPPayload())
+		// The remainder of payload must be only the neighbor advertisement, so
+		// payload.ToView() always returns the advertisement. Per RFC 6980 section
+		// 5, NDP messages cannot be fragmented. Also note that in the common case
+		// NDP datagrams are very small and ToView() will not incur allocations.
+		na := header.NDPNeighborAdvert(payload.ToView())
 		it, err := na.Options().Iter(true)
 		if err != nil {
 			// If we have a malformed NDP NA option, drop the packet.
@@ -268,45 +320,43 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 
 		targetAddr := na.TargetAddress()
 		stack := r.Stack()
-		rxNICID := r.NICID()
 
-		if isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr); err != nil {
-			// We will only get an error if rxNICID is unrecognized,
-			// which should not happen. For now short-circuit this
-			// packet.
+		if isTentative, err := stack.IsAddrTentative(e.nicID, targetAddr); err != nil {
+			// We will only get an error if the NIC is unrecognized, which should not
+			// happen. For now short-circuit this packet.
 			//
 			// TODO(b/141002840): Handle this better?
 			return
 		} else if isTentative {
-			// We just got an NA from a node that owns an address we
-			// are performing DAD on, implying the address is not
-			// unique. In this case we let the stack know so it can
-			// handle such a scenario and do nothing furthur with
+			// We just got an NA from a node that owns an address we are performing
+			// DAD on, implying the address is not unique. In this case we let the
+			// stack know so it can handle such a scenario and do nothing furthur with
 			// the NDP NA.
-			stack.DupTentativeAddrDetected(rxNICID, targetAddr)
+			stack.DupTentativeAddrDetected(e.nicID, targetAddr)
 			return
 		}
 
-		// At this point we know that the targetAddress is not tentative
-		// on rxNICID. However, targetAddr may still be assigned to
-		// rxNICID but not tentative (it could be permanent). Such a
-		// scenario is beyond the scope of RFC 4862. As such, we simply
-		// ignore such a scenario for now and proceed as normal.
+		// At this point we know that the target address is not tentative on the
+		// NIC. However, the target address may still be assigned to the NIC but not
+		// tentative (it could be permanent). Such a scenario is beyond the scope of
+		// RFC 4862. As such, we simply ignore such a scenario for now and proceed
+		// as normal.
 		//
+		// TODO(b/143147598): Handle the scenario described above. Also inform the
+		// netstack integration that a duplicate address was detected outside of
+		// DAD.
+
 		// If the NA message has the target link layer option, update the link
 		// address cache with the link address for the target of the message.
 		//
-		// TODO(b/143147598): Handle the scenario described above. Also
-		// inform the netstack integration that a duplicate address was
-		// detected outside of DAD.
-		//
 		// TODO(b/148429853): Properly process the NA message and do Neighbor
 		// Unreachability Detection.
+		var targetLinkAddr tcpip.LinkAddress
 		for {
 			opt, done, err := it.Next()
 			if err != nil {
 				// This should never happen as Iter(true) above did not return an error.
-				log.Fatalf("unexpected error when iterating over NDP options: %s", err)
+				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
 			}
 			if done {
 				break
@@ -314,23 +364,36 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 
 			switch opt := opt.(type) {
 			case header.NDPTargetLinkLayerAddressOption:
-				e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, opt.EthernetAddress())
+				// No RFCs define what to do when an NA message has multiple Target
+				// Link-Layer Address options. Since no interface can have multiple
+				// link-layer addresses, we consider such messages invalid.
+				if len(targetLinkAddr) != 0 {
+					received.Invalid.Increment()
+					return
+				}
+
+				targetLinkAddr = opt.EthernetAddress()
 			}
 		}
 
+		if len(targetLinkAddr) != 0 {
+			e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, targetLinkAddr)
+		}
+
 	case header.ICMPv6EchoRequest:
 		received.EchoRequest.Increment()
-		if len(v) < header.ICMPv6EchoMinimumSize {
+		icmpHdr, ok := pkt.Data.PullUp(header.ICMPv6EchoMinimumSize)
+		if !ok {
 			received.Invalid.Increment()
 			return
 		}
 		pkt.Data.TrimFront(header.ICMPv6EchoMinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
 		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
-		copy(packet, h)
+		copy(packet, icmpHdr)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   pkt.Data,
 		}); err != nil {
@@ -341,7 +404,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 
 	case header.ICMPv6EchoReply:
 		received.EchoReply.Increment()
-		if len(v) < header.ICMPv6EchoMinimumSize {
+		if pkt.Data.Size() < header.ICMPv6EchoMinimumSize {
 			received.Invalid.Increment()
 			return
 		}
@@ -355,8 +418,21 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 
 	case header.ICMPv6RouterSolicit:
 		received.RouterSolicit.Increment()
+		if !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
 
 	case header.ICMPv6RouterAdvert:
+		received.RouterAdvert.Increment()
+
+		// Is the NDP payload of sufficient size to hold a Router
+		// Advertisement?
+		if pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize || !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
+
 		routerAddr := iph.SourceAddress()
 
 		//
@@ -370,17 +446,11 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 			return
 		}
 
-		p := h.NDPPayload()
-
-		// Is the NDP payload of sufficient size to hold a Router
-		// Advertisement?
-		if len(p) < header.NDPRAMinimumSize {
-			// ...No, silently drop the packet.
-			received.Invalid.Increment()
-			return
-		}
-
-		ra := header.NDPRouterAdvert(p)
+		// The remainder of payload must be only the router advertisement, so
+		// payload.ToView() always returns the advertisement. Per RFC 6980 section
+		// 5, NDP messages cannot be fragmented. Also note that in the common case
+		// NDP datagrams are very small and ToView() will not incur allocations.
+		ra := header.NDPRouterAdvert(payload.ToView())
 		opts := ra.Options()
 
 		// Are options valid as per the wire format?
@@ -395,8 +465,6 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 		// as RFC 4861 section 6.1.2 is concerned.
 		//
 
-		received.RouterAdvert.Increment()
-
 		// Tell the NIC to handle the RA.
 		stack := r.Stack()
 		rxNICID := r.NICID()
@@ -404,6 +472,10 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.P
 
 	case header.ICMPv6RedirectMsg:
 		received.RedirectMsg.Increment()
+		if !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
 
 	default:
 		received.Invalid.Increment()
@@ -463,7 +535,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	})
 
 	// TODO(stijlist): count this in ICMP stats.
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, tcpip.PacketBuffer{
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 	})
 }
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 50c4b6474..52a01b44e 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -32,7 +32,8 @@ import (
 
 const (
 	linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
-	linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f")
+	linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e")
+	linkAddr2 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f")
 )
 
 var (
@@ -56,7 +57,7 @@ func (*stubLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 	return ""
 }
 
-func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, tcpip.PacketBuffer) *tcpip.Error {
+func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error {
 	return nil
 }
 
@@ -66,7 +67,7 @@ type stubDispatcher struct {
 	stack.TransportDispatcher
 }
 
-func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, tcpip.PacketBuffer) {
+func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, *stack.PacketBuffer) {
 }
 
 type stubLinkAddressCache struct {
@@ -165,7 +166,8 @@ func TestICMPCounts(t *testing.T) {
 		},
 		{
 			typ:  header.ICMPv6NeighborSolicit,
-			size: header.ICMPv6NeighborSolicitMinimumSize},
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+		},
 		{
 			typ:       header.ICMPv6NeighborAdvert,
 			size:      header.ICMPv6NeighborAdvertMinimumSize,
@@ -177,36 +179,32 @@ func TestICMPCounts(t *testing.T) {
 		},
 	}
 
-	handleIPv6Payload := func(hdr buffer.Prependable) {
-		payloadLength := hdr.UsedLength()
-		ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+	handleIPv6Payload := func(icmp header.ICMPv6) {
+		ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
 		ip.Encode(&header.IPv6Fields{
-			PayloadLength: uint16(payloadLength),
+			PayloadLength: uint16(len(icmp)),
 			NextHeader:    uint8(header.ICMPv6ProtocolNumber),
 			HopLimit:      header.NDPHopLimit,
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(&r, tcpip.PacketBuffer{
-			Data: hdr.View().ToVectorisedView(),
+		ep.HandlePacket(&r, &stack.PacketBuffer{
+			NetworkHeader: buffer.View(ip),
+			Data:          buffer.View(icmp).ToVectorisedView(),
 		})
 	}
 
 	for _, typ := range types {
-		extraDataLen := len(typ.extraData)
-		hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
-		extraData := buffer.View(hdr.Prepend(extraDataLen))
-		copy(extraData, typ.extraData)
-		pkt := header.ICMPv6(hdr.Prepend(typ.size))
-		pkt.SetType(typ.typ)
-		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
-
-		handleIPv6Payload(hdr)
+		icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+		copy(icmp[typ.size:], typ.extraData)
+		icmp.SetType(typ.typ)
+		icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+		handleIPv6Payload(icmp)
 	}
 
 	// Construct an empty ICMP packet so that
 	// Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented.
-	handleIPv6Payload(buffer.NewPrependable(header.IPv6MinimumSize))
+	handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize)))
 
 	icmpv6Stats := s.Stats().ICMP.V6PacketsReceived
 	visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) {
@@ -326,7 +324,7 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		views := []buffer.View{pi.Pkt.Header.View(), pi.Pkt.Data.ToView()}
 		size := pi.Pkt.Header.UsedLength() + pi.Pkt.Data.Size()
 		vv := buffer.NewVectorisedView(size, views)
-		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), tcpip.PacketBuffer{
+		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), &stack.PacketBuffer{
 			Data: vv,
 		})
 	}
@@ -544,25 +542,22 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 			}
 
 			handleIPv6Payload := func(checksum bool) {
-				extraDataLen := len(typ.extraData)
-				hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
-				extraData := buffer.View(hdr.Prepend(extraDataLen))
-				copy(extraData, typ.extraData)
-				pkt := header.ICMPv6(hdr.Prepend(typ.size))
-				pkt.SetType(typ.typ)
+				icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+				copy(icmp[typ.size:], typ.extraData)
+				icmp.SetType(typ.typ)
 				if checksum {
-					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, extraData.ToVectorisedView()))
+					icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView()))
 				}
-				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
 				ip.Encode(&header.IPv6Fields{
-					PayloadLength: uint16(typ.size + extraDataLen),
+					PayloadLength: uint16(len(icmp)),
 					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
 					HopLimit:      header.NDPHopLimit,
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
-					Data: hdr.View().ToVectorisedView(),
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+					Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}),
 				})
 			}
 
@@ -738,7 +733,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 					Data: hdr.View().ToVectorisedView(),
 				})
 			}
@@ -916,7 +911,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 					Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}),
 				})
 			}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 180a480fd..95fbcf2d1 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -21,11 +21,14 @@
 package ipv6
 
 import (
+	"fmt"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
+	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )
 
@@ -49,6 +52,7 @@ type endpoint struct {
 	linkEP        stack.LinkEndpoint
 	linkAddrCache stack.LinkAddressCache
 	dispatcher    stack.TransportDispatcher
+	fragmentation *fragmentation.Fragmentation
 	protocol      *protocol
 }
 
@@ -112,7 +116,7 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
@@ -124,7 +128,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
-		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+		e.HandlePacket(&loopedR, &stack.PacketBuffer{
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 
@@ -139,19 +143,17 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("not implemented")
 	}
 	if r.Loop&stack.PacketOut == 0 {
-		return len(pkts), nil
+		return pkts.Len(), nil
 	}
 
-	for i := range pkts {
-		hdr := &pkts[i].Header
-		size := pkts[i].DataSize
-		ip := e.addIPHeader(r, hdr, size, params)
-		pkts[i].NetworkHeader = buffer.View(ip)
+	for pb := pkts.Front(); pb != nil; pb = pb.Next() {
+		ip := e.addIPHeader(r, &pb.Header, pb.Data.Size(), params)
+		pb.NetworkHeader = buffer.View(ip)
 	}
 
 	n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
@@ -161,37 +163,273 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 
 // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
 // supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	// TODO(b/146666412): Support IPv6 header-included packets.
 	return tcpip.ErrNotSupported
 }
 
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
-	headerView := pkt.Data.First()
-	h := header.IPv6(headerView)
-	if !h.IsValid(pkt.Data.Size()) {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	h := header.IPv6(pkt.NetworkHeader)
+	if !h.IsValid(pkt.Data.Size() + len(pkt.NetworkHeader) + len(pkt.TransportHeader)) {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
 
-	pkt.NetworkHeader = headerView[:header.IPv6MinimumSize]
-	pkt.Data.TrimFront(header.IPv6MinimumSize)
-	pkt.Data.CapLength(int(h.PayloadLength()))
-
-	p := h.TransportProtocol()
-	if p == header.ICMPv6ProtocolNumber {
-		e.handleICMP(r, headerView, pkt)
-		return
+	// vv consists of:
+	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
+	// - The transport header, if present.
+	// - Any other payload data.
+	vv := pkt.NetworkHeader[header.IPv6MinimumSize:].ToVectorisedView()
+	vv.AppendView(pkt.TransportHeader)
+	vv.Append(pkt.Data)
+	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv)
+	hasFragmentHeader := false
+
+	for firstHeader := true; ; firstHeader = false {
+		extHdr, done, err := it.Next()
+		if err != nil {
+			r.Stats().IP.MalformedPacketsReceived.Increment()
+			return
+		}
+		if done {
+			break
+		}
+
+		switch extHdr := extHdr.(type) {
+		case header.IPv6HopByHopOptionsExtHdr:
+			// As per RFC 8200 section 4.1, the Hop By Hop extension header is
+			// restricted to appear immediately after an IPv6 fixed header.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1
+			// (unrecognized next header) error in response to an extension header's
+			// Next Header field with the Hop By Hop extension header identifier.
+			if !firstHeader {
+				return
+			}
+
+			optsIt := extHdr.Iter()
+
+			for {
+				opt, done, err := optsIt.Next()
+				if err != nil {
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					return
+				}
+				if done {
+					break
+				}
+
+				// We currently do not support any IPv6 Hop By Hop extension header
+				// options.
+				switch opt.UnknownAction() {
+				case header.IPv6OptionUnknownActionSkip:
+				case header.IPv6OptionUnknownActionDiscard:
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMP:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				default:
+					panic(fmt.Sprintf("unrecognized action for an unrecognized Hop By Hop extension header option = %d", opt))
+				}
+			}
+
+		case header.IPv6RoutingExtHdr:
+			// As per RFC 8200 section 4.4, if a node encounters a routing header with
+			// an unrecognized routing type value, with a non-zero Segments Left
+			// value, the node must discard the packet and send an ICMP Parameter
+			// Problem, Code 0. If the Segments Left is 0, the node must ignore the
+			// Routing extension header and process the next header in the packet.
+			//
+			// Note, the stack does not yet handle any type of routing extension
+			// header, so we just make sure Segments Left is zero before processing
+			// the next extension header.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 0 for
+			// unrecognized routing types with a non-zero Segments Left value.
+			if extHdr.SegmentsLeft() != 0 {
+				return
+			}
+
+		case header.IPv6FragmentExtHdr:
+			hasFragmentHeader = true
+
+			if extHdr.IsAtomic() {
+				// This fragment extension header indicates that this packet is an
+				// atomic fragment. An atomic fragment is a fragment that contains
+				// all the data required to reassemble a full packet. As per RFC 6946,
+				// atomic fragments must not interfere with "normal" fragmented traffic
+				// so we skip processing the fragment instead of feeding it through the
+				// reassembly process below.
+				continue
+			}
+
+			// Don't consume the iterator if we have the first fragment because we
+			// will use it to validate that the first fragment holds the upper layer
+			// header.
+			rawPayload := it.AsRawHeader(extHdr.FragmentOffset() != 0 /* consume */)
+
+			if extHdr.FragmentOffset() == 0 {
+				// Check that the iterator ends with a raw payload as the first fragment
+				// should include all headers up to and including any upper layer
+				// headers, as per RFC 8200 section 4.5; only upper layer data
+				// (non-headers) should follow the fragment extension header.
+				var lastHdr header.IPv6PayloadHeader
+
+				for {
+					it, done, err := it.Next()
+					if err != nil {
+						r.Stats().IP.MalformedPacketsReceived.Increment()
+						r.Stats().IP.MalformedPacketsReceived.Increment()
+						return
+					}
+					if done {
+						break
+					}
+
+					lastHdr = it
+				}
+
+				// If the last header is a raw header, then the last portion of the IPv6
+				// payload is not a known IPv6 extension header. Note, this does not
+				// mean that the last portion is an upper layer header or not an
+				// extension header because:
+				//  1) we do not yet support all extension headers
+				//  2) we do not validate the upper layer header before reassembling.
+				//
+				// This check makes sure that a known IPv6 extension header is not
+				// present after the Fragment extension header in a non-initial
+				// fragment.
+				//
+				// TODO(#2196): Support IPv6 Authentication and Encapsulated
+				// Security Payload extension headers.
+				// TODO(#2333): Validate that the upper layer header is valid.
+				switch lastHdr.(type) {
+				case header.IPv6RawPayloadHeader:
+				default:
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					r.Stats().IP.MalformedFragmentsReceived.Increment()
+					return
+				}
+			}
+
+			fragmentPayloadLen := rawPayload.Buf.Size()
+			if fragmentPayloadLen == 0 {
+				// Drop the packet as it's marked as a fragment but has no payload.
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			// The packet is a fragment, let's try to reassemble it.
+			start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit
+			last := start + uint16(fragmentPayloadLen) - 1
+
+			// Drop the packet if the fragmentOffset is incorrect. i.e the
+			// combination of fragmentOffset and pkt.Data.size() causes a
+			// wrap around resulting in last being less than the offset.
+			if last < start {
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			var ready bool
+			// Note that pkt doesn't have its transport header set after reassembly,
+			// and won't until DeliverNetworkPacket sets it.
+			pkt.Data, ready, err = e.fragmentation.Process(hash.IPv6FragmentHash(h, extHdr.ID()), start, last, extHdr.More(), rawPayload.Buf)
+			if err != nil {
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			if ready {
+				// We create a new iterator with the reassembled packet because we could
+				// have more extension headers in the reassembled payload, as per RFC
+				// 8200 section 4.5.
+				it = header.MakeIPv6PayloadIterator(rawPayload.Identifier, pkt.Data)
+			}
+
+		case header.IPv6DestinationOptionsExtHdr:
+			optsIt := extHdr.Iter()
+
+			for {
+				opt, done, err := optsIt.Next()
+				if err != nil {
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					return
+				}
+				if done {
+					break
+				}
+
+				// We currently do not support any IPv6 Destination extension header
+				// options.
+				switch opt.UnknownAction() {
+				case header.IPv6OptionUnknownActionSkip:
+				case header.IPv6OptionUnknownActionDiscard:
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMP:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				default:
+					panic(fmt.Sprintf("unrecognized action for an unrecognized Destination extension header option = %d", opt))
+				}
+			}
+
+		case header.IPv6RawPayloadHeader:
+			// If the last header in the payload isn't a known IPv6 extension header,
+			// handle it as if it is transport layer data.
+
+			// For unfragmented packets, extHdr still contains the transport header.
+			// Get rid of it.
+			//
+			// For reassembled fragments, pkt.TransportHeader is unset, so this is a
+			// no-op and pkt.Data begins with the transport header.
+			extHdr.Buf.TrimFront(len(pkt.TransportHeader))
+			pkt.Data = extHdr.Buf
+
+			if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber {
+				e.handleICMP(r, pkt, hasFragmentHeader)
+			} else {
+				r.Stats().IP.PacketsDelivered.Increment()
+				// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
+				// in response to unrecognized next header values.
+				e.dispatcher.DeliverTransportPacket(r, p, pkt)
+			}
+
+		default:
+			// If we receive a packet for an extension header we do not yet handle,
+			// drop the packet for now.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
+			// in response to unrecognized next header values.
+			r.Stats().UnknownProtocolRcvdPackets.Increment()
+			return
+		}
 	}
-
-	r.Stats().IP.PacketsDelivered.Increment()
-	e.dispatcher.DeliverTransportPacket(r, p, pkt)
 }
 
 // Close cleans up resources associated with the endpoint.
 func (*endpoint) Close() {}
 
+// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
+func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return e.protocol.Number()
+}
+
 type protocol struct {
 	// defaultTTL is the current default TTL for the protocol. Only the
 	// uint8 portion of it is meaningful and it must be accessed
@@ -229,6 +467,7 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWi
 		linkEP:        linkEP,
 		linkAddrCache: linkAddrCache,
 		dispatcher:    dispatcher,
+		fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout),
 		protocol:      p,
 	}, nil
 }
@@ -265,6 +504,85 @@ func (p *protocol) DefaultTTL() uint8 {
 	return uint8(atomic.LoadUint32(&p.defaultTTL))
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return 0, false, false
+	}
+	ipHdr := header.IPv6(hdr)
+
+	// dataClone consists of:
+	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
+	// - The transport header, if present.
+	// - Any other payload data.
+	views := [8]buffer.View{}
+	dataClone := pkt.Data.Clone(views[:])
+	dataClone.TrimFront(header.IPv6MinimumSize)
+	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone)
+
+	// Iterate over the IPv6 extensions to find their length.
+	//
+	// Parsing occurs again in HandlePacket because we don't track the
+	// extensions in PacketBuffer. Unfortunately, that means HandlePacket
+	// has to do the parsing work again.
+	var nextHdr tcpip.TransportProtocolNumber
+	foundNext := true
+	extensionsSize := 0
+traverseExtensions:
+	for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() {
+		if err != nil {
+			break
+		}
+		// If we exhaust the extension list, the entire packet is the IPv6 header
+		// and (possibly) extensions.
+		if done {
+			extensionsSize = dataClone.Size()
+			foundNext = false
+			break
+		}
+
+		switch extHdr := extHdr.(type) {
+		case header.IPv6FragmentExtHdr:
+			// If this is an atomic fragment, we don't have to treat it specially.
+			if !extHdr.More() && extHdr.FragmentOffset() == 0 {
+				continue
+			}
+			// This is a non-atomic fragment and has to be re-assembled before we can
+			// examine the payload for a transport header.
+			foundNext = false
+
+		case header.IPv6RawPayloadHeader:
+			// We've found the payload after any extensions.
+			extensionsSize = dataClone.Size() - extHdr.Buf.Size()
+			nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
+			break traverseExtensions
+
+		default:
+			// Any other extension is a no-op, keep looping until we find the payload.
+		}
+	}
+
+	// Put the IPv6 header with extensions in pkt.NetworkHeader.
+	hdr, ok = pkt.Data.PullUp(header.IPv6MinimumSize + extensionsSize)
+	if !ok {
+		panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size()))
+	}
+	ipHdr = header.IPv6(hdr)
+
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	pkt.Data.CapLength(int(ipHdr.PayloadLength()))
+
+	return nextHdr, foundNext, true
+}
+
 // calculateMTU calculates the network-layer payload MTU based on the link-layer
 // payload mtu.
 func calculateMTU(mtu uint32) uint32 {
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 1cbfa7278..213ff64f2 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -17,6 +17,7 @@ package ipv6
 import (
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -33,6 +34,15 @@ const (
 	// The least significant 3 bytes are the same as addr2 so both addr2 and
 	// addr3 will have the same solicited-node address.
 	addr3 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x02"
+	addr4 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x03"
+
+	// Tests use the extension header identifier values as uint8 instead of
+	// header.IPv6ExtensionHeaderIdentifier.
+	hopByHopExtHdrID    = uint8(header.IPv6HopByHopOptionsExtHdrIdentifier)
+	routingExtHdrID     = uint8(header.IPv6RoutingExtHdrIdentifier)
+	fragmentExtHdrID    = uint8(header.IPv6FragmentExtHdrIdentifier)
+	destinationExtHdrID = uint8(header.IPv6DestinationOptionsExtHdrIdentifier)
+	noNextHdrID         = uint8(header.IPv6NoNextHeaderIdentifier)
 )
 
 // testReceiveICMP tests receiving an ICMP packet from src to dst. want is the
@@ -55,7 +65,7 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+	e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 		Data: hdr.View().ToVectorisedView(),
 	})
 
@@ -113,7 +123,7 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+	e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 		Data: hdr.View().ToVectorisedView(),
 	})
 
@@ -158,6 +168,8 @@ func TestReceiveOnAllNodesMulticastAddr(t *testing.T) {
 // packets destined to the IPv6 solicited-node address of an assigned IPv6
 // address.
 func TestReceiveOnSolicitedNodeAddr(t *testing.T) {
+	const nicID = 1
+
 	tests := []struct {
 		name            string
 		protocolFactory stack.TransportProtocol
@@ -175,50 +187,61 @@ func TestReceiveOnSolicitedNodeAddr(t *testing.T) {
 				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
 				TransportProtocols: []stack.TransportProtocol{test.protocolFactory},
 			})
-			e := channel.New(10, 1280, linkAddr1)
-			if err := s.CreateNIC(1, e); err != nil {
-				t.Fatalf("CreateNIC(_) = %s", err)
+			e := channel.New(1, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
-			// Should not receive a packet destined to the solicited
-			// node address of addr2/addr3 yet as we haven't added
-			// those addresses.
+			s.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: header.IPv6EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
+			// Should not receive a packet destined to the solicited node address of
+			// addr2/addr3 yet as we haven't added those addresses.
 			test.rxf(t, s, e, addr1, snmc, 0)
 
-			if err := s.AddAddress(1, ProtocolNumber, addr2); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, addr2, err)
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
 			}
 
-			// Should receive a packet destined to the solicited
-			// node address of addr2/addr3 now that we have added
-			// added addr2.
+			// Should receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have added added addr2.
 			test.rxf(t, s, e, addr1, snmc, 1)
 
-			if err := s.AddAddress(1, ProtocolNumber, addr3); err != nil {
-				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, addr3, err)
+			if err := s.AddAddress(nicID, ProtocolNumber, addr3); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr3, err)
 			}
 
-			// Should still receive a packet destined to the
-			// solicited node address of addr2/addr3 now that we
-			// have added addr3.
+			// Should still receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have added addr3.
 			test.rxf(t, s, e, addr1, snmc, 2)
 
-			if err := s.RemoveAddress(1, addr2); err != nil {
-				t.Fatalf("RemoveAddress(_, %s) = %s", addr2, err)
+			if err := s.RemoveAddress(nicID, addr2); err != nil {
+				t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr2, err)
 			}
 
-			// Should still receive a packet destined to the
-			// solicited node address of addr2/addr3 now that we
-			// have removed addr2.
+			// Should still receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have removed addr2.
 			test.rxf(t, s, e, addr1, snmc, 3)
 
-			if err := s.RemoveAddress(1, addr3); err != nil {
-				t.Fatalf("RemoveAddress(_, %s) = %s", addr3, err)
+			// Make sure addr3's endpoint does not get removed from the NIC by
+			// incrementing its reference count with a route.
+			r, err := s.FindRoute(nicID, addr3, addr4, ProtocolNumber, false)
+			if err != nil {
+				t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr3, addr4, ProtocolNumber, err)
+			}
+			defer r.Release()
+
+			if err := s.RemoveAddress(nicID, addr3); err != nil {
+				t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr3, err)
 			}
 
-			// Should not receive a packet destined to the solicited
-			// node address of addr2/addr3 yet as both of them got
-			// removed.
+			// Should not receive a packet destined to the solicited node address of
+			// addr2/addr3 yet as both of them got removed, even though a route using
+			// addr3 exists.
 			test.rxf(t, s, e, addr1, snmc, 3)
 		})
 	}
@@ -268,3 +291,975 @@ func TestAddIpv6Address(t *testing.T) {
 		})
 	}
 }
+
+func TestReceiveIPv6ExtHdrs(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name         string
+		extHdr       func(nextHdr uint8) ([]byte, uint8)
+		shouldAccept bool
+	}{
+		{
+			name:         "None",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, nextHdr },
+			shouldAccept: true,
+		},
+		{
+			name: "hopbyhop with unknown option skippable action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Skippable unknown.
+					62, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "hopbyhop with unknown option discard action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard unknown.
+					127, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop with unknown option discard and send icmp action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP if option is unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop with unknown option discard and send icmp action unless multicast dest",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP unless packet is for multicast destination if
+					// option is unknown.
+					255, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name:         "routing with zero segments left",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 2, 3, 4, 5}, routingExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "routing with non-zero segments left",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 1, 2, 3, 4, 5}, routingExtHdrID },
+			shouldAccept: false,
+		},
+		{
+			name:         "atomic fragment with zero ID",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 0, 0, 0, 0}, fragmentExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "atomic fragment with non-zero ID",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 1, 2, 3, 4}, fragmentExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "fragment",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 1, 2, 3, 4}, fragmentExtHdrID },
+			shouldAccept: false,
+		},
+		{
+			name:         "No next header",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID },
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option skippable action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Skippable unknown.
+					62, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "destination with unknown option discard action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard unknown.
+					127, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option discard and send icmp action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP if option is unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option discard and send icmp action unless multicast dest",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP unless packet is for multicast destination if
+					// option is unknown.
+					255, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "routing - atomic fragment",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					nextHdr, 0, 0, 0, 1, 2, 3, 4,
+				}, routingExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "atomic fragment - routing",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Fragment extension header.
+					routingExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Routing extension header.
+					nextHdr, 0, 1, 0, 2, 3, 4, 5,
+				}, fragmentExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "hop by hop (with skippable unknown) - routing",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					nextHdr, 0, 1, 0, 2, 3, 4, 5,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "routing - hop by hop (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Routing extension header.
+					hopByHopExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Hop By Hop extension header with skippable unknown option.
+					nextHdr, 0, 62, 4, 1, 2, 3, 4,
+				}, routingExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name:         "No next header",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID },
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with skippable unknown option.
+					nextHdr, 0, 63, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "hopbyhop (with discard unknown) - routing - atomic fragment - destination (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with discard action for unknown option.
+					routingExtHdrID, 0, 65, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with skippable unknown option.
+					nextHdr, 0, 63, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with discard unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with discard action for unknown
+					// option.
+					nextHdr, 0, 65, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
+			}
+
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, ProtocolNumber, err)
+			}
+			defer ep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80}
+			if err := ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%+v): %s", bindAddr, err)
+			}
+
+			udpPayload := []byte{1, 2, 3, 4, 5, 6, 7, 8}
+			udpLength := header.UDPMinimumSize + len(udpPayload)
+			extHdrBytes, ipv6NextHdr := test.extHdr(uint8(header.UDPProtocolNumber))
+			extHdrLen := len(extHdrBytes)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + extHdrLen + udpLength)
+
+			// Serialize UDP message.
+			u := header.UDP(hdr.Prepend(udpLength))
+			u.Encode(&header.UDPFields{
+				SrcPort: 5555,
+				DstPort: 80,
+				Length:  uint16(udpLength),
+			})
+			copy(u.Payload(), udpPayload)
+			sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+			sum = header.Checksum(udpPayload, sum)
+			u.SetChecksum(^u.CalculateChecksum(sum))
+
+			// Copy extension header bytes between the UDP message and the IPv6
+			// fixed header.
+			copy(hdr.Prepend(extHdrLen), extHdrBytes)
+
+			// Serialize IPv6 fixed header.
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    ipv6NextHdr,
+				HopLimit:      255,
+				SrcAddr:       addr1,
+				DstAddr:       addr2,
+			})
+
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			stats := s.Stats().UDP.PacketsReceived
+
+			if !test.shouldAccept {
+				if got := stats.Value(); got != 0 {
+					t.Errorf("got UDP Rx Packets = %d, want = 0", got)
+				}
+
+				return
+			}
+
+			// Expect a UDP packet.
+			if got := stats.Value(); got != 1 {
+				t.Errorf("got UDP Rx Packets = %d, want = 1", got)
+			}
+			gotPayload, _, err := ep.Read(nil)
+			if err != nil {
+				t.Fatalf("Read(nil): %s", err)
+			}
+			if diff := cmp.Diff(buffer.View(udpPayload), gotPayload); diff != "" {
+				t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff)
+			}
+
+			// Should not have any more UDP packets.
+			if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock {
+				t.Fatalf("got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+			}
+		})
+	}
+}
+
+// fragmentData holds the IPv6 payload for a fragmented IPv6 packet.
+type fragmentData struct {
+	nextHdr uint8
+	data    buffer.VectorisedView
+}
+
+func TestReceiveIPv6Fragments(t *testing.T) {
+	const nicID = 1
+	const udpPayload1Length = 256
+	const udpPayload2Length = 128
+	const fragmentExtHdrLen = 8
+	// Note, not all routing extension headers will be 8 bytes but this test
+	// uses 8 byte routing extension headers for most sub tests.
+	const routingExtHdrLen = 8
+
+	udpGen := func(payload []byte, multiplier uint8) buffer.View {
+		payloadLen := len(payload)
+		for i := 0; i < payloadLen; i++ {
+			payload[i] = uint8(i) * multiplier
+		}
+
+		udpLength := header.UDPMinimumSize + payloadLen
+
+		hdr := buffer.NewPrependable(udpLength)
+		u := header.UDP(hdr.Prepend(udpLength))
+		u.Encode(&header.UDPFields{
+			SrcPort: 5555,
+			DstPort: 80,
+			Length:  uint16(udpLength),
+		})
+		copy(u.Payload(), payload)
+		sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+		sum = header.Checksum(payload, sum)
+		u.SetChecksum(^u.CalculateChecksum(sum))
+		return hdr.View()
+	}
+
+	var udpPayload1Buf [udpPayload1Length]byte
+	udpPayload1 := udpPayload1Buf[:]
+	ipv6Payload1 := udpGen(udpPayload1, 1)
+
+	var udpPayload2Buf [udpPayload2Length]byte
+	udpPayload2 := udpPayload2Buf[:]
+	ipv6Payload2 := udpGen(udpPayload2, 2)
+
+	tests := []struct {
+		name             string
+		expectedPayload  []byte
+		fragments        []fragmentData
+		expectedPayloads [][]byte
+	}{
+		{
+			name: "No fragmentation",
+			fragments: []fragmentData{
+				{
+					nextHdr: uint8(header.UDPProtocolNumber),
+					data:    ipv6Payload1.ToVectorisedView(),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Atomic fragment",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 0}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with different IDs",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 2}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with per-fragment routing header with zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with per-fragment routing header with non-zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 1, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 1, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 9, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 72, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with routing header with zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 0, 2, 3, 4, 5}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 9, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with routing header with non-zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 1, 2, 3, 4, 5}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 9, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with routing header with zero segments left across fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is fragmentExtHdrLen+8 because the
+						// first 8 bytes of the 16 byte routing extension header is in
+						// this fragment.
+						fragmentExtHdrLen+8,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header (part 1)
+							//
+							// Segments left = 0.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 1, 1, 0, 2, 3, 4, 5}),
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is
+						// fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of
+						// the 16 byte routing extension header is in this fagment.
+						fragmentExtHdrLen+8+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 1, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 8, 0, 0, 0, 1}),
+
+							// Routing extension header (part 2)
+							buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with routing header with non-zero segments left across fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is fragmentExtHdrLen+8 because the
+						// first 8 bytes of the 16 byte routing extension header is in
+						// this fragment.
+						fragmentExtHdrLen+8,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header (part 1)
+							//
+							// Segments left = 1.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 1, 1, 1, 2, 3, 4, 5}),
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is
+						// fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of
+						// the 16 byte routing extension header is in this fagment.
+						fragmentExtHdrLen+8+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 1, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 8, 0, 0, 0, 1}),
+
+							// Routing extension header (part 2)
+							buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		// As per RFC 6946, IPv6 atomic fragments MUST NOT interfere with "normal"
+		// fragmented traffic.
+		{
+			name: "Two fragments with atomic",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				// This fragment has the same ID as the other fragments but is an atomic
+				// fragment. It should not interfere with the other fragments.
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload2),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 1}),
+
+							ipv6Payload2,
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload2, udpPayload1},
+		},
+		{
+			name: "Two interleaved fragmented packets",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+32,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 2}),
+
+							ipv6Payload2[:32],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload2)-32,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 4, More = false, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 32, 0, 0, 0, 2}),
+
+							ipv6Payload2[32:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1, udpPayload2},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
+			}
+
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, ProtocolNumber, err)
+			}
+			defer ep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80}
+			if err := ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%+v): %s", bindAddr, err)
+			}
+
+			for _, f := range test.fragments {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize)
+
+				// Serialize IPv6 fixed header.
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(f.data.Size()),
+					NextHeader:    f.nextHdr,
+					HopLimit:      255,
+					SrcAddr:       addr1,
+					DstAddr:       addr2,
+				})
+
+				vv := hdr.View().ToVectorisedView()
+				vv.Append(f.data)
+
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+					Data: vv,
+				})
+			}
+
+			if got, want := s.Stats().UDP.PacketsReceived.Value(), uint64(len(test.expectedPayloads)); got != want {
+				t.Errorf("got UDP Rx Packets = %d, want = %d", got, want)
+			}
+
+			for i, p := range test.expectedPayloads {
+				gotPayload, _, err := ep.Read(nil)
+				if err != nil {
+					t.Fatalf("(i=%d) Read(nil): %s", i, err)
+				}
+				if diff := cmp.Diff(buffer.View(p), gotPayload); diff != "" {
+					t.Errorf("(i=%d) got UDP payload mismatch (-want +got):\n%s", i, diff)
+				}
+			}
+
+			if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock {
+				t.Fatalf("(last) got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index c9395de52..64239ce9a 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -20,6 +20,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -135,7 +136,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -173,6 +174,257 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 	}
 }
 
+func TestNeighorSolicitationResponse(t *testing.T) {
+	const nicID = 1
+	nicAddr := lladdr0
+	remoteAddr := lladdr1
+	nicAddrSNMC := header.SolicitedNodeAddr(nicAddr)
+	nicLinkAddr := linkAddr0
+	remoteLinkAddr0 := linkAddr1
+	remoteLinkAddr1 := linkAddr2
+
+	tests := []struct {
+		name          string
+		nsOpts        header.NDPOptionsSerializer
+		nsSrcLinkAddr tcpip.LinkAddress
+		nsSrc         tcpip.Address
+		nsDst         tcpip.Address
+		nsInvalid     bool
+		naDstLinkAddr tcpip.LinkAddress
+		naSolicited   bool
+		naSrc         tcpip.Address
+		naDst         tcpip.Address
+	}{
+		{
+			name:          "Unspecified source to multicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   false,
+			naSrc:         nicAddr,
+			naDst:         header.IPv6AllNodesMulticastAddress,
+		},
+		{
+			name: "Unspecified source with source ll option to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+		{
+			name:          "Unspecified source to unicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   false,
+			naSrc:         nicAddr,
+			naDst:         header.IPv6AllNodesMulticastAddress,
+		},
+		{
+			name: "Unspecified source with source ll option to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddr,
+			nsInvalid:     true,
+		},
+
+		{
+			name: "Specified source with 1 source ll to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll different from route to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr1,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name:          "Specified source to multicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+		{
+			name: "Specified source with 2 source ll to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+
+		{
+			name:          "Specified source to unicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll different from route to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr1,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 2 source ll to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			e := channel.New(1, 1280, nicLinkAddr)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err)
+			}
+
+			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length()
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+			pkt.SetType(header.ICMPv6NeighborSolicit)
+			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns.SetTargetAddress(nicAddr)
+			opts := ns.Options()
+			opts.Serialize(test.nsOpts)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       test.nsSrc,
+				DstAddr:       test.nsDst,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			if test.nsInvalid {
+				if got := invalid.Value(); got != 1 {
+					t.Fatalf("got invalid = %d, want = 1", got)
+				}
+
+				if p, got := e.Read(); got {
+					t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt)
+				}
+
+				// If we expected the NS to be invalid, we have nothing else to check.
+				return
+			}
+
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			p, got := e.Read()
+			if !got {
+				t.Fatal("expected an NDP NA response")
+			}
+
+			if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
+				t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
+			}
+
+			checker.IPv6(t, p.Pkt.Header.View(),
+				checker.SrcAddr(test.naSrc),
+				checker.DstAddr(test.naDst),
+				checker.TTL(header.NDPHopLimit),
+				checker.NDPNA(
+					checker.NDPNASolicitedFlag(test.naSolicited),
+					checker.NDPNATargetAddress(nicAddr),
+					checker.NDPNAOptions([]header.NDPOption{
+						header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]),
+					}),
+				))
+		})
+	}
+}
+
 // TestNeighorAdvertisementWithTargetLinkLayerOption tests that receiving a
 // valid NDP NA message with the Target Link Layer Address option results in a
 // new entry in the link address cache for the target of the message.
@@ -197,6 +449,13 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 			name:    "Invalid Length",
 			optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7},
 		},
+		{
+			name: "Multiple",
+			optsBuf: []byte{
+				2, 1, 2, 3, 4, 5, 6, 7,
+				2, 1, 2, 3, 4, 5, 6, 8,
+			},
+		},
 	}
 
 	for _, test := range tests {
@@ -238,7 +497,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -276,9 +535,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 	}
 }
 
-// TestHopLimitValidation is a test that makes sure that NDP packets are only
-// received if their IP header's hop limit is set to 255.
-func TestHopLimitValidation(t *testing.T) {
+func TestNDPValidation(t *testing.T) {
 	setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) {
 		t.Helper()
 
@@ -294,18 +551,29 @@ func TestHopLimitValidation(t *testing.T) {
 		return s, ep, r
 	}
 
-	handleIPv6Payload := func(hdr buffer.Prependable, hopLimit uint8, ep stack.NetworkEndpoint, r *stack.Route) {
-		payloadLength := hdr.UsedLength()
-		ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+	handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
+		nextHdr := uint8(header.ICMPv6ProtocolNumber)
+		var extensions buffer.View
+		if atomicFragment {
+			extensions = buffer.NewView(header.IPv6FragmentExtHdrLength)
+			extensions[0] = nextHdr
+			nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier)
+		}
+
+		ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize + len(extensions)))
 		ip.Encode(&header.IPv6Fields{
-			PayloadLength: uint16(payloadLength),
-			NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+			PayloadLength: uint16(len(payload) + len(extensions)),
+			NextHeader:    nextHdr,
 			HopLimit:      hopLimit,
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(r, tcpip.PacketBuffer{
-			Data: hdr.View().ToVectorisedView(),
+		if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) {
+			t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n)
+		}
+		ep.HandlePacket(r, &stack.PacketBuffer{
+			NetworkHeader: buffer.View(ip),
+			Data:          payload.ToVectorisedView(),
 		})
 	}
 
@@ -364,61 +632,90 @@ func TestHopLimitValidation(t *testing.T) {
 		},
 	}
 
+	subTests := []struct {
+		name           string
+		atomicFragment bool
+		hopLimit       uint8
+		code           uint8
+		valid          bool
+	}{
+		{
+			name:           "Valid",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit,
+			code:           0,
+			valid:          true,
+		},
+		{
+			name:           "Fragmented",
+			atomicFragment: true,
+			hopLimit:       header.NDPHopLimit,
+			code:           0,
+			valid:          false,
+		},
+		{
+			name:           "Invalid hop limit",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit - 1,
+			code:           0,
+			valid:          false,
+		},
+		{
+			name:           "Invalid ICMPv6 code",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit,
+			code:           1,
+			valid:          false,
+		},
+	}
+
 	for _, typ := range types {
 		t.Run(typ.name, func(t *testing.T) {
-			s, ep, r := setup(t)
-			defer r.Release()
-
-			stats := s.Stats().ICMP.V6PacketsReceived
-			invalid := stats.Invalid
-			typStat := typ.statCounter(stats)
-
-			extraDataLen := len(typ.extraData)
-			hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
-			extraData := buffer.View(hdr.Prepend(extraDataLen))
-			copy(extraData, typ.extraData)
-			pkt := header.ICMPv6(hdr.Prepend(typ.size))
-			pkt.SetType(typ.typ)
-			pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
-
-			// Invalid count should initially be 0.
-			if got := invalid.Value(); got != 0 {
-				t.Fatalf("got invalid = %d, want = 0", got)
-			}
-
-			// Should not have received any ICMPv6 packets with
-			// type = typ.typ.
-			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
-			}
-
-			// Receive the NDP packet with an invalid hop limit
-			// value.
-			handleIPv6Payload(hdr, header.NDPHopLimit-1, ep, &r)
-
-			// Invalid count should have increased.
-			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
-			}
-
-			// Rx count of NDP packet of type typ.typ should not
-			// have increased.
-			if got := typStat.Value(); got != 0 {
-				t.Fatalf("got %s = %d, want = 0", typ.name, got)
-			}
-
-			// Receive the NDP packet with a valid hop limit value.
-			handleIPv6Payload(hdr, header.NDPHopLimit, ep, &r)
-
-			// Rx count of NDP packet of type typ.typ should have
-			// increased.
-			if got := typStat.Value(); got != 1 {
-				t.Fatalf("got %s = %d, want = 1", typ.name, got)
-			}
-
-			// Invalid count should not have increased again.
-			if got := invalid.Value(); got != 1 {
-				t.Fatalf("got invalid = %d, want = 1", got)
+			for _, test := range subTests {
+				t.Run(test.name, func(t *testing.T) {
+					s, ep, r := setup(t)
+					defer r.Release()
+
+					stats := s.Stats().ICMP.V6PacketsReceived
+					invalid := stats.Invalid
+					typStat := typ.statCounter(stats)
+
+					icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+					copy(icmp[typ.size:], typ.extraData)
+					icmp.SetType(typ.typ)
+					icmp.SetCode(test.code)
+					icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+
+					// Rx count of the NDP message should initially be 0.
+					if got := typStat.Value(); got != 0 {
+						t.Errorf("got %s = %d, want = 0", typ.name, got)
+					}
+
+					// Invalid count should initially be 0.
+					if got := invalid.Value(); got != 0 {
+						t.Errorf("got invalid = %d, want = 0", got)
+					}
+
+					if t.Failed() {
+						t.FailNow()
+					}
+
+					handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r)
+
+					// Rx count of the NDP packet should have increased.
+					if got := typStat.Value(); got != 1 {
+						t.Errorf("got %s = %d, want = 1", typ.name, got)
+					}
+
+					want := uint64(0)
+					if !test.valid {
+						// Invalid count should have increased.
+						want = 1
+					}
+					if got := invalid.Value(); got != want {
+						t.Errorf("got invalid = %d, want = %d", got, want)
+					}
+				})
 			}
 		})
 	}
@@ -588,25 +885,22 @@ func TestRouterAdvertValidation(t *testing.T) {
 				t.Fatalf("got rxRA = %d, want = 0", got)
 			}
 
-			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(header.IPv6ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
+			if got := rxRA.Value(); got != 1 {
+				t.Fatalf("got rxRA = %d, want = 1", got)
+			}
+
 			if test.expectedSuccess {
 				if got := invalid.Value(); got != 0 {
 					t.Fatalf("got invalid = %d, want = 0", got)
 				}
-				if got := rxRA.Value(); got != 1 {
-					t.Fatalf("got rxRA = %d, want = 1", got)
-				}
-
 			} else {
 				if got := invalid.Value(); got != 1 {
 					t.Fatalf("got invalid = %d, want = 1", got)
 				}
-				if got := rxRA.Value(); got != 0 {
-					t.Fatalf("got rxRA = %d, want = 0", got)
-				}
 			}
 		})
 	}
diff --git a/pkg/tcpip/packet_buffer_state.go b/pkg/tcpip/packet_buffer_state.go
deleted file mode 100644
index ad3cc24fa..000000000
--- a/pkg/tcpip/packet_buffer_state.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tcpip
-
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
-
-// beforeSave is invoked by stateify.
-func (pk *PacketBuffer) beforeSave() {
-	// Non-Data fields may be slices of the Data field. This causes
-	// problems for SR, so during save we make each header independent.
-	pk.Header = pk.Header.DeepCopy()
-	pk.LinkHeader = append(buffer.View(nil), pk.LinkHeader...)
-	pk.NetworkHeader = append(buffer.View(nil), pk.NetworkHeader...)
-	pk.TransportHeader = append(buffer.View(nil), pk.TransportHeader...)
-}
diff --git a/pkg/tcpip/seqnum/seqnum.go b/pkg/tcpip/seqnum/seqnum.go
index b40a3c212..d3bea7de4 100644
--- a/pkg/tcpip/seqnum/seqnum.go
+++ b/pkg/tcpip/seqnum/seqnum.go
@@ -46,11 +46,6 @@ func (v Value) InWindow(first Value, size Size) bool {
 	return v.InRange(first, first.Add(size))
 }
 
-// Overlap checks if the window [a,a+b) overlaps with the window [x, x+y).
-func Overlap(a Value, b Size, x Value, y Size) bool {
-	return a.LessThan(x.Add(y)) && x.LessThan(a.Add(b))
-}
-
 // Add calculates the sequence number following the [v, v+s) window.
 func (v Value) Add(s Size) Value {
 	return v + Value(s)
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 705cf01ee..f71073207 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -15,14 +15,35 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "packet_buffer_list",
+    out = "packet_buffer_list.go",
+    package = "stack",
+    prefix = "PacketBuffer",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*PacketBuffer",
+        "Linker": "*PacketBuffer",
+    },
+)
+
 go_library(
     name = "stack",
     srcs = [
+        "conntrack.go",
+        "dhcpv6configurationfromndpra_string.go",
+        "forwarder.go",
         "icmp_rate_limit.go",
+        "iptables.go",
+        "iptables_targets.go",
+        "iptables_types.go",
         "linkaddrcache.go",
         "linkaddrentry_list.go",
         "ndp.go",
         "nic.go",
+        "packet_buffer.go",
+        "packet_buffer_list.go",
+        "rand.go",
         "registration.go",
         "route.go",
         "stack.go",
@@ -32,6 +53,7 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/ilist",
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
@@ -39,9 +61,9 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
+        "//pkg/tcpip/transport/tcpconntrack",
         "//pkg/waiter",
         "@org_golang_x_time//rate:go_default_library",
     ],
@@ -63,7 +85,6 @@ go_test(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/link/channel",
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
@@ -79,6 +100,7 @@ go_test(
     name = "stack_test",
     size = "small",
     srcs = [
+        "forwarder_test.go",
         "linkaddrcache_test.go",
         "nic_test.go",
     ],
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
new file mode 100644
index 000000000..05bf62788
--- /dev/null
+++ b/pkg/tcpip/stack/conntrack.go
@@ -0,0 +1,434 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"encoding/binary"
+	"sync"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
+)
+
+// Connection tracking is used to track and manipulate packets for NAT rules.
+// The connection is created for a packet if it does not exist. Every connection
+// contains two tuples (original and reply). The tuples are manipulated if there
+// is a matching NAT rule. The packet is modified by looking at the tuples in the
+// Prerouting and Output hooks.
+
+// Direction of the tuple.
+type ctDirection int
+
+const (
+	dirOriginal ctDirection = iota
+	dirReply
+)
+
+// Status of connection.
+// TODO(gvisor.dev/issue/170): Add other states of connection.
+type connStatus int
+
+const (
+	connNew connStatus = iota
+	connEstablished
+)
+
+// Manipulation type for the connection.
+type manipType int
+
+const (
+	manipDstPrerouting manipType = iota
+	manipDstOutput
+)
+
+// connTrackMutable is the manipulatable part of the tuple.
+type connTrackMutable struct {
+	// addr is source address of the tuple.
+	addr tcpip.Address
+
+	// port is source port of the tuple.
+	port uint16
+
+	// protocol is network layer protocol.
+	protocol tcpip.NetworkProtocolNumber
+}
+
+// connTrackImmutable is the non-manipulatable part of the tuple.
+type connTrackImmutable struct {
+	// addr is destination address of the tuple.
+	addr tcpip.Address
+
+	// direction is direction (original or reply) of the tuple.
+	direction ctDirection
+
+	// port is destination port of the tuple.
+	port uint16
+
+	// protocol is transport layer protocol.
+	protocol tcpip.TransportProtocolNumber
+}
+
+// connTrackTuple represents the tuple which is created from the
+// packet.
+type connTrackTuple struct {
+	// dst is non-manipulatable part of the tuple.
+	dst connTrackImmutable
+
+	// src is manipulatable part of the tuple.
+	src connTrackMutable
+}
+
+// connTrackTupleHolder is the container of tuple and connection.
+type ConnTrackTupleHolder struct {
+	// conn is pointer to the connection tracking entry.
+	conn *connTrack
+
+	// tuple is original or reply tuple.
+	tuple connTrackTuple
+}
+
+// connTrack is the connection.
+type connTrack struct {
+	// originalTupleHolder contains tuple in original direction.
+	originalTupleHolder ConnTrackTupleHolder
+
+	// replyTupleHolder contains tuple in reply direction.
+	replyTupleHolder ConnTrackTupleHolder
+
+	// status indicates connection is new or established.
+	status connStatus
+
+	// timeout indicates the time connection should be active.
+	timeout time.Duration
+
+	// manip indicates if the packet should be manipulated.
+	manip manipType
+
+	// tcb is TCB control block. It is used to keep track of states
+	// of tcp connection.
+	tcb tcpconntrack.TCB
+
+	// tcbHook indicates if the packet is inbound or outbound to
+	// update the state of tcb.
+	tcbHook Hook
+}
+
+// ConnTrackTable contains a map of all existing connections created for
+// NAT rules.
+type ConnTrackTable struct {
+	// connMu protects connTrackTable.
+	connMu sync.RWMutex
+
+	// connTrackTable maintains a map of tuples needed for connection tracking
+	// for iptables NAT rules. The key for the map is an integer calculated
+	// using seed, source address, destination address, source port and
+	// destination port.
+	CtMap map[uint32]ConnTrackTupleHolder
+
+	// seed is a one-time random value initialized at stack startup
+	// and is used in calculation of hash key for connection tracking
+	// table.
+	Seed uint32
+}
+
+// packetToTuple converts packet to a tuple in original direction.
+func packetToTuple(pkt *PacketBuffer, hook Hook) (connTrackTuple, *tcpip.Error) {
+	var tuple connTrackTuple
+
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	// TODO(gvisor.dev/issue/170): Need to support for other
+	// protocols as well.
+	if netHeader == nil || netHeader.TransportProtocol() != header.TCPProtocolNumber {
+		return tuple, tcpip.ErrUnknownProtocol
+	}
+	tcpHeader := header.TCP(pkt.TransportHeader)
+	if tcpHeader == nil {
+		return tuple, tcpip.ErrUnknownProtocol
+	}
+
+	tuple.src.addr = netHeader.SourceAddress()
+	tuple.src.port = tcpHeader.SourcePort()
+	tuple.src.protocol = header.IPv4ProtocolNumber
+
+	tuple.dst.addr = netHeader.DestinationAddress()
+	tuple.dst.port = tcpHeader.DestinationPort()
+	tuple.dst.protocol = netHeader.TransportProtocol()
+
+	return tuple, nil
+}
+
+// getReplyTuple creates reply tuple for the given tuple.
+func getReplyTuple(tuple connTrackTuple) connTrackTuple {
+	var replyTuple connTrackTuple
+	replyTuple.src.addr = tuple.dst.addr
+	replyTuple.src.port = tuple.dst.port
+	replyTuple.src.protocol = tuple.src.protocol
+	replyTuple.dst.addr = tuple.src.addr
+	replyTuple.dst.port = tuple.src.port
+	replyTuple.dst.protocol = tuple.dst.protocol
+	replyTuple.dst.direction = dirReply
+
+	return replyTuple
+}
+
+// makeNewConn creates new connection.
+func makeNewConn(tuple, replyTuple connTrackTuple) connTrack {
+	var conn connTrack
+	conn.status = connNew
+	conn.originalTupleHolder.tuple = tuple
+	conn.originalTupleHolder.conn = &conn
+	conn.replyTupleHolder.tuple = replyTuple
+	conn.replyTupleHolder.conn = &conn
+
+	return conn
+}
+
+// getTupleHash returns hash of the tuple. The fields used for
+// generating hash are seed (generated once for stack), source address,
+// destination address, source port and destination ports.
+func (ct *ConnTrackTable) getTupleHash(tuple connTrackTuple) uint32 {
+	h := jenkins.Sum32(ct.Seed)
+	h.Write([]byte(tuple.src.addr))
+	h.Write([]byte(tuple.dst.addr))
+	portBuf := make([]byte, 2)
+	binary.LittleEndian.PutUint16(portBuf, tuple.src.port)
+	h.Write([]byte(portBuf))
+	binary.LittleEndian.PutUint16(portBuf, tuple.dst.port)
+	h.Write([]byte(portBuf))
+
+	return h.Sum32()
+}
+
+// connTrackForPacket returns connTrack for packet.
+// TODO(gvisor.dev/issue/170): Only TCP packets are supported. Need to support other
+// transport protocols.
+func (ct *ConnTrackTable) connTrackForPacket(pkt *PacketBuffer, hook Hook, createConn bool) (*connTrack, ctDirection) {
+	var dir ctDirection
+	tuple, err := packetToTuple(pkt, hook)
+	if err != nil {
+		return nil, dir
+	}
+
+	ct.connMu.Lock()
+	defer ct.connMu.Unlock()
+
+	connTrackTable := ct.CtMap
+	hash := ct.getTupleHash(tuple)
+
+	var conn *connTrack
+	switch createConn {
+	case true:
+		// If connection does not exist for the hash, create a new
+		// connection.
+		replyTuple := getReplyTuple(tuple)
+		replyHash := ct.getTupleHash(replyTuple)
+		newConn := makeNewConn(tuple, replyTuple)
+		conn = &newConn
+
+		// Add tupleHolders to the map.
+		// TODO(gvisor.dev/issue/170): Need to support collisions using linked list.
+		ct.CtMap[hash] = conn.originalTupleHolder
+		ct.CtMap[replyHash] = conn.replyTupleHolder
+	default:
+		tupleHolder, ok := connTrackTable[hash]
+		if !ok {
+			return nil, dir
+		}
+
+		// If this is the reply of new connection, set the connection
+		// status as ESTABLISHED.
+		conn = tupleHolder.conn
+		if conn.status == connNew && tupleHolder.tuple.dst.direction == dirReply {
+			conn.status = connEstablished
+		}
+		if tupleHolder.conn == nil {
+			panic("tupleHolder has null connection tracking entry")
+		}
+
+		dir = tupleHolder.tuple.dst.direction
+	}
+	return conn, dir
+}
+
+// SetNatInfo will manipulate the tuples according to iptables NAT rules.
+func (ct *ConnTrackTable) SetNatInfo(pkt *PacketBuffer, rt RedirectTarget, hook Hook) {
+	// Get the connection. Connection is always created before this
+	// function is called.
+	conn, _ := ct.connTrackForPacket(pkt, hook, false)
+	if conn == nil {
+		panic("connection should be created to manipulate tuples.")
+	}
+	replyTuple := conn.replyTupleHolder.tuple
+	replyHash := ct.getTupleHash(replyTuple)
+
+	// TODO(gvisor.dev/issue/170): Support only redirect of ports. Need to
+	// support changing of address for Prerouting.
+
+	// Change the port as per the iptables rule. This tuple will be used
+	// to manipulate the packet in HandlePacket.
+	conn.replyTupleHolder.tuple.src.addr = rt.MinIP
+	conn.replyTupleHolder.tuple.src.port = rt.MinPort
+	newHash := ct.getTupleHash(conn.replyTupleHolder.tuple)
+
+	// Add the changed tuple to the map.
+	ct.connMu.Lock()
+	defer ct.connMu.Unlock()
+	ct.CtMap[newHash] = conn.replyTupleHolder
+	if hook == Output {
+		conn.replyTupleHolder.conn.manip = manipDstOutput
+	}
+
+	// Delete the old tuple.
+	delete(ct.CtMap, replyHash)
+}
+
+// handlePacketPrerouting manipulates ports for packets in Prerouting hook.
+// TODO(gvisor.dev/issue/170): Change address for Prerouting hook..
+func handlePacketPrerouting(pkt *PacketBuffer, conn *connTrack, dir ctDirection) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	tcpHeader := header.TCP(pkt.TransportHeader)
+
+	// For prerouting redirection, packets going in the original direction
+	// have their destinations modified and replies have their sources
+	// modified.
+	switch dir {
+	case dirOriginal:
+		port := conn.replyTupleHolder.tuple.src.port
+		tcpHeader.SetDestinationPort(port)
+		netHeader.SetDestinationAddress(conn.replyTupleHolder.tuple.src.addr)
+	case dirReply:
+		port := conn.originalTupleHolder.tuple.dst.port
+		tcpHeader.SetSourcePort(port)
+		netHeader.SetSourceAddress(conn.originalTupleHolder.tuple.dst.addr)
+	}
+
+	netHeader.SetChecksum(0)
+	netHeader.SetChecksum(^netHeader.CalculateChecksum())
+}
+
+// handlePacketOutput manipulates ports for packets in Output hook.
+func handlePacketOutput(pkt *PacketBuffer, conn *connTrack, gso *GSO, r *Route, dir ctDirection) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	tcpHeader := header.TCP(pkt.TransportHeader)
+
+	// For output redirection, packets going in the original direction
+	// have their destinations modified and replies have their sources
+	// modified. For prerouting redirection, we only reach this point
+	// when replying, so packet sources are modified.
+	if conn.manip == manipDstOutput && dir == dirOriginal {
+		port := conn.replyTupleHolder.tuple.src.port
+		tcpHeader.SetDestinationPort(port)
+		netHeader.SetDestinationAddress(conn.replyTupleHolder.tuple.src.addr)
+	} else {
+		port := conn.originalTupleHolder.tuple.dst.port
+		tcpHeader.SetSourcePort(port)
+		netHeader.SetSourceAddress(conn.originalTupleHolder.tuple.dst.addr)
+	}
+
+	// Calculate the TCP checksum and set it.
+	tcpHeader.SetChecksum(0)
+	hdr := &pkt.Header
+	length := uint16(pkt.Data.Size()+hdr.UsedLength()) - uint16(netHeader.HeaderLength())
+	xsum := r.PseudoHeaderChecksum(header.TCPProtocolNumber, length)
+	if gso != nil && gso.NeedsCsum {
+		tcpHeader.SetChecksum(xsum)
+	} else if r.Capabilities()&CapabilityTXChecksumOffload == 0 {
+		xsum = header.ChecksumVVWithOffset(pkt.Data, xsum, int(tcpHeader.DataOffset()), pkt.Data.Size())
+		tcpHeader.SetChecksum(^tcpHeader.CalculateChecksum(xsum))
+	}
+
+	netHeader.SetChecksum(0)
+	netHeader.SetChecksum(^netHeader.CalculateChecksum())
+}
+
+// HandlePacket will manipulate the port and address of the packet if the
+// connection exists.
+func (ct *ConnTrackTable) HandlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Route) {
+	if pkt.NatDone {
+		return
+	}
+
+	if hook != Prerouting && hook != Output {
+		return
+	}
+
+	conn, dir := ct.connTrackForPacket(pkt, hook, false)
+	// Connection or Rule not found for the packet.
+	if conn == nil {
+		return
+	}
+
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	// TODO(gvisor.dev/issue/170): Need to support for other transport
+	// protocols as well.
+	if netHeader == nil || netHeader.TransportProtocol() != header.TCPProtocolNumber {
+		return
+	}
+
+	tcpHeader := header.TCP(pkt.TransportHeader)
+	if tcpHeader == nil {
+		return
+	}
+
+	switch hook {
+	case Prerouting:
+		handlePacketPrerouting(pkt, conn, dir)
+	case Output:
+		handlePacketOutput(pkt, conn, gso, r, dir)
+	}
+	pkt.NatDone = true
+
+	// Update the state of tcb.
+	// TODO(gvisor.dev/issue/170): Add support in tcpcontrack to handle
+	// other tcp states.
+	var st tcpconntrack.Result
+	if conn.tcb.IsEmpty() {
+		conn.tcb.Init(tcpHeader)
+		conn.tcbHook = hook
+	} else {
+		switch hook {
+		case conn.tcbHook:
+			st = conn.tcb.UpdateStateOutbound(tcpHeader)
+		default:
+			st = conn.tcb.UpdateStateInbound(tcpHeader)
+		}
+	}
+
+	// Delete conntrack if tcp connection is closed.
+	if st == tcpconntrack.ResultClosedByPeer || st == tcpconntrack.ResultClosedBySelf || st == tcpconntrack.ResultReset {
+		ct.deleteConnTrack(conn)
+	}
+}
+
+// deleteConnTrack deletes the connection.
+func (ct *ConnTrackTable) deleteConnTrack(conn *connTrack) {
+	if conn == nil {
+		return
+	}
+
+	tuple := conn.originalTupleHolder.tuple
+	hash := ct.getTupleHash(tuple)
+	replyTuple := conn.replyTupleHolder.tuple
+	replyHash := ct.getTupleHash(replyTuple)
+
+	ct.connMu.Lock()
+	defer ct.connMu.Unlock()
+
+	delete(ct.CtMap, hash)
+	delete(ct.CtMap, replyHash)
+}
diff --git a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
new file mode 100644
index 000000000..d199ded6a
--- /dev/null
+++ b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by "stringer -type DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT.
+
+package stack
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[DHCPv6NoConfiguration-1]
+	_ = x[DHCPv6ManagedAddress-2]
+	_ = x[DHCPv6OtherConfigurations-3]
+}
+
+const _DHCPv6ConfigurationFromNDPRA_name = "DHCPv6NoConfigurationDHCPv6ManagedAddressDHCPv6OtherConfigurations"
+
+var _DHCPv6ConfigurationFromNDPRA_index = [...]uint8{0, 21, 41, 66}
+
+func (i DHCPv6ConfigurationFromNDPRA) String() string {
+	i -= 1
+	if i < 0 || i >= DHCPv6ConfigurationFromNDPRA(len(_DHCPv6ConfigurationFromNDPRA_index)-1) {
+		return "DHCPv6ConfigurationFromNDPRA(" + strconv.FormatInt(int64(i+1), 10) + ")"
+	}
+	return _DHCPv6ConfigurationFromNDPRA_name[_DHCPv6ConfigurationFromNDPRA_index[i]:_DHCPv6ConfigurationFromNDPRA_index[i+1]]
+}
diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/forwarder.go
new file mode 100644
index 000000000..3eff141e6
--- /dev/null
+++ b/pkg/tcpip/stack/forwarder.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// maxPendingResolutions is the maximum number of pending link-address
+	// resolutions.
+	maxPendingResolutions          = 64
+	maxPendingPacketsPerResolution = 256
+)
+
+type pendingPacket struct {
+	nic   *NIC
+	route *Route
+	proto tcpip.NetworkProtocolNumber
+	pkt   *PacketBuffer
+}
+
+type forwardQueue struct {
+	sync.Mutex
+
+	// The packets to send once the resolver completes.
+	packets map[<-chan struct{}][]*pendingPacket
+
+	// FIFO of channels used to cancel the oldest goroutine waiting for
+	// link-address resolution.
+	cancelChans []chan struct{}
+}
+
+func newForwardQueue() *forwardQueue {
+	return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)}
+}
+
+func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	shouldWait := false
+
+	f.Lock()
+	packets, ok := f.packets[ch]
+	if !ok {
+		shouldWait = true
+	}
+	for len(packets) == maxPendingPacketsPerResolution {
+		p := packets[0]
+		packets = packets[1:]
+		p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+		p.route.Release()
+	}
+	if l := len(packets); l >= maxPendingPacketsPerResolution {
+		panic(fmt.Sprintf("max pending packets for resolution reached; got %d packets, max = %d", l, maxPendingPacketsPerResolution))
+	}
+	f.packets[ch] = append(packets, &pendingPacket{
+		nic:   n,
+		route: r,
+		proto: protocol,
+		pkt:   pkt,
+	})
+	f.Unlock()
+
+	if !shouldWait {
+		return
+	}
+
+	// Wait for the link-address resolution to complete.
+	// Start a goroutine with a forwarding-cancel channel so that we can
+	// limit the maximum number of goroutines running concurrently.
+	cancel := f.newCancelChannel()
+	go func() {
+		cancelled := false
+		select {
+		case <-ch:
+		case <-cancel:
+			cancelled = true
+		}
+
+		f.Lock()
+		packets := f.packets[ch]
+		delete(f.packets, ch)
+		f.Unlock()
+
+		for _, p := range packets {
+			if cancelled {
+				p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+			} else if _, err := p.route.Resolve(nil); err != nil {
+				p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+			} else {
+				p.nic.forwardPacket(p.route, p.proto, p.pkt)
+			}
+			p.route.Release()
+		}
+	}()
+}
+
+// newCancelChannel creates a channel that can cancel a pending forwarding
+// activity. The oldest channel is closed if the number of open channels would
+// exceed maxPendingResolutions.
+func (f *forwardQueue) newCancelChannel() chan struct{} {
+	f.Lock()
+	defer f.Unlock()
+
+	if len(f.cancelChans) == maxPendingResolutions {
+		ch := f.cancelChans[0]
+		f.cancelChans = f.cancelChans[1:]
+		close(ch)
+	}
+	if l := len(f.cancelChans); l >= maxPendingResolutions {
+		panic(fmt.Sprintf("max pending resolutions reached; got %d active resolutions, max = %d", l, maxPendingResolutions))
+	}
+
+	ch := make(chan struct{})
+	f.cancelChans = append(f.cancelChans, ch)
+	return ch
+}
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
new file mode 100644
index 000000000..a6546cef0
--- /dev/null
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -0,0 +1,650 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"encoding/binary"
+	"math"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+const (
+	fwdTestNetNumber           tcpip.NetworkProtocolNumber = math.MaxUint32
+	fwdTestNetHeaderLen                                    = 12
+	fwdTestNetDefaultPrefixLen                             = 8
+
+	// fwdTestNetDefaultMTU is the MTU, in bytes, used throughout the tests,
+	// except where another value is explicitly used. It is chosen to match
+	// the MTU of loopback interfaces on linux systems.
+	fwdTestNetDefaultMTU = 65536
+
+	dstAddrOffset        = 0
+	srcAddrOffset        = 1
+	protocolNumberOffset = 2
+)
+
+// fwdTestNetworkEndpoint is a network-layer protocol endpoint.
+// Headers of this protocol are fwdTestNetHeaderLen bytes, but we currently only
+// use the first three: destination address, source address, and transport
+// protocol. They're all one byte fields to simplify parsing.
+type fwdTestNetworkEndpoint struct {
+	nicID      tcpip.NICID
+	id         NetworkEndpointID
+	prefixLen  int
+	proto      *fwdTestNetworkProtocol
+	dispatcher TransportDispatcher
+	ep         LinkEndpoint
+}
+
+func (f *fwdTestNetworkEndpoint) MTU() uint32 {
+	return f.ep.MTU() - uint32(f.MaxHeaderLength())
+}
+
+func (f *fwdTestNetworkEndpoint) NICID() tcpip.NICID {
+	return f.nicID
+}
+
+func (f *fwdTestNetworkEndpoint) PrefixLen() int {
+	return f.prefixLen
+}
+
+func (*fwdTestNetworkEndpoint) DefaultTTL() uint8 {
+	return 123
+}
+
+func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
+	return &f.id
+}
+
+func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) {
+	// Dispatch the packet to the transport protocol.
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), pkt)
+}
+
+func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 {
+	return f.ep.MaxHeaderLength() + fwdTestNetHeaderLen
+}
+
+func (f *fwdTestNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 {
+	return 0
+}
+
+func (f *fwdTestNetworkEndpoint) Capabilities() LinkEndpointCapabilities {
+	return f.ep.Capabilities()
+}
+
+func (f *fwdTestNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return f.proto.Number()
+}
+
+func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
+	// Add the protocol's header to the packet and send it to the link
+	// endpoint.
+	b := pkt.Header.Prepend(fwdTestNetHeaderLen)
+	b[dstAddrOffset] = r.RemoteAddress[0]
+	b[srcAddrOffset] = f.id.LocalAddress[0]
+	b[protocolNumberOffset] = byte(params.Protocol)
+
+	return f.ep.WritePacket(r, gso, fwdTestNetNumber, pkt)
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+func (*fwdTestNetworkEndpoint) Close() {}
+
+// fwdTestNetworkProtocol is a network-layer protocol that implements Address
+// resolution.
+type fwdTestNetworkProtocol struct {
+	addrCache              *linkAddrCache
+	addrResolveDelay       time.Duration
+	onLinkAddressResolved  func(cache *linkAddrCache, addr tcpip.Address)
+	onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool)
+}
+
+func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber {
+	return fwdTestNetNumber
+}
+
+func (f *fwdTestNetworkProtocol) MinimumPacketSize() int {
+	return fwdTestNetHeaderLen
+}
+
+func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int {
+	return fwdTestNetDefaultPrefixLen
+}
+
+func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
+}
+
+func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	netHeader, ok := pkt.Data.PullUp(fwdTestNetHeaderLen)
+	if !ok {
+		return 0, false, false
+	}
+	pkt.NetworkHeader = netHeader
+	pkt.Data.TrimFront(fwdTestNetHeaderLen)
+	return tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), true, true
+}
+
+func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) {
+	return &fwdTestNetworkEndpoint{
+		nicID:      nicID,
+		id:         NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
+		prefixLen:  addrWithPrefix.PrefixLen,
+		proto:      f,
+		dispatcher: dispatcher,
+		ep:         ep,
+	}, nil
+}
+
+func (f *fwdTestNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func (f *fwdTestNetworkProtocol) Option(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func (f *fwdTestNetworkProtocol) Close() {}
+
+func (f *fwdTestNetworkProtocol) Wait() {}
+
+func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error {
+	if f.addrCache != nil && f.onLinkAddressResolved != nil {
+		time.AfterFunc(f.addrResolveDelay, func() {
+			f.onLinkAddressResolved(f.addrCache, addr)
+		})
+	}
+	return nil
+}
+
+func (f *fwdTestNetworkProtocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	if f.onResolveStaticAddress != nil {
+		return f.onResolveStaticAddress(addr)
+	}
+	return "", false
+}
+
+func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return fwdTestNetNumber
+}
+
+// fwdTestPacketInfo holds all the information about an outbound packet.
+type fwdTestPacketInfo struct {
+	RemoteLinkAddress tcpip.LinkAddress
+	LocalLinkAddress  tcpip.LinkAddress
+	Pkt               *PacketBuffer
+}
+
+type fwdTestLinkEndpoint struct {
+	dispatcher NetworkDispatcher
+	mtu        uint32
+	linkAddr   tcpip.LinkAddress
+
+	// C is where outbound packets are queued.
+	C chan fwdTestPacketInfo
+}
+
+// InjectInbound injects an inbound packet.
+func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	e.InjectLinkAddr(protocol, "", pkt)
+}
+
+// InjectLinkAddr injects an inbound packet with a remote link address.
+func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
+}
+
+// Attach saves the stack network-layer dispatcher for use later when packets
+// are injected.
+func (e *fwdTestLinkEndpoint) Attach(dispatcher NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *fwdTestLinkEndpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
+// during construction.
+func (e *fwdTestLinkEndpoint) MTU() uint32 {
+	return e.mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (e fwdTestLinkEndpoint) Capabilities() LinkEndpointCapabilities {
+	caps := LinkEndpointCapabilities(0)
+	return caps | CapabilityResolutionRequired
+}
+
+// GSOMaxSize returns the maximum GSO packet size.
+func (*fwdTestLinkEndpoint) GSOMaxSize() uint32 {
+	return 1 << 15
+}
+
+// MaxHeaderLength returns the maximum size of the link layer header. Given it
+// doesn't have a header, it just returns 0.
+func (*fwdTestLinkEndpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
+	return e.linkAddr
+}
+
+func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
+	p := fwdTestPacketInfo{
+		RemoteLinkAddress: r.RemoteLinkAddress,
+		LocalLinkAddress:  r.LocalLinkAddress,
+		Pkt:               pkt,
+	}
+
+	select {
+	case e.C <- p:
+	default:
+	}
+
+	return nil
+}
+
+// WritePackets stores outbound packets into the channel.
+func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	n := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		e.WritePacket(r, gso, protocol, pkt)
+		n++
+	}
+
+	return n, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	p := fwdTestPacketInfo{
+		Pkt: &PacketBuffer{Data: vv},
+	}
+
+	select {
+	case e.C <- p:
+	default:
+	}
+
+	return nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*fwdTestLinkEndpoint) Wait() {}
+
+func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *fwdTestLinkEndpoint) {
+	// Create a stack with the network protocol and two NICs.
+	s := New(Options{
+		NetworkProtocols: []NetworkProtocol{proto},
+	})
+
+	proto.addrCache = s.linkAddrCache
+
+	// Enable forwarding.
+	s.SetForwarding(true)
+
+	// NIC 1 has the link address "a", and added the network address 1.
+	ep1 = &fwdTestLinkEndpoint{
+		C:        make(chan fwdTestPacketInfo, 300),
+		mtu:      fwdTestNetDefaultMTU,
+		linkAddr: "a",
+	}
+	if err := s.CreateNIC(1, ep1); err != nil {
+		t.Fatal("CreateNIC #1 failed:", err)
+	}
+	if err := s.AddAddress(1, fwdTestNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress #1 failed:", err)
+	}
+
+	// NIC 2 has the link address "b", and added the network address 2.
+	ep2 = &fwdTestLinkEndpoint{
+		C:        make(chan fwdTestPacketInfo, 300),
+		mtu:      fwdTestNetDefaultMTU,
+		linkAddr: "b",
+	}
+	if err := s.CreateNIC(2, ep2); err != nil {
+		t.Fatal("CreateNIC #2 failed:", err)
+	}
+	if err := s.AddAddress(2, fwdTestNetNumber, "\x02"); err != nil {
+		t.Fatal("AddAddress #2 failed:", err)
+	}
+
+	// Route all packets to NIC 2.
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: 2}})
+	}
+
+	return ep1, ep2
+}
+
+func TestForwardingWithStaticResolver(t *testing.T) {
+	// Create a network protocol with a static resolver.
+	proto := &fwdTestNetworkProtocol{
+		onResolveStaticAddress:
+		// The network address 3 is resolved to the link address "c".
+		func(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+			if addr == "\x03" {
+				return "c", true
+			}
+			return "", false
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	default:
+		t.Fatal("packet not forwarded")
+	}
+
+	// Test that the static address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithFakeResolver(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any address will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	case <-time.After(time.Second):
+		t.Fatal("packet not forwarded")
+	}
+
+	// Test that the address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithNoResolver(t *testing.T) {
+	// Create a network protocol without a resolver.
+	proto := &fwdTestNetworkProtocol{}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	select {
+	case <-ep2.C:
+		t.Fatal("Packet should not be forwarded")
+	case <-time.After(time.Second):
+	}
+}
+
+func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Only packets to address 3 will be resolved to the
+			// link address "c".
+			if addr == "\x03" {
+				cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+			}
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 4 on NIC 1. This packet should
+	// not be forwarded.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 4
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf = buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	case <-time.After(time.Second):
+		t.Fatal("packet not forwarded")
+	}
+
+	if p.Pkt.NetworkHeader[dstAddrOffset] != 3 {
+		t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", p.Pkt.NetworkHeader[dstAddrOffset])
+	}
+
+	// Test that the address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject two inbound packets to address 3 on NIC 1.
+	for i := 0; i < 2; i++ {
+		buf := buffer.NewView(30)
+		buf[dstAddrOffset] = 3
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < 2; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		if p.Pkt.NetworkHeader[dstAddrOffset] != 3 {
+			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", p.Pkt.NetworkHeader[dstAddrOffset])
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
+
+func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
+		// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
+		buf := buffer.NewView(30)
+		buf[dstAddrOffset] = 3
+		// Set the packet sequence number.
+		binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < maxPendingPacketsPerResolution; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		if b := p.Pkt.Header.View(); b[dstAddrOffset] != 3 {
+			t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset])
+		}
+		seqNumBuf, ok := p.Pkt.Data.PullUp(2) // The sequence number is a uint16 (2 bytes).
+		if !ok {
+			t.Fatalf("p.Pkt.Data is too short to hold a sequence number: %d", p.Pkt.Data.Size())
+		}
+
+		// The first 5 packets should not be forwarded so the sequence number should
+		// start with 5.
+		want := uint16(i + 5)
+		if n := binary.BigEndian.Uint16(seqNumBuf); n != want {
+			t.Fatalf("got the packet #%d, want = #%d", n, want)
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
+
+func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	for i := 0; i < maxPendingResolutions+5; i++ {
+		// Inject inbound 'maxPendingResolutions + 5' packets on NIC 1.
+		// Each packet has a different destination address (3 to
+		// maxPendingResolutions + 7).
+		buf := buffer.NewView(30)
+		buf[dstAddrOffset] = byte(3 + i)
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < maxPendingResolutions; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		// The first 5 packets (address 3 to 7) should not be forwarded
+		// because their address resolutions are interrupted.
+		if p.Pkt.NetworkHeader[dstAddrOffset] < 8 {
+			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", p.Pkt.NetworkHeader[dstAddrOffset])
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
new file mode 100644
index 000000000..4e9b404c8
--- /dev/null
+++ b/pkg/tcpip/stack/iptables.go
@@ -0,0 +1,366 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// Table names.
+const (
+	TablenameNat    = "nat"
+	TablenameMangle = "mangle"
+	TablenameFilter = "filter"
+)
+
+// Chain names as defined by net/ipv4/netfilter/ip_tables.c.
+const (
+	ChainNamePrerouting  = "PREROUTING"
+	ChainNameInput       = "INPUT"
+	ChainNameForward     = "FORWARD"
+	ChainNameOutput      = "OUTPUT"
+	ChainNamePostrouting = "POSTROUTING"
+)
+
+// HookUnset indicates that there is no hook set for an entrypoint or
+// underflow.
+const HookUnset = -1
+
+// DefaultTables returns a default set of tables. Each chain is set to accept
+// all packets.
+func DefaultTables() *IPTables {
+	// TODO(gvisor.dev/issue/170): We may be able to swap out some strings for
+	// iotas.
+	return &IPTables{
+		tables: map[string]Table{
+			TablenameNat: Table{
+				Rules: []Rule{
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Prerouting:  0,
+					Input:       1,
+					Output:      2,
+					Postrouting: 3,
+				},
+				Underflows: map[Hook]int{
+					Prerouting:  0,
+					Input:       1,
+					Output:      2,
+					Postrouting: 3,
+				},
+				UserChains: map[string]int{},
+			},
+			TablenameMangle: Table{
+				Rules: []Rule{
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Prerouting: 0,
+					Output:     1,
+				},
+				Underflows: map[Hook]int{
+					Prerouting: 0,
+					Output:     1,
+				},
+				UserChains: map[string]int{},
+			},
+			TablenameFilter: Table{
+				Rules: []Rule{
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: AcceptTarget{}},
+					Rule{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Input:   0,
+					Forward: 1,
+					Output:  2,
+				},
+				Underflows: map[Hook]int{
+					Input:   0,
+					Forward: 1,
+					Output:  2,
+				},
+				UserChains: map[string]int{},
+			},
+		},
+		priorities: map[Hook][]string{
+			Input:      []string{TablenameNat, TablenameFilter},
+			Prerouting: []string{TablenameMangle, TablenameNat},
+			Output:     []string{TablenameMangle, TablenameNat, TablenameFilter},
+		},
+		connections: ConnTrackTable{
+			CtMap: make(map[uint32]ConnTrackTupleHolder),
+			Seed:  generateRandUint32(),
+		},
+	}
+}
+
+// EmptyFilterTable returns a Table with no rules and the filter table chains
+// mapped to HookUnset.
+func EmptyFilterTable() Table {
+	return Table{
+		Rules: []Rule{},
+		BuiltinChains: map[Hook]int{
+			Input:   HookUnset,
+			Forward: HookUnset,
+			Output:  HookUnset,
+		},
+		Underflows: map[Hook]int{
+			Input:   HookUnset,
+			Forward: HookUnset,
+			Output:  HookUnset,
+		},
+		UserChains: map[string]int{},
+	}
+}
+
+// EmptyNatTable returns a Table with no rules and the filter table chains
+// mapped to HookUnset.
+func EmptyNatTable() Table {
+	return Table{
+		Rules: []Rule{},
+		BuiltinChains: map[Hook]int{
+			Prerouting:  HookUnset,
+			Input:       HookUnset,
+			Output:      HookUnset,
+			Postrouting: HookUnset,
+		},
+		Underflows: map[Hook]int{
+			Prerouting:  HookUnset,
+			Input:       HookUnset,
+			Output:      HookUnset,
+			Postrouting: HookUnset,
+		},
+		UserChains: map[string]int{},
+	}
+}
+
+// GetTable returns table by name.
+func (it *IPTables) GetTable(name string) (Table, bool) {
+	it.mu.RLock()
+	defer it.mu.RUnlock()
+	t, ok := it.tables[name]
+	return t, ok
+}
+
+// ReplaceTable replaces or inserts table by name.
+func (it *IPTables) ReplaceTable(name string, table Table) {
+	it.mu.Lock()
+	defer it.mu.Unlock()
+	it.tables[name] = table
+}
+
+// ModifyTables acquires write-lock and calls fn with internal name-to-table
+// map. This function can be used to update multiple tables atomically.
+func (it *IPTables) ModifyTables(fn func(map[string]Table)) {
+	it.mu.Lock()
+	defer it.mu.Unlock()
+	fn(it.tables)
+}
+
+// GetPriorities returns slice of priorities associated with hook.
+func (it *IPTables) GetPriorities(hook Hook) []string {
+	it.mu.RLock()
+	defer it.mu.RUnlock()
+	return it.priorities[hook]
+}
+
+// A chainVerdict is what a table decides should be done with a packet.
+type chainVerdict int
+
+const (
+	// chainAccept indicates the packet should continue through netstack.
+	chainAccept chainVerdict = iota
+
+	// chainAccept indicates the packet should be dropped.
+	chainDrop
+
+	// chainReturn indicates the packet should return to the calling chain
+	// or the underflow rule of a builtin chain.
+	chainReturn
+)
+
+// Check runs pkt through the rules for hook. It returns true when the packet
+// should continue traversing the network stack and false when it should be
+// dropped.
+//
+// Precondition: pkt.NetworkHeader is set.
+func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool {
+	// Packets are manipulated only if connection and matching
+	// NAT rule exists.
+	it.connections.HandlePacket(pkt, hook, gso, r)
+
+	// Go through each table containing the hook.
+	for _, tablename := range it.GetPriorities(hook) {
+		table, _ := it.GetTable(tablename)
+		ruleIdx := table.BuiltinChains[hook]
+		switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
+		// If the table returns Accept, move on to the next table.
+		case chainAccept:
+			continue
+		// The Drop verdict is final.
+		case chainDrop:
+			return false
+		case chainReturn:
+			// Any Return from a built-in chain means we have to
+			// call the underflow.
+			underflow := table.Rules[table.Underflows[hook]]
+			switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, address); v {
+			case RuleAccept:
+				continue
+			case RuleDrop:
+				return false
+			case RuleJump, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
+		default:
+			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
+		}
+	}
+
+	// Every table returned Accept.
+	return true
+}
+
+// CheckPackets runs pkts through the rules for hook and returns a map of packets that
+// should not go forward.
+//
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+//
+// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
+// precondition.
+//
+// NOTE: unlike the Check API the returned map contains packets that should be
+// dropped.
+func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *Route, nicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) {
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if !pkt.NatDone {
+			if ok := it.Check(hook, pkt, gso, r, "", nicName); !ok {
+				if drop == nil {
+					drop = make(map[*PacketBuffer]struct{})
+				}
+				drop[pkt] = struct{}{}
+			}
+			if pkt.NatDone {
+				if natPkts == nil {
+					natPkts = make(map[*PacketBuffer]struct{})
+				}
+				natPkts[pkt] = struct{}{}
+			}
+		}
+	}
+	return drop, natPkts
+}
+
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pkt.NetworkHeader will always be set as a
+// precondition.
+func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict {
+	// Start from ruleIdx and walk the list of rules until a rule gives us
+	// a verdict.
+	for ruleIdx < len(table.Rules) {
+		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
+		case RuleAccept:
+			return chainAccept
+
+		case RuleDrop:
+			return chainDrop
+
+		case RuleReturn:
+			return chainReturn
+
+		case RuleJump:
+			// "Jumping" to the next rule just means we're
+			// continuing on down the list.
+			if jumpTo == ruleIdx+1 {
+				ruleIdx++
+				continue
+			}
+			switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address, nicName); verdict {
+			case chainAccept:
+				return chainAccept
+			case chainDrop:
+				return chainDrop
+			case chainReturn:
+				ruleIdx++
+				continue
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", verdict))
+			}
+
+		default:
+			panic(fmt.Sprintf("Unknown verdict: %d", verdict))
+		}
+
+	}
+
+	// We got through the entire table without a decision. Default to DROP
+	// for safety.
+	return chainDrop
+}
+
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pkt.NetworkHeader will always be set as a
+// precondition.
+func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) {
+	rule := table.Rules[ruleIdx]
+
+	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
+	// pkt.Data.
+	if pkt.NetworkHeader == nil {
+		var ok bool
+		pkt.NetworkHeader, ok = pkt.Data.PullUp(header.IPv4MinimumSize)
+		if !ok {
+			// Precondition has been violated.
+			panic(fmt.Sprintf("iptables checks require IPv4 headers of at least %d bytes", header.IPv4MinimumSize))
+		}
+	}
+
+	// Check whether the packet matches the IP header filter.
+	if !rule.Filter.match(header.IPv4(pkt.NetworkHeader), hook, nicName) {
+		// Continue on to the next rule.
+		return RuleJump, ruleIdx + 1
+	}
+
+	// Go through each rule matcher. If they all match, run
+	// the rule target.
+	for _, matcher := range rule.Matchers {
+		matches, hotdrop := matcher.Match(hook, pkt, "")
+		if hotdrop {
+			return RuleDrop, 0
+		}
+		if !matches {
+			// Continue on to the next rule.
+			return RuleJump, ruleIdx + 1
+		}
+	}
+
+	// All the matchers matched, so run the target.
+	return rule.Target.Action(pkt, &it.connections, hook, gso, r, address)
+}
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
new file mode 100644
index 000000000..92e31643e
--- /dev/null
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -0,0 +1,165 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// AcceptTarget accepts packets.
+type AcceptTarget struct{}
+
+// Action implements Target.Action.
+func (AcceptTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	return RuleAccept, 0
+}
+
+// DropTarget drops packets.
+type DropTarget struct{}
+
+// Action implements Target.Action.
+func (DropTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	return RuleDrop, 0
+}
+
+// ErrorTarget logs an error and drops the packet. It represents a target that
+// should be unreachable.
+type ErrorTarget struct{}
+
+// Action implements Target.Action.
+func (ErrorTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	log.Debugf("ErrorTarget triggered.")
+	return RuleDrop, 0
+}
+
+// UserChainTarget marks a rule as the beginning of a user chain.
+type UserChainTarget struct {
+	Name string
+}
+
+// Action implements Target.Action.
+func (UserChainTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	panic("UserChainTarget should never be called.")
+}
+
+// ReturnTarget returns from the current chain. If the chain is a built-in, the
+// hook's underflow should be called.
+type ReturnTarget struct{}
+
+// Action implements Target.Action.
+func (ReturnTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	return RuleReturn, 0
+}
+
+// RedirectTarget redirects the packet by modifying the destination port/IP.
+// Min and Max values for IP and Ports in the struct indicate the range of
+// values which can be used to redirect.
+type RedirectTarget struct {
+	// TODO(gvisor.dev/issue/170): Other flags need to be added after
+	// we support them.
+	// RangeProtoSpecified flag indicates single port is specified to
+	// redirect.
+	RangeProtoSpecified bool
+
+	// MinIP indicates address used to redirect.
+	MinIP tcpip.Address
+
+	// MaxIP indicates address used to redirect.
+	MaxIP tcpip.Address
+
+	// MinPort indicates port used to redirect.
+	MinPort uint16
+
+	// MaxPort indicates port used to redirect.
+	MaxPort uint16
+}
+
+// Action implements Target.Action.
+// TODO(gvisor.dev/issue/170): Parse headers without copying. The current
+// implementation only works for PREROUTING and calls pkt.Clone(), neither
+// of which should be the case.
+func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrackTable, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) {
+	// Packet is already manipulated.
+	if pkt.NatDone {
+		return RuleAccept, 0
+	}
+
+	// Drop the packet if network and transport header are not set.
+	if pkt.NetworkHeader == nil || pkt.TransportHeader == nil {
+		return RuleDrop, 0
+	}
+
+	// Change the address to localhost (127.0.0.1) in Output and
+	// to primary address of the incoming interface in Prerouting.
+	switch hook {
+	case Output:
+		rt.MinIP = tcpip.Address([]byte{127, 0, 0, 1})
+		rt.MaxIP = tcpip.Address([]byte{127, 0, 0, 1})
+	case Prerouting:
+		rt.MinIP = address
+		rt.MaxIP = address
+	default:
+		panic("redirect target is supported only on output and prerouting hooks")
+	}
+
+	// TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if
+	// we need to change dest address (for OUTPUT chain) or ports.
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	switch protocol := netHeader.TransportProtocol(); protocol {
+	case header.UDPProtocolNumber:
+		udpHeader := header.UDP(pkt.TransportHeader)
+		udpHeader.SetDestinationPort(rt.MinPort)
+
+		// Calculate UDP checksum and set it.
+		if hook == Output {
+			udpHeader.SetChecksum(0)
+			hdr := &pkt.Header
+			length := uint16(pkt.Data.Size()+hdr.UsedLength()) - uint16(netHeader.HeaderLength())
+
+			// Only calculate the checksum if offloading isn't supported.
+			if r.Capabilities()&CapabilityTXChecksumOffload == 0 {
+				xsum := r.PseudoHeaderChecksum(protocol, length)
+				for _, v := range pkt.Data.Views() {
+					xsum = header.Checksum(v, xsum)
+				}
+				udpHeader.SetChecksum(0)
+				udpHeader.SetChecksum(^udpHeader.CalculateChecksum(xsum))
+			}
+		}
+		// Change destination address.
+		netHeader.SetDestinationAddress(rt.MinIP)
+		netHeader.SetChecksum(0)
+		netHeader.SetChecksum(^netHeader.CalculateChecksum())
+		pkt.NatDone = true
+	case header.TCPProtocolNumber:
+		if ct == nil {
+			return RuleAccept, 0
+		}
+
+		// Set up conection for matching NAT rule.
+		// Only the first packet of the connection comes here.
+		// Other packets will be manipulated in connection tracking.
+		if conn, _ := ct.connTrackForPacket(pkt, hook, true); conn != nil {
+			ct.SetNatInfo(pkt, rt, hook)
+			ct.HandlePacket(pkt, hook, gso, r)
+		}
+	default:
+		return RuleDrop, 0
+	}
+
+	return RuleAccept, 0
+}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/stack/iptables_types.go
index 5735d001b..4a6a5c6f1 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -12,10 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package iptables
+package stack
 
 import (
+	"strings"
+	"sync"
+
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 // A Hook specifies one of the hooks built into the network stack.
@@ -56,17 +60,6 @@ const (
 	NumHooks
 )
 
-// A TableVerdict is what a table decides should be done with a packet.
-type TableVerdict int
-
-const (
-	// TableAccept indicates the packet should continue through netstack.
-	TableAccept TableVerdict = iota
-
-	// TableAccept indicates the packet should be dropped.
-	TableDrop
-)
-
 // A RuleVerdict is what a rule decides should be done with a packet.
 type RuleVerdict int
 
@@ -74,25 +67,31 @@ const (
 	// RuleAccept indicates the packet should continue through netstack.
 	RuleAccept RuleVerdict = iota
 
-	// RuleContinue indicates the packet should continue to the next rule.
-	RuleContinue
-
 	// RuleDrop indicates the packet should be dropped.
 	RuleDrop
 
+	// RuleJump indicates the packet should jump to another chain.
+	RuleJump
+
 	// RuleReturn indicates the packet should return to the previous chain.
 	RuleReturn
 )
 
 // IPTables holds all the tables for a netstack.
 type IPTables struct {
-	// Tables maps table names to tables. User tables have arbitrary names.
-	Tables map[string]Table
+	// mu protects tables and priorities.
+	mu sync.RWMutex
 
-	// Priorities maps each hook to a list of table names. The order of the
+	// tables maps table names to tables. User tables have arbitrary names. mu
+	// needs to be locked for accessing.
+	tables map[string]Table
+
+	// priorities maps each hook to a list of table names. The order of the
 	// list is the order in which each table should be visited for that
-	// hook.
-	Priorities map[Hook][]string
+	// hook. mu needs to be locked for accessing.
+	priorities map[Hook][]string
+
+	connections ConnTrackTable
 }
 
 // A Table defines a set of chains and hooks into the network stack. It is
@@ -155,6 +154,90 @@ type Rule struct {
 type IPHeaderFilter struct {
 	// Protocol matches the transport protocol.
 	Protocol tcpip.TransportProtocolNumber
+
+	// Dst matches the destination IP address.
+	Dst tcpip.Address
+
+	// DstMask masks bits of the destination IP address when comparing with
+	// Dst.
+	DstMask tcpip.Address
+
+	// DstInvert inverts the meaning of the destination IP check, i.e. when
+	// true the filter will match packets that fail the destination
+	// comparison.
+	DstInvert bool
+
+	// Src matches the source IP address.
+	Src tcpip.Address
+
+	// SrcMask masks bits of the source IP address when comparing with Src.
+	SrcMask tcpip.Address
+
+	// SrcInvert inverts the meaning of the source IP check, i.e. when true the
+	// filter will match packets that fail the source comparison.
+	SrcInvert bool
+
+	// OutputInterface matches the name of the outgoing interface for the
+	// packet.
+	OutputInterface string
+
+	// OutputInterfaceMask masks the characters of the interface name when
+	// comparing with OutputInterface.
+	OutputInterfaceMask string
+
+	// OutputInterfaceInvert inverts the meaning of outgoing interface check,
+	// i.e. when true the filter will match packets that fail the outgoing
+	// interface comparison.
+	OutputInterfaceInvert bool
+}
+
+// match returns whether hdr matches the filter.
+func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool {
+	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
+	// Check the transport protocol.
+	if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() {
+		return false
+	}
+
+	// Check the source and destination IPs.
+	if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) {
+		return false
+	}
+
+	// Check the output interface.
+	// TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING
+	// hooks after supported.
+	if hook == Output {
+		n := len(fl.OutputInterface)
+		if n == 0 {
+			return true
+		}
+
+		// If the interface name ends with '+', any interface which begins
+		// with the name should be matched.
+		ifName := fl.OutputInterface
+		matches := true
+		if strings.HasSuffix(ifName, "+") {
+			matches = strings.HasPrefix(nicName, ifName[:n-1])
+		} else {
+			matches = nicName == ifName
+		}
+		return fl.OutputInterfaceInvert != matches
+	}
+
+	return true
+}
+
+// filterAddress returns whether addr matches the filter.
+func filterAddress(addr, mask, filterAddr tcpip.Address, invert bool) bool {
+	matches := true
+	for i := range filterAddr {
+		if addr[i]&mask[i] != filterAddr[i] {
+			matches = false
+			break
+		}
+	}
+	return matches != invert
 }
 
 // A Matcher is the interface for matching packets.
@@ -167,13 +250,13 @@ type Matcher interface {
 	// used for suspicious packets.
 	//
 	// Precondition: packet.NetworkHeader is set.
-	Match(hook Hook, packet tcpip.PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
+	Match(hook Hook, packet *PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
 }
 
 // A Target is the interface for taking an action for a packet.
 type Target interface {
 	// Action takes an action on the packet and returns a verdict on how
 	// traversal should (or should not) continue. If the return value is
-	// Jump, it also returns the name of the chain to jump to.
-	Action(packet tcpip.PacketBuffer) (RuleVerdict, string)
+	// Jump, it also returns the index of the rule to jump to.
+	Action(packet *PacketBuffer, connections *ConnTrackTable, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int)
 }
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 045409bda..ae7a8f740 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -15,6 +15,7 @@
 package stack
 
 import (
+	"fmt"
 	"log"
 	"math/rand"
 	"time"
@@ -118,6 +119,36 @@ const (
 	// identifier (IID) is 64 bits and an IPv6 address is 128 bits, so
 	// 128 - 64 = 64.
 	validPrefixLenForAutoGen = 64
+
+	// defaultAutoGenTempGlobalAddresses is the default configuration for whether
+	// or not to generate temporary SLAAC addresses.
+	defaultAutoGenTempGlobalAddresses = true
+
+	// defaultMaxTempAddrValidLifetime is the default maximum valid lifetime
+	// for temporary SLAAC addresses generated as part of RFC 4941.
+	//
+	// Default = 7 days (from RFC 4941 section 5).
+	defaultMaxTempAddrValidLifetime = 7 * 24 * time.Hour
+
+	// defaultMaxTempAddrPreferredLifetime is the default preferred lifetime
+	// for temporary SLAAC addresses generated as part of RFC 4941.
+	//
+	// Default = 1 day (from RFC 4941 section 5).
+	defaultMaxTempAddrPreferredLifetime = 24 * time.Hour
+
+	// defaultRegenAdvanceDuration is the default duration before the deprecation
+	// of a temporary address when a new address will be generated.
+	//
+	// Default = 5s (from RFC 4941 section 5).
+	defaultRegenAdvanceDuration = 5 * time.Second
+
+	// minRegenAdvanceDuration is the minimum duration before the deprecation
+	// of a temporary address when a new address will be generated.
+	minRegenAdvanceDuration = time.Duration(0)
+
+	// maxSLAACAddrLocalRegenAttempts is the maximum number of times to attempt
+	// SLAAC address regenerations in response to a NIC-local conflict.
+	maxSLAACAddrLocalRegenAttempts = 10
 )
 
 var (
@@ -130,6 +161,37 @@ var (
 	//
 	// Min = 2hrs.
 	MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour
+
+	// MaxDesyncFactor is the upper bound for the preferred lifetime's desync
+	// factor for temporary SLAAC addresses.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// Must be greater than 0.
+	//
+	// Max = 10m (from RFC 4941 section 5).
+	MaxDesyncFactor = 10 * time.Minute
+
+	// MinMaxTempAddrPreferredLifetime is the minimum value allowed for the
+	// maximum preferred lifetime for temporary SLAAC addresses.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// This value guarantees that a temporary address will be preferred for at
+	// least 1hr if the SLAAC prefix is valid for at least that time.
+	MinMaxTempAddrPreferredLifetime = defaultRegenAdvanceDuration + MaxDesyncFactor + time.Hour
+
+	// MinMaxTempAddrValidLifetime is the minimum value allowed for the
+	// maximum valid lifetime for temporary SLAAC addresses.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// This value guarantees that a temporary address will be valid for at least
+	// 2hrs if the SLAAC prefix is valid for at least that time.
+	MinMaxTempAddrValidLifetime = 2 * time.Hour
 )
 
 // DHCPv6ConfigurationFromNDPRA is a configuration available via DHCPv6 that an
@@ -137,9 +199,11 @@ var (
 type DHCPv6ConfigurationFromNDPRA int
 
 const (
+	_ DHCPv6ConfigurationFromNDPRA = iota
+
 	// DHCPv6NoConfiguration indicates that no configurations are available via
 	// DHCPv6.
-	DHCPv6NoConfiguration DHCPv6ConfigurationFromNDPRA = iota
+	DHCPv6NoConfiguration
 
 	// DHCPv6ManagedAddress indicates that addresses are available via DHCPv6.
 	//
@@ -240,12 +304,19 @@ type NDPDispatcher interface {
 	// call functions on the stack itself.
 	OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration)
 
+	// OnDNSSearchListOption will be called when an NDP option with a DNS
+	// search list has been received.
+	//
+	// It is up to the caller to use the domain names in the search list
+	// for only their valid lifetime. OnDNSSearchListOption may be called
+	// with new or already known domain names. If called with known domain
+	// names, their valid lifetimes must be refreshed to lifetime (it may
+	// be increased, decreased or completely invalidated when lifetime = 0.
+	OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration)
+
 	// OnDHCPv6Configuration will be called with an updated configuration that is
 	// available via DHCPv6 for a specified NIC.
 	//
-	// NDPDispatcher assumes that the initial configuration available by DHCPv6 is
-	// DHCPv6NoConfiguration.
-	//
 	// This function is not permitted to block indefinitely. It must not
 	// call functions on the stack itself.
 	OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA)
@@ -304,35 +375,58 @@ type NDPConfigurations struct {
 	// lifetime(s) of the generated address changes; this option only
 	// affects the generation of new addresses as part of SLAAC.
 	AutoGenGlobalAddresses bool
+
+	// AutoGenAddressConflictRetries determines how many times to attempt to retry
+	// generation of a permanent auto-generated address in response to DAD
+	// conflicts.
+	//
+	// If the method used to generate the address does not support creating
+	// alternative addresses (e.g. IIDs based on the modified EUI64 of a NIC's
+	// MAC address), then no attempt will be made to resolve the conflict.
+	AutoGenAddressConflictRetries uint8
+
+	// AutoGenTempGlobalAddresses determines whether or not temporary SLAAC
+	// addresses will be generated for a NIC as part of SLAAC privacy extensions,
+	// RFC 4941.
+	//
+	// Ignored if AutoGenGlobalAddresses is false.
+	AutoGenTempGlobalAddresses bool
+
+	// MaxTempAddrValidLifetime is the maximum valid lifetime for temporary
+	// SLAAC addresses.
+	MaxTempAddrValidLifetime time.Duration
+
+	// MaxTempAddrPreferredLifetime is the maximum preferred lifetime for
+	// temporary SLAAC addresses.
+	MaxTempAddrPreferredLifetime time.Duration
+
+	// RegenAdvanceDuration is the duration before the deprecation of a temporary
+	// address when a new address will be generated.
+	RegenAdvanceDuration time.Duration
 }
 
 // DefaultNDPConfigurations returns an NDPConfigurations populated with
 // default values.
 func DefaultNDPConfigurations() NDPConfigurations {
 	return NDPConfigurations{
-		DupAddrDetectTransmits:  defaultDupAddrDetectTransmits,
-		RetransmitTimer:         defaultRetransmitTimer,
-		MaxRtrSolicitations:     defaultMaxRtrSolicitations,
-		RtrSolicitationInterval: defaultRtrSolicitationInterval,
-		MaxRtrSolicitationDelay: defaultMaxRtrSolicitationDelay,
-		HandleRAs:               defaultHandleRAs,
-		DiscoverDefaultRouters:  defaultDiscoverDefaultRouters,
-		DiscoverOnLinkPrefixes:  defaultDiscoverOnLinkPrefixes,
-		AutoGenGlobalAddresses:  defaultAutoGenGlobalAddresses,
+		DupAddrDetectTransmits:       defaultDupAddrDetectTransmits,
+		RetransmitTimer:              defaultRetransmitTimer,
+		MaxRtrSolicitations:          defaultMaxRtrSolicitations,
+		RtrSolicitationInterval:      defaultRtrSolicitationInterval,
+		MaxRtrSolicitationDelay:      defaultMaxRtrSolicitationDelay,
+		HandleRAs:                    defaultHandleRAs,
+		DiscoverDefaultRouters:       defaultDiscoverDefaultRouters,
+		DiscoverOnLinkPrefixes:       defaultDiscoverOnLinkPrefixes,
+		AutoGenGlobalAddresses:       defaultAutoGenGlobalAddresses,
+		AutoGenTempGlobalAddresses:   defaultAutoGenTempGlobalAddresses,
+		MaxTempAddrValidLifetime:     defaultMaxTempAddrValidLifetime,
+		MaxTempAddrPreferredLifetime: defaultMaxTempAddrPreferredLifetime,
+		RegenAdvanceDuration:         defaultRegenAdvanceDuration,
 	}
 }
 
 // validate modifies an NDPConfigurations with valid values. If invalid values
 // are present in c, the corresponding default values will be used instead.
-//
-// If RetransmitTimer is less than minimumRetransmitTimer, then a value of
-// defaultRetransmitTimer will be used.
-//
-// If RtrSolicitationInterval is less than minimumRtrSolicitationInterval, then
-// a value of defaultRtrSolicitationInterval will be used.
-//
-// If MaxRtrSolicitationDelay is less than minimumMaxRtrSolicitationDelay, then
-// a value of defaultMaxRtrSolicitationDelay will be used.
 func (c *NDPConfigurations) validate() {
 	if c.RetransmitTimer < minimumRetransmitTimer {
 		c.RetransmitTimer = defaultRetransmitTimer
@@ -345,6 +439,18 @@ func (c *NDPConfigurations) validate() {
 	if c.MaxRtrSolicitationDelay < minimumMaxRtrSolicitationDelay {
 		c.MaxRtrSolicitationDelay = defaultMaxRtrSolicitationDelay
 	}
+
+	if c.MaxTempAddrValidLifetime < MinMaxTempAddrValidLifetime {
+		c.MaxTempAddrValidLifetime = MinMaxTempAddrValidLifetime
+	}
+
+	if c.MaxTempAddrPreferredLifetime < MinMaxTempAddrPreferredLifetime || c.MaxTempAddrPreferredLifetime > c.MaxTempAddrValidLifetime {
+		c.MaxTempAddrPreferredLifetime = MinMaxTempAddrPreferredLifetime
+	}
+
+	if c.RegenAdvanceDuration < minRegenAdvanceDuration {
+		c.RegenAdvanceDuration = minRegenAdvanceDuration
+	}
 }
 
 // ndpState is the per-interface NDP state.
@@ -361,19 +467,27 @@ type ndpState struct {
 	// The default routers discovered through Router Advertisements.
 	defaultRouters map[tcpip.Address]defaultRouterState
 
+	// The timer used to send the next router solicitation message.
+	rtrSolicitTimer *time.Timer
+
 	// The on-link prefixes discovered through Router Advertisements' Prefix
 	// Information option.
 	onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState
 
-	// The timer used to send the next router solicitation message.
-	// If routers are being solicited, rtrSolicitTimer MUST NOT be nil.
-	rtrSolicitTimer *time.Timer
-
-	// The addresses generated by SLAAC.
-	autoGenAddresses map[tcpip.Address]autoGenAddressState
+	// The SLAAC prefixes discovered through Router Advertisements' Prefix
+	// Information option.
+	slaacPrefixes map[tcpip.Subnet]slaacPrefixState
 
 	// The last learned DHCPv6 configuration from an NDP RA.
 	dhcpv6Configuration DHCPv6ConfigurationFromNDPRA
+
+	// temporaryIIDHistory is the history value used to generate a new temporary
+	// IID.
+	temporaryIIDHistory [header.IIDSize]byte
+
+	// temporaryAddressDesyncFactor is the preferred lifetime's desync factor for
+	// temporary SLAAC addresses.
+	temporaryAddressDesyncFactor time.Duration
 }
 
 // dadState holds the Duplicate Address Detection timer and channel to signal
@@ -392,28 +506,97 @@ type dadState struct {
 // defaultRouterState holds data associated with a default router discovered by
 // a Router Advertisement (RA).
 type defaultRouterState struct {
-	invalidationTimer tcpip.CancellableTimer
+	// Timer to invalidate the default router.
+	//
+	// Must not be nil.
+	invalidationTimer *tcpip.CancellableTimer
 }
 
 // onLinkPrefixState holds data associated with an on-link prefix discovered by
 // a Router Advertisement's Prefix Information option (PI) when the NDP
 // configurations was configured to do so.
 type onLinkPrefixState struct {
-	invalidationTimer tcpip.CancellableTimer
+	// Timer to invalidate the on-link prefix.
+	//
+	// Must not be nil.
+	invalidationTimer *tcpip.CancellableTimer
 }
 
-// autoGenAddressState holds data associated with an address generated via
-// SLAAC.
-type autoGenAddressState struct {
-	// A reference to the referencedNetworkEndpoint that this autoGenAddressState
-	// is holding state for.
+// tempSLAACAddrState holds state associated with a temporary SLAAC address.
+type tempSLAACAddrState struct {
+	// Timer to deprecate the temporary SLAAC address.
+	//
+	// Must not be nil.
+	deprecationTimer *tcpip.CancellableTimer
+
+	// Timer to invalidate the temporary SLAAC address.
+	//
+	// Must not be nil.
+	invalidationTimer *tcpip.CancellableTimer
+
+	// Timer to regenerate the temporary SLAAC address.
+	//
+	// Must not be nil.
+	regenTimer *tcpip.CancellableTimer
+
+	createdAt time.Time
+
+	// The address's endpoint.
+	//
+	// Must not be nil.
 	ref *referencedNetworkEndpoint
 
-	deprecationTimer  tcpip.CancellableTimer
-	invalidationTimer tcpip.CancellableTimer
+	// Has a new temporary SLAAC address already been regenerated?
+	regenerated bool
+}
+
+// slaacPrefixState holds state associated with a SLAAC prefix.
+type slaacPrefixState struct {
+	// Timer to deprecate the prefix.
+	//
+	// Must not be nil.
+	deprecationTimer *tcpip.CancellableTimer
+
+	// Timer to invalidate the prefix.
+	//
+	// Must not be nil.
+	invalidationTimer *tcpip.CancellableTimer
 
 	// Nonzero only when the address is not valid forever.
 	validUntil time.Time
+
+	// Nonzero only when the address is not preferred forever.
+	preferredUntil time.Time
+
+	// State associated with the stable address generated for the prefix.
+	stableAddr struct {
+		// The address's endpoint.
+		//
+		// May only be nil when the address is being (re-)generated. Otherwise,
+		// must not be nil as all SLAAC prefixes must have a stable address.
+		ref *referencedNetworkEndpoint
+
+		// The number of times an address has been generated locally where the NIC
+		// already had the generated address.
+		localGenerationFailures uint8
+	}
+
+	// The temporary (short-lived) addresses generated for the SLAAC prefix.
+	tempAddrs map[tcpip.Address]tempSLAACAddrState
+
+	// The next two fields are used by both stable and temporary addresses
+	// generated for a SLAAC prefix. This is safe as only 1 address will be
+	// in the generation and DAD process at any time. That is, no two addresses
+	// will be generated at the same time for a given SLAAC prefix.
+
+	// The number of times an address has been generated and added to the NIC.
+	//
+	// Addresses may be regenerated in reseponse to a DAD conflicts.
+	generationAttempts uint8
+
+	// The maximum number of times to attempt regeneration of a SLAAC address
+	// in response to DAD conflicts.
+	maxGenerationAttempts uint8
 }
 
 // startDuplicateAddressDetection performs Duplicate Address Detection.
@@ -430,7 +613,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 
 	if ref.getKind() != permanentTentative {
 		// The endpoint should be marked as tentative since we are starting DAD.
-		log.Fatalf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID())
+		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID()))
 	}
 
 	// Should not attempt to perform DAD on an address that is currently in the
@@ -442,7 +625,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		// address, or its reference count would have been increased without doing
 		// the work that would have been done for an address that was brand new.
 		// See NIC.addAddressLocked.
-		log.Fatalf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID())
+		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
 	}
 
 	remaining := ndp.configs.DupAddrDetectTransmits
@@ -478,7 +661,7 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		if ref.getKind() != permanentTentative {
 			// The endpoint should still be marked as tentative since we are still
 			// performing DAD on it.
-			log.Fatalf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.nic.ID())
+			panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.nic.ID()))
 		}
 
 		dadDone := remaining == 0
@@ -490,10 +673,10 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		}
 
 		ndp.nic.mu.Lock()
+		defer ndp.nic.mu.Unlock()
 		if done {
 			// If we reach this point, it means that DAD was stopped after we released
 			// the NIC's read lock and before we obtained the write lock.
-			ndp.nic.mu.Unlock()
 			return
 		}
 
@@ -505,8 +688,6 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 			// schedule the next DAD timer.
 			remaining--
 			timer.Reset(ndp.nic.stack.ndpConfigs.RetransmitTimer)
-
-			ndp.nic.mu.Unlock()
 			return
 		}
 
@@ -514,15 +695,18 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		// the last NDP NS. Either way, clean up addr's DAD state and let the
 		// integrator know DAD has completed.
 		delete(ndp.dad, addr)
-		ndp.nic.mu.Unlock()
-
-		if err != nil {
-			log.Printf("ndpdad: error occured during DAD iteration for addr (%s) on NIC(%d); err = %s", addr, ndp.nic.ID(), err)
-		}
 
 		if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
 			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, dadDone, err)
 		}
+
+		// If DAD resolved for a stable SLAAC address, attempt generation of a
+		// temporary SLAAC address.
+		if dadDone && ref.configType == slaac {
+			// Reset the generation attempts counter as we are starting the generation
+			// of a new address for the SLAAC prefix.
+			ndp.regenerateTempSLAACAddr(ref.addrWithPrefix().Subnet(), true /* resetGenAttempts */)
+		}
 	})
 
 	ndp.dad[addr] = dadState{
@@ -548,9 +732,9 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 	// Route should resolve immediately since snmc is a multicast address so a
 	// remote link address can be calculated without a resolution process.
 	if c, err := r.Resolve(nil); err != nil {
-		log.Fatalf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err)
+		panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err))
 	} else if c != nil {
-		log.Fatalf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID())
+		panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID()))
 	}
 
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
@@ -566,7 +750,7 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 			Protocol: header.ICMPv6ProtocolNumber,
 			TTL:      header.NDPHopLimit,
 			TOS:      DefaultTOS,
-		}, tcpip.PacketBuffer{Header: hdr},
+		}, &PacketBuffer{Header: hdr},
 	); err != nil {
 		sent.Dropped.Increment()
 		return err
@@ -688,7 +872,16 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 				continue
 			}
 
-			ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), opt.Addresses(), opt.Lifetime())
+			addrs, _ := opt.Addresses()
+			ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), addrs, opt.Lifetime())
+
+		case header.NDPDNSSearchList:
+			if ndp.nic.stack.ndpDisp == nil {
+				continue
+			}
+
+			domainNames, _ := opt.DomainNames()
+			ndp.nic.stack.ndpDisp.OnDNSSearchListOption(ndp.nic.ID(), domainNames, opt.Lifetime())
 
 		case header.NDPPrefixInformation:
 			prefix := opt.Subnet()
@@ -733,7 +926,6 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
 	}
 
 	rtr.invalidationTimer.StopLocked()
-
 	delete(ndp.defaultRouters, ip)
 
 	// Let the integrator know a discovered default router is invalidated.
@@ -762,7 +954,7 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
 	}
 
 	state := defaultRouterState{
-		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+		invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
 			ndp.invalidateDefaultRouter(ip)
 		}),
 	}
@@ -792,7 +984,7 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration)
 	}
 
 	state := onLinkPrefixState{
-		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+		invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
 			ndp.invalidateOnLinkPrefix(prefix)
 		}),
 	}
@@ -817,7 +1009,6 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
 	}
 
 	s.invalidationTimer.StopLocked()
-
 	delete(ndp.onLinkPrefixes, prefix)
 
 	// Let the integrator know a discovered on-link prefix is invalidated.
@@ -899,23 +1090,16 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 
 	prefix := pi.Subnet()
 
-	// Check if we already have an auto-generated address for prefix.
-	for addr, addrState := range ndp.autoGenAddresses {
-		refAddrWithPrefix := tcpip.AddressWithPrefix{Address: addr, PrefixLen: addrState.ref.ep.PrefixLen()}
-		if refAddrWithPrefix.Subnet() != prefix {
-			continue
-		}
-
-		// At this point, we know we are refreshing a SLAAC generated IPv6 address
-		// with the prefix prefix. Do the work as outlined by RFC 4862 section
-		// 5.5.3.e.
-		ndp.refreshAutoGenAddressLifetimes(addr, pl, vl)
+	// Check if we already maintain SLAAC state for prefix.
+	if state, ok := ndp.slaacPrefixes[prefix]; ok {
+		// As per RFC 4862 section 5.5.3.e, refresh prefix's SLAAC lifetimes.
+		ndp.refreshSLAACPrefixLifetimes(prefix, &state, pl, vl)
+		ndp.slaacPrefixes[prefix] = state
 		return
 	}
 
-	// We do not already have an address with the prefix prefix. Do the
-	// work as outlined by RFC 4862 section 5.5.3.d if n is configured
-	// to auto-generate global addresses by SLAAC.
+	// prefix is a new SLAAC prefix. Do the work as outlined by RFC 4862 section
+	// 5.5.3.d if ndp is configured to auto-generate new addresses via SLAAC.
 	if !ndp.configs.AutoGenGlobalAddresses {
 		return
 	}
@@ -927,6 +1111,8 @@ func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInform
 // for prefix.
 //
 // pl is the new preferred lifetime. vl is the new valid lifetime.
+//
+// The NIC that ndp belongs to MUST be locked.
 func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 	// If we do not already have an address for this prefix and the valid
 	// lifetime is 0, no need to do anything further, as per RFC 4862
@@ -942,237 +1128,668 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
 		return
 	}
 
-	addrBytes := []byte(prefix.ID())
-	if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
-		addrBytes = header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name), 0 /* dadCounter */, oIID.SecretKey)
-	} else {
-		// Only attempt to generate an interface-specific IID if we have a valid
-		// link address.
-		//
-		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
-		// LinkEndpoint.LinkAddress) before reaching this point.
-		linkAddr := ndp.nic.linkEP.LinkAddress()
-		if !header.IsValidUnicastEthernetAddress(linkAddr) {
-			return
-		}
+	state := slaacPrefixState{
+		deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+			state, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix))
+			}
+
+			ndp.deprecateSLAACAddress(state.stableAddr.ref)
+		}),
+		invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+			state, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix))
+			}
 
-		// Generate an address within prefix from the modified EUI-64 of ndp's NIC's
-		// Ethernet MAC address.
-		header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+			ndp.invalidateSLAACPrefix(prefix, state)
+		}),
+		tempAddrs:             make(map[tcpip.Address]tempSLAACAddrState),
+		maxGenerationAttempts: ndp.configs.AutoGenAddressConflictRetries + 1,
 	}
-	addr := tcpip.Address(addrBytes)
-	addrWithPrefix := tcpip.AddressWithPrefix{
-		Address:   addr,
-		PrefixLen: validPrefixLenForAutoGen,
+
+	now := time.Now()
+
+	// The time an address is preferred until is needed to properly generate the
+	// address.
+	if pl < header.NDPInfiniteLifetime {
+		state.preferredUntil = now.Add(pl)
 	}
 
-	// If the nic already has this address, do nothing further.
-	if ndp.nic.hasPermanentAddrLocked(addr) {
+	if !ndp.generateSLAACAddr(prefix, &state) {
+		// We were unable to generate an address for the prefix, we do not nothing
+		// further as there is no reason to maintain state or timers for a prefix we
+		// do not have an address for.
 		return
 	}
 
+	// Setup the initial timers to deprecate and invalidate prefix.
+
+	if pl < header.NDPInfiniteLifetime && pl != 0 {
+		state.deprecationTimer.Reset(pl)
+	}
+
+	if vl < header.NDPInfiniteLifetime {
+		state.invalidationTimer.Reset(vl)
+		state.validUntil = now.Add(vl)
+	}
+
+	// If the address is assigned (DAD resolved), generate a temporary address.
+	if state.stableAddr.ref.getKind() == permanent {
+		// Reset the generation attempts counter as we are starting the generation
+		// of a new address for the SLAAC prefix.
+		ndp.generateTempSLAACAddr(prefix, &state, true /* resetGenAttempts */)
+	}
+
+	ndp.slaacPrefixes[prefix] = state
+}
+
+// addSLAACAddr adds a SLAAC address to the NIC.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) addSLAACAddr(addr tcpip.AddressWithPrefix, configType networkEndpointConfigType, deprecated bool) *referencedNetworkEndpoint {
 	// Inform the integrator that we have a new SLAAC address.
 	ndpDisp := ndp.nic.stack.ndpDisp
 	if ndpDisp == nil {
-		return
+		return nil
 	}
-	if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addrWithPrefix) {
+
+	if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addr) {
 		// Informed by the integrator not to add the address.
-		return
+		return nil
 	}
 
 	protocolAddr := tcpip.ProtocolAddress{
 		Protocol:          header.IPv6ProtocolNumber,
-		AddressWithPrefix: addrWithPrefix,
+		AddressWithPrefix: addr,
 	}
-	// If the preferred lifetime is zero, then the address should be considered
-	// deprecated.
-	deprecated := pl == 0
-	ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, slaac, deprecated)
+
+	ref, err := ndp.nic.addAddressLocked(protocolAddr, FirstPrimaryEndpoint, permanent, configType, deprecated)
 	if err != nil {
-		log.Fatalf("ndp: error when adding address %s: %s", protocolAddr, err)
+		panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", protocolAddr, err))
+	}
+
+	return ref
+}
+
+// generateSLAACAddr generates a SLAAC address for prefix.
+//
+// Returns true if an address was successfully generated.
+//
+// Panics if the prefix is not a SLAAC prefix or it already has an address.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixState) bool {
+	if r := state.stableAddr.ref; r != nil {
+		panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permenant address %s", prefix, r.addrWithPrefix()))
+	}
+
+	// If we have already reached the maximum address generation attempts for the
+	// prefix, do not generate another address.
+	if state.generationAttempts == state.maxGenerationAttempts {
+		return false
+	}
+
+	var generatedAddr tcpip.AddressWithPrefix
+	addrBytes := []byte(prefix.ID())
+
+	for i := 0; ; i++ {
+		// If we were unable to generate an address after the maximum SLAAC address
+		// local regeneration attempts, do nothing further.
+		if i == maxSLAACAddrLocalRegenAttempts {
+			return false
+		}
+
+		dadCounter := state.generationAttempts + state.stableAddr.localGenerationFailures
+		if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
+			addrBytes = header.AppendOpaqueInterfaceIdentifier(
+				addrBytes[:header.IIDOffsetInIPv6Address],
+				prefix,
+				oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name),
+				dadCounter,
+				oIID.SecretKey,
+			)
+		} else if dadCounter == 0 {
+			// Modified-EUI64 based IIDs have no way to resolve DAD conflicts, so if
+			// the DAD counter is non-zero, we cannot use this method.
+			//
+			// Only attempt to generate an interface-specific IID if we have a valid
+			// link address.
+			//
+			// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+			// LinkEndpoint.LinkAddress) before reaching this point.
+			linkAddr := ndp.nic.linkEP.LinkAddress()
+			if !header.IsValidUnicastEthernetAddress(linkAddr) {
+				return false
+			}
+
+			// Generate an address within prefix from the modified EUI-64 of ndp's
+			// NIC's Ethernet MAC address.
+			header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+		} else {
+			// We have no way to regenerate an address in response to an address
+			// conflict when addresses are not generated with opaque IIDs.
+			return false
+		}
+
+		generatedAddr = tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(addrBytes),
+			PrefixLen: validPrefixLenForAutoGen,
+		}
+
+		if !ndp.nic.hasPermanentAddrLocked(generatedAddr.Address) {
+			break
+		}
+
+		state.stableAddr.localGenerationFailures++
+	}
+
+	if ref := ndp.addSLAACAddr(generatedAddr, slaac, time.Since(state.preferredUntil) >= 0 /* deprecated */); ref != nil {
+		state.stableAddr.ref = ref
+		state.generationAttempts++
+		return true
+	}
+
+	return false
+}
+
+// regenerateSLAACAddr regenerates an address for a SLAAC prefix.
+//
+// If generating a new address for the prefix fails, the prefix will be
+// invalidated.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) {
+	state, ok := ndp.slaacPrefixes[prefix]
+	if !ok {
+		panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate address for %s", prefix))
+	}
+
+	if ndp.generateSLAACAddr(prefix, &state) {
+		ndp.slaacPrefixes[prefix] = state
+		return
+	}
+
+	// We were unable to generate a permanent address for the SLAAC prefix so
+	// invalidate the prefix as there is no reason to maintain state for a
+	// SLAAC prefix we do not have an address for.
+	ndp.invalidateSLAACPrefix(prefix, state)
+}
+
+// generateTempSLAACAddr generates a new temporary SLAAC address.
+//
+// If resetGenAttempts is true, the prefix's generation counter will be reset.
+//
+// Returns true if a new address was generated.
+func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *slaacPrefixState, resetGenAttempts bool) bool {
+	// Are we configured to auto-generate new temporary global addresses for the
+	// prefix?
+	if !ndp.configs.AutoGenTempGlobalAddresses || prefix == header.IPv6LinkLocalPrefix.Subnet() {
+		return false
+	}
+
+	if resetGenAttempts {
+		prefixState.generationAttempts = 0
+		prefixState.maxGenerationAttempts = ndp.configs.AutoGenAddressConflictRetries + 1
+	}
+
+	// If we have already reached the maximum address generation attempts for the
+	// prefix, do not generate another address.
+	if prefixState.generationAttempts == prefixState.maxGenerationAttempts {
+		return false
+	}
+
+	stableAddr := prefixState.stableAddr.ref.ep.ID().LocalAddress
+	now := time.Now()
+
+	// As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary
+	// address is the lower of the valid lifetime of the stable address or the
+	// maximum temporary address valid lifetime.
+	vl := ndp.configs.MaxTempAddrValidLifetime
+	if prefixState.validUntil != (time.Time{}) {
+		if prefixVL := prefixState.validUntil.Sub(now); vl > prefixVL {
+			vl = prefixVL
+		}
+	}
+
+	if vl <= 0 {
+		// Cannot create an address without a valid lifetime.
+		return false
+	}
+
+	// As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary
+	// address is the lower of the preferred lifetime of the stable address or the
+	// maximum temporary address preferred lifetime - the temporary address desync
+	// factor.
+	pl := ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor
+	if prefixState.preferredUntil != (time.Time{}) {
+		if prefixPL := prefixState.preferredUntil.Sub(now); pl > prefixPL {
+			// Respect the preferred lifetime of the prefix, as per RFC 4941 section
+			// 3.3 step 4.
+			pl = prefixPL
+		}
 	}
 
-	state := autoGenAddressState{
-		ref: ref,
-		deprecationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
-			addrState, ok := ndp.autoGenAddresses[addr]
+	// As per RFC 4941 section 3.3 step 5, a temporary address is created only if
+	// the calculated preferred lifetime is greater than the advance regeneration
+	// duration. In particular, we MUST NOT create a temporary address with a zero
+	// Preferred Lifetime.
+	if pl <= ndp.configs.RegenAdvanceDuration {
+		return false
+	}
+
+	// Attempt to generate a new address that is not already assigned to the NIC.
+	var generatedAddr tcpip.AddressWithPrefix
+	for i := 0; ; i++ {
+		// If we were unable to generate an address after the maximum SLAAC address
+		// local regeneration attempts, do nothing further.
+		if i == maxSLAACAddrLocalRegenAttempts {
+			return false
+		}
+
+		generatedAddr = header.GenerateTempIPv6SLAACAddr(ndp.temporaryIIDHistory[:], stableAddr)
+		if !ndp.nic.hasPermanentAddrLocked(generatedAddr.Address) {
+			break
+		}
+	}
+
+	// As per RFC RFC 4941 section 3.3 step 5, we MUST NOT create a temporary
+	// address with a zero preferred lifetime. The checks above ensure this
+	// so we know the address is not deprecated.
+	ref := ndp.addSLAACAddr(generatedAddr, slaacTemp, false /* deprecated */)
+	if ref == nil {
+		return false
+	}
+
+	state := tempSLAACAddrState{
+		deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+			prefixState, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr))
+			}
+
+			tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address]
 			if !ok {
-				log.Fatalf("ndp: must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr)
+				panic(fmt.Sprintf("ndp: must have a tempAddr entry to deprecate temporary address %s", generatedAddr))
 			}
-			addrState.ref.deprecated = true
-			ndp.notifyAutoGenAddressDeprecated(addr)
+
+			ndp.deprecateSLAACAddress(tempAddrState.ref)
 		}),
-		invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
-			ndp.invalidateAutoGenAddress(addr)
+		invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+			prefixState, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr))
+			}
+
+			tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a tempAddr entry to invalidate temporary address %s", generatedAddr))
+			}
+
+			ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState)
 		}),
-	}
+		regenTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+			prefixState, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr))
+			}
 
-	// Setup the initial timers to deprecate and invalidate this newly generated
-	// address.
+			tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a tempAddr entry to regenerate temporary address after %s", generatedAddr))
+			}
 
-	if !deprecated && pl < header.NDPInfiniteLifetime {
-		state.deprecationTimer.Reset(pl)
-	}
+			// If an address has already been regenerated for this address, don't
+			// regenerate another address.
+			if tempAddrState.regenerated {
+				return
+			}
 
-	if vl < header.NDPInfiniteLifetime {
-		state.invalidationTimer.Reset(vl)
-		state.validUntil = time.Now().Add(vl)
+			// Reset the generation attempts counter as we are starting the generation
+			// of a new address for the SLAAC prefix.
+			tempAddrState.regenerated = ndp.generateTempSLAACAddr(prefix, &prefixState, true /* resetGenAttempts */)
+			prefixState.tempAddrs[generatedAddr.Address] = tempAddrState
+			ndp.slaacPrefixes[prefix] = prefixState
+		}),
+		createdAt: now,
+		ref:       ref,
 	}
 
-	ndp.autoGenAddresses[addr] = state
+	state.deprecationTimer.Reset(pl)
+	state.invalidationTimer.Reset(vl)
+	state.regenTimer.Reset(pl - ndp.configs.RegenAdvanceDuration)
+
+	prefixState.generationAttempts++
+	prefixState.tempAddrs[generatedAddr.Address] = state
+
+	return true
 }
 
-// refreshAutoGenAddressLifetimes refreshes the lifetime of a SLAAC generated
-// address addr.
+// regenerateTempSLAACAddr regenerates a temporary address for a SLAAC prefix.
 //
-// pl is the new preferred lifetime. vl is the new valid lifetime.
-func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl time.Duration) {
-	addrState, ok := ndp.autoGenAddresses[addr]
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) regenerateTempSLAACAddr(prefix tcpip.Subnet, resetGenAttempts bool) {
+	state, ok := ndp.slaacPrefixes[prefix]
 	if !ok {
-		log.Fatalf("ndp: SLAAC state not found to refresh lifetimes for %s", addr)
+		panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate temporary address for %s", prefix))
 	}
-	defer func() { ndp.autoGenAddresses[addr] = addrState }()
 
-	// If the preferred lifetime is zero, then the address should be considered
-	// deprecated.
-	deprecated := pl == 0
-	wasDeprecated := addrState.ref.deprecated
-	addrState.ref.deprecated = deprecated
+	ndp.generateTempSLAACAddr(prefix, &state, resetGenAttempts)
+	ndp.slaacPrefixes[prefix] = state
+}
 
-	// Only send the deprecation event if the deprecated status for addr just
-	// changed from non-deprecated to deprecated.
-	if !wasDeprecated && deprecated {
-		ndp.notifyAutoGenAddressDeprecated(addr)
+// refreshSLAACPrefixLifetimes refreshes the lifetimes of a SLAAC prefix.
+//
+// pl is the new preferred lifetime. vl is the new valid lifetime.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixState *slaacPrefixState, pl, vl time.Duration) {
+	// If the preferred lifetime is zero, then the prefix should be deprecated.
+	deprecated := pl == 0
+	if deprecated {
+		ndp.deprecateSLAACAddress(prefixState.stableAddr.ref)
+	} else {
+		prefixState.stableAddr.ref.deprecated = false
 	}
 
-	// If addr was preferred for some finite lifetime before, stop the deprecation
-	// timer so it can be reset.
-	addrState.deprecationTimer.StopLocked()
+	// If prefix was preferred for some finite lifetime before, stop the
+	// deprecation timer so it can be reset.
+	prefixState.deprecationTimer.StopLocked()
 
-	// Reset the deprecation timer if addr has a finite preferred lifetime.
-	if !deprecated && pl < header.NDPInfiniteLifetime {
-		addrState.deprecationTimer.Reset(pl)
+	now := time.Now()
+
+	// Reset the deprecation timer if prefix has a finite preferred lifetime.
+	if pl < header.NDPInfiniteLifetime {
+		if !deprecated {
+			prefixState.deprecationTimer.Reset(pl)
+		}
+		prefixState.preferredUntil = now.Add(pl)
+	} else {
+		prefixState.preferredUntil = time.Time{}
 	}
 
-	// As per RFC 4862 section 5.5.3.e, the valid lifetime of the address
-	//
+	// As per RFC 4862 section 5.5.3.e, update the valid lifetime for prefix:
 	//
 	// 1) If the received Valid Lifetime is greater than 2 hours or greater than
-	//    RemainingLifetime, set the valid lifetime of the address to the
+	//    RemainingLifetime, set the valid lifetime of the prefix to the
 	//    advertised Valid Lifetime.
 	//
 	// 2) If RemainingLifetime is less than or equal to 2 hours, ignore the
 	//    advertised Valid Lifetime.
 	//
-	// 3) Otherwise, reset the valid lifetime of the address to 2 hours.
+	// 3) Otherwise, reset the valid lifetime of the prefix to 2 hours.
 
-	// Handle the infinite valid lifetime separately as we do not keep a timer in
-	// this case.
 	if vl >= header.NDPInfiniteLifetime {
-		addrState.invalidationTimer.StopLocked()
-		addrState.validUntil = time.Time{}
+		// Handle the infinite valid lifetime separately as we do not keep a timer
+		// in this case.
+		prefixState.invalidationTimer.StopLocked()
+		prefixState.validUntil = time.Time{}
+	} else {
+		var effectiveVl time.Duration
+		var rl time.Duration
+
+		// If the prefix was originally set to be valid forever, assume the
+		// remaining time to be the maximum possible value.
+		if prefixState.validUntil == (time.Time{}) {
+			rl = header.NDPInfiniteLifetime
+		} else {
+			rl = time.Until(prefixState.validUntil)
+		}
+
+		if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
+			effectiveVl = vl
+		} else if rl > MinPrefixInformationValidLifetimeForUpdate {
+			effectiveVl = MinPrefixInformationValidLifetimeForUpdate
+		}
+
+		if effectiveVl != 0 {
+			prefixState.invalidationTimer.StopLocked()
+			prefixState.invalidationTimer.Reset(effectiveVl)
+			prefixState.validUntil = now.Add(effectiveVl)
+		}
+	}
+
+	// If DAD is not yet complete on the stable address, there is no need to do
+	// work with temporary addresses.
+	if prefixState.stableAddr.ref.getKind() != permanent {
 		return
 	}
 
-	var effectiveVl time.Duration
-	var rl time.Duration
+	// Note, we do not need to update the entries in the temporary address map
+	// after updating the timers because the timers are held as pointers.
+	var regenForAddr tcpip.Address
+	allAddressesRegenerated := true
+	for tempAddr, tempAddrState := range prefixState.tempAddrs {
+		// As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary
+		// address is the lower of the valid lifetime of the stable address or the
+		// maximum temporary address valid lifetime. Note, the valid lifetime of a
+		// temporary address is relative to the address's creation time.
+		validUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrValidLifetime)
+		if prefixState.validUntil != (time.Time{}) && validUntil.Sub(prefixState.validUntil) > 0 {
+			validUntil = prefixState.validUntil
+		}
 
-	// If the address was originally set to be valid forever, assume the remaining
-	// time to be the maximum possible value.
-	if addrState.validUntil == (time.Time{}) {
-		rl = header.NDPInfiniteLifetime
-	} else {
-		rl = time.Until(addrState.validUntil)
+		// If the address is no longer valid, invalidate it immediately. Otherwise,
+		// reset the invalidation timer.
+		newValidLifetime := validUntil.Sub(now)
+		if newValidLifetime <= 0 {
+			ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, tempAddr, tempAddrState)
+			continue
+		}
+		tempAddrState.invalidationTimer.StopLocked()
+		tempAddrState.invalidationTimer.Reset(newValidLifetime)
+
+		// As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary
+		// address is the lower of the preferred lifetime of the stable address or
+		// the maximum temporary address preferred lifetime - the temporary address
+		// desync factor. Note, the preferred lifetime of a temporary address is
+		// relative to the address's creation time.
+		preferredUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor)
+		if prefixState.preferredUntil != (time.Time{}) && preferredUntil.Sub(prefixState.preferredUntil) > 0 {
+			preferredUntil = prefixState.preferredUntil
+		}
+
+		// If the address is no longer preferred, deprecate it immediately.
+		// Otherwise, reset the deprecation timer.
+		newPreferredLifetime := preferredUntil.Sub(now)
+		tempAddrState.deprecationTimer.StopLocked()
+		if newPreferredLifetime <= 0 {
+			ndp.deprecateSLAACAddress(tempAddrState.ref)
+		} else {
+			tempAddrState.ref.deprecated = false
+			tempAddrState.deprecationTimer.Reset(newPreferredLifetime)
+		}
+
+		tempAddrState.regenTimer.StopLocked()
+		if tempAddrState.regenerated {
+		} else {
+			allAddressesRegenerated = false
+
+			if newPreferredLifetime <= ndp.configs.RegenAdvanceDuration {
+				// The new preferred lifetime is less than the advance regeneration
+				// duration so regenerate an address for this temporary address
+				// immediately after we finish iterating over the temporary addresses.
+				regenForAddr = tempAddr
+			} else {
+				tempAddrState.regenTimer.Reset(newPreferredLifetime - ndp.configs.RegenAdvanceDuration)
+			}
+		}
 	}
 
-	if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
-		effectiveVl = vl
-	} else if rl <= MinPrefixInformationValidLifetimeForUpdate {
+	// Generate a new temporary address if all of the existing temporary addresses
+	// have been regenerated, or we need to immediately regenerate an address
+	// due to an update in preferred lifetime.
+	//
+	// If each temporay address has already been regenerated, no new temporary
+	// address will be generated. To ensure continuation of temporary SLAAC
+	// addresses, we manually try to regenerate an address here.
+	if len(regenForAddr) != 0 || allAddressesRegenerated {
+		// Reset the generation attempts counter as we are starting the generation
+		// of a new address for the SLAAC prefix.
+		if state, ok := prefixState.tempAddrs[regenForAddr]; ndp.generateTempSLAACAddr(prefix, prefixState, true /* resetGenAttempts */) && ok {
+			state.regenerated = true
+			prefixState.tempAddrs[regenForAddr] = state
+		}
+	}
+}
+
+// deprecateSLAACAddress marks ref as deprecated and notifies the stack's NDP
+// dispatcher that ref has been deprecated.
+//
+// deprecateSLAACAddress does nothing if ref is already deprecated.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) deprecateSLAACAddress(ref *referencedNetworkEndpoint) {
+	if ref.deprecated {
 		return
-	} else {
-		effectiveVl = MinPrefixInformationValidLifetimeForUpdate
 	}
 
-	addrState.invalidationTimer.StopLocked()
-	addrState.invalidationTimer.Reset(effectiveVl)
-	addrState.validUntil = time.Now().Add(effectiveVl)
+	ref.deprecated = true
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), ref.addrWithPrefix())
+	}
 }
 
-// notifyAutoGenAddressDeprecated notifies the stack's NDP dispatcher that addr
-// has been deprecated.
-func (ndp *ndpState) notifyAutoGenAddressDeprecated(addr tcpip.Address) {
-	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), tcpip.AddressWithPrefix{
-			Address:   addr,
-			PrefixLen: validPrefixLenForAutoGen,
-		})
+// invalidateSLAACPrefix invalidates a SLAAC prefix.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefixState) {
+	if r := state.stableAddr.ref; r != nil {
+		// Since we are already invalidating the prefix, do not invalidate the
+		// prefix when removing the address.
+		if err := ndp.nic.removePermanentIPv6EndpointLocked(r, false /* allowSLAACInvalidation */); err != nil {
+			panic(fmt.Sprintf("ndp: error removing stable SLAAC address %s: %s", r.addrWithPrefix(), err))
+		}
 	}
+
+	ndp.cleanupSLAACPrefixResources(prefix, state)
 }
 
-// invalidateAutoGenAddress invalidates an auto-generated address.
+// cleanupSLAACAddrResourcesAndNotify cleans up an invalidated SLAAC address's
+// resources.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) invalidateAutoGenAddress(addr tcpip.Address) {
-	if !ndp.cleanupAutoGenAddrResourcesAndNotify(addr) {
+func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) {
+	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr)
+	}
+
+	prefix := addr.Subnet()
+	state, ok := ndp.slaacPrefixes[prefix]
+	if !ok || state.stableAddr.ref == nil || addr.Address != state.stableAddr.ref.ep.ID().LocalAddress {
 		return
 	}
 
-	ndp.nic.removePermanentAddressLocked(addr)
+	if !invalidatePrefix {
+		// If the prefix is not being invalidated, disassociate the address from the
+		// prefix and do nothing further.
+		state.stableAddr.ref = nil
+		ndp.slaacPrefixes[prefix] = state
+		return
+	}
+
+	ndp.cleanupSLAACPrefixResources(prefix, state)
 }
 
-// cleanupAutoGenAddrResourcesAndNotify cleans up an invalidated auto-generated
-// address's resources from ndp. If the stack has an NDP dispatcher, it will
-// be notified that addr has been invalidated.
+// cleanupSLAACPrefixResources cleansup a SLAAC prefix's timers and entry.
 //
-// Returns true if ndp had resources for addr to cleanup.
+// Panics if the SLAAC prefix is not known.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bool {
-	state, ok := ndp.autoGenAddresses[addr]
-	if !ok {
-		return false
+func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaacPrefixState) {
+	// Invalidate all temporary addresses.
+	for tempAddr, tempAddrState := range state.tempAddrs {
+		ndp.invalidateTempSLAACAddr(state.tempAddrs, tempAddr, tempAddrState)
 	}
 
+	state.stableAddr.ref = nil
 	state.deprecationTimer.StopLocked()
 	state.invalidationTimer.StopLocked()
-	delete(ndp.autoGenAddresses, addr)
+	delete(ndp.slaacPrefixes, prefix)
+}
+
+// invalidateTempSLAACAddr invalidates a temporary SLAAC address.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) {
+	// Since we are already invalidating the address, do not invalidate the
+	// address when removing the address.
+	if err := ndp.nic.removePermanentIPv6EndpointLocked(tempAddrState.ref, false /* allowSLAACInvalidation */); err != nil {
+		panic(fmt.Sprintf("error removing temporary SLAAC address %s: %s", tempAddrState.ref.addrWithPrefix(), err))
+	}
+
+	ndp.cleanupTempSLAACAddrResources(tempAddrs, tempAddr, tempAddrState)
+}
 
+// cleanupTempSLAACAddrResourcesAndNotify cleans up an invalidated temporary
+// SLAAC address's resources from ndp.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidateAddr bool) {
 	if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
-		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{
-			Address:   addr,
-			PrefixLen: validPrefixLenForAutoGen,
-		})
+		ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr)
 	}
 
-	return true
+	if !invalidateAddr {
+		return
+	}
+
+	prefix := addr.Subnet()
+	state, ok := ndp.slaacPrefixes[prefix]
+	if !ok {
+		panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry to clean up temp addr %s resources", addr))
+	}
+
+	tempAddrState, ok := state.tempAddrs[addr.Address]
+	if !ok {
+		panic(fmt.Sprintf("ndp: must have a tempAddr entry to clean up temp addr %s resources", addr))
+	}
+
+	ndp.cleanupTempSLAACAddrResources(state.tempAddrs, addr.Address, tempAddrState)
+}
+
+// cleanupTempSLAACAddrResourcesAndNotify cleans up a temporary SLAAC address's
+// timers and entry.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupTempSLAACAddrResources(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) {
+	tempAddrState.deprecationTimer.StopLocked()
+	tempAddrState.invalidationTimer.StopLocked()
+	tempAddrState.regenTimer.StopLocked()
+	delete(tempAddrs, tempAddr)
 }
 
-// cleanupHostOnlyState cleans up any state that is only useful for hosts.
+// cleanupState cleans up ndp's state.
+//
+// If hostOnly is true, then only host-specific state will be cleaned up.
 //
-// cleanupHostOnlyState MUST be called when ndp's NIC is transitioning from a
-// host to a router. This function will invalidate all discovered on-link
-// prefixes, discovered routers, and auto-generated addresses as routers do not
-// normally process Router Advertisements to discover default routers and
-// on-link prefixes, and auto-generate addresses via SLAAC.
+// cleanupState MUST be called with hostOnly set to true when ndp's NIC is
+// transitioning from a host to a router. This function will invalidate all
+// discovered on-link prefixes, discovered routers, and auto-generated
+// addresses.
+//
+// If hostOnly is true, then the link-local auto-generated address will not be
+// invalidated as routers are also expected to generate a link-local address.
 //
 // The NIC that ndp belongs to MUST be locked.
-func (ndp *ndpState) cleanupHostOnlyState() {
+func (ndp *ndpState) cleanupState(hostOnly bool) {
 	linkLocalSubnet := header.IPv6LinkLocalPrefix.Subnet()
-	linkLocalAddrs := 0
-	for addr := range ndp.autoGenAddresses {
+	linkLocalPrefixes := 0
+	for prefix, state := range ndp.slaacPrefixes {
 		// RFC 4862 section 5 states that routers are also expected to generate a
-		// link-local address so we do not invalidate them.
-		if linkLocalSubnet.Contains(addr) {
-			linkLocalAddrs++
+		// link-local address so we do not invalidate them if we are cleaning up
+		// host-only state.
+		if hostOnly && prefix == linkLocalSubnet {
+			linkLocalPrefixes++
 			continue
 		}
 
-		ndp.invalidateAutoGenAddress(addr)
+		ndp.invalidateSLAACPrefix(prefix, state)
 	}
 
-	if got := len(ndp.autoGenAddresses); got != linkLocalAddrs {
-		log.Fatalf("ndp: still have non-linklocal auto-generated addresses after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalAddrs)
+	if got := len(ndp.slaacPrefixes); got != linkLocalPrefixes {
+		panic(fmt.Sprintf("ndp: still have non-linklocal SLAAC prefixes after cleaning up; found = %d prefixes, of which %d are link-local", got, linkLocalPrefixes))
 	}
 
 	for prefix := range ndp.onLinkPrefixes {
@@ -1180,7 +1797,7 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 	}
 
 	if got := len(ndp.onLinkPrefixes); got != 0 {
-		log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got)
+		panic(fmt.Sprintf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got))
 	}
 
 	for router := range ndp.defaultRouters {
@@ -1188,8 +1805,10 @@ func (ndp *ndpState) cleanupHostOnlyState() {
 	}
 
 	if got := len(ndp.defaultRouters); got != 0 {
-		log.Fatalf("ndp: still have discovered default routers after cleaning up; found = %d", got)
+		panic(fmt.Sprintf("ndp: still have discovered default routers after cleaning up; found = %d", got))
 	}
+
+	ndp.dhcpv6Configuration = 0
 }
 
 // startSolicitingRouters starts soliciting routers, as per RFC 4861 section
@@ -1215,24 +1834,45 @@ func (ndp *ndpState) startSolicitingRouters() {
 	}
 
 	ndp.rtrSolicitTimer = time.AfterFunc(delay, func() {
-		// Send an RS message with the unspecified source address.
-		ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
-		r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+		// As per RFC 4861 section 4.1, the source of the RS is an address assigned
+		// to the sending interface, or the unspecified address if no address is
+		// assigned to the sending interface.
+		ref := ndp.nic.primaryIPv6Endpoint(header.IPv6AllRoutersMulticastAddress)
+		if ref == nil {
+			ref = ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
+		}
+		localAddr := ref.ep.ID().LocalAddress
+		r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 		defer r.Release()
 
 		// Route should resolve immediately since
 		// header.IPv6AllRoutersMulticastAddress is a multicast address so a
 		// remote link address can be calculated without a resolution process.
 		if c, err := r.Resolve(nil); err != nil {
-			log.Fatalf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err)
+			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err))
 		} else if c != nil {
-			log.Fatalf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID())
+			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID()))
 		}
 
-		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
-		hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadSize)
+		// As per RFC 4861 section 4.1, an NDP RS SHOULD include the source
+		// link-layer address option if the source address of the NDP RS is
+		// specified. This option MUST NOT be included if the source address is
+		// unspecified.
+		//
+		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+		// LinkEndpoint.LinkAddress) before reaching this point.
+		var optsSerializer header.NDPOptionsSerializer
+		if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(r.LocalLinkAddress) {
+			optsSerializer = header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(r.LocalLinkAddress),
+			}
+		}
+		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + int(optsSerializer.Length())
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + payloadSize)
 		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
 		pkt.SetType(header.ICMPv6RouterSolicit)
+		rs := header.NDPRouterSolicit(pkt.NDPPayload())
+		rs.Options().Serialize(optsSerializer)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 		sent := r.Stats().ICMP.V6PacketsSent
@@ -1241,7 +1881,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 				Protocol: header.ICMPv6ProtocolNumber,
 				TTL:      header.NDPHopLimit,
 				TOS:      DefaultTOS,
-			}, tcpip.PacketBuffer{Header: hdr},
+			}, &PacketBuffer{Header: hdr},
 		); err != nil {
 			sent.Dropped.Increment()
 			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
@@ -1281,3 +1921,13 @@ func (ndp *ndpState) stopSolicitingRouters() {
 	ndp.rtrSolicitTimer.Stop()
 	ndp.rtrSolicitTimer = nil
 }
+
+// initializeTempAddrState initializes state related to temporary SLAAC
+// addresses.
+func (ndp *ndpState) initializeTempAddrState() {
+	header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.nic.stack.tempIIDSeed, ndp.nic.ID())
+
+	if MaxDesyncFactor != 0 {
+		ndp.temporaryAddressDesyncFactor = time.Duration(rand.Int63n(int64(MaxDesyncFactor)))
+	}
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 1f6f77439..58f1ebf60 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -133,6 +133,12 @@ type ndpRDNSSEvent struct {
 	rdnss ndpRDNSS
 }
 
+type ndpDNSSLEvent struct {
+	nicID       tcpip.NICID
+	domainNames []string
+	lifetime    time.Duration
+}
+
 type ndpDHCPv6Event struct {
 	nicID         tcpip.NICID
 	configuration stack.DHCPv6ConfigurationFromNDPRA
@@ -150,6 +156,8 @@ type ndpDispatcher struct {
 	rememberPrefix       bool
 	autoGenAddrC         chan ndpAutoGenAddrEvent
 	rdnssC               chan ndpRDNSSEvent
+	dnsslC               chan ndpDNSSLEvent
+	routeTable           []tcpip.Route
 	dhcpv6ConfigurationC chan ndpDHCPv6Event
 }
 
@@ -257,6 +265,17 @@ func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tc
 	}
 }
 
+// Implements stack.NDPDispatcher.OnDNSSearchListOption.
+func (n *ndpDispatcher) OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration) {
+	if n.dnsslC != nil {
+		n.dnsslC <- ndpDNSSLEvent{
+			nicID,
+			domainNames,
+			lifetime,
+		}
+	}
+}
+
 // Implements stack.NDPDispatcher.OnDHCPv6Configuration.
 func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration stack.DHCPv6ConfigurationFromNDPRA) {
 	if c := n.dhcpv6ConfigurationC; c != nil {
@@ -267,6 +286,17 @@ func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration s
 	}
 }
 
+// channelLinkWithHeaderLength is a channel.Endpoint with a configurable
+// header length.
+type channelLinkWithHeaderLength struct {
+	*channel.Endpoint
+	headerLength uint16
+}
+
+func (l *channelLinkWithHeaderLength) MaxHeaderLength() uint16 {
+	return l.headerLength
+}
+
 // Check e to make sure that the event is for addr on nic with ID 1, and the
 // resolved flag set to resolved with the specified err.
 func checkDADEvent(e ndpDADEvent, nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) string {
@@ -323,21 +353,46 @@ func TestDADDisabled(t *testing.T) {
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
 // RetransmitTimer (<1ms) values get fixed to the default RetransmitTimer of 1s.
+// This tests also validates the NDP NS packet that is transmitted.
 func TestDADResolve(t *testing.T) {
 	const nicID = 1
 
 	tests := []struct {
 		name                    string
+		linkHeaderLen           uint16
 		dupAddrDetectTransmits  uint8
 		retransTimer            time.Duration
 		expectedRetransmitTimer time.Duration
 	}{
-		{"1:1s:1s", 1, time.Second, time.Second},
-		{"2:1s:1s", 2, time.Second, time.Second},
-		{"1:2s:2s", 1, 2 * time.Second, 2 * time.Second},
+		{
+			name:                    "1:1s:1s",
+			dupAddrDetectTransmits:  1,
+			retransTimer:            time.Second,
+			expectedRetransmitTimer: time.Second,
+		},
+		{
+			name:                    "2:1s:1s",
+			linkHeaderLen:           1,
+			dupAddrDetectTransmits:  2,
+			retransTimer:            time.Second,
+			expectedRetransmitTimer: time.Second,
+		},
+		{
+			name:                    "1:2s:2s",
+			linkHeaderLen:           2,
+			dupAddrDetectTransmits:  1,
+			retransTimer:            2 * time.Second,
+			expectedRetransmitTimer: 2 * time.Second,
+		},
 		// 0s is an invalid RetransmitTimer timer and will be fixed to
 		// the default RetransmitTimer value of 1s.
-		{"1:0s:1s", 1, 0, time.Second},
+		{
+			name:                    "1:0s:1s",
+			linkHeaderLen:           3,
+			dupAddrDetectTransmits:  1,
+			retransTimer:            0,
+			expectedRetransmitTimer: time.Second,
+		},
 	}
 
 	for _, test := range tests {
@@ -356,10 +411,13 @@ func TestDADResolve(t *testing.T) {
 			opts.NDPConfigs.RetransmitTimer = test.retransTimer
 			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
 
-			e := channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1)
-			e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			e := channelLinkWithHeaderLength{
+				Endpoint:     channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1),
+				headerLength: test.linkHeaderLen,
+			}
+			e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 			s := stack.New(opts)
-			if err := s.CreateNIC(nicID, e); err != nil {
+			if err := s.CreateNIC(nicID, &e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
 			}
 
@@ -367,8 +425,7 @@ func TestDADResolve(t *testing.T) {
 				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
 			}
 
-			// Address should not be considered bound to the NIC yet
-			// (DAD ongoing).
+			// Address should not be considered bound to the NIC yet (DAD ongoing).
 			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
@@ -377,10 +434,9 @@ func TestDADResolve(t *testing.T) {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
 
-			// Wait for the remaining time - some delta (500ms), to
-			// make sure the address is still not resolved.
-			const delta = 500 * time.Millisecond
-			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - delta)
+			// Make sure the address does not resolve before the resolution time has
+			// passed.
+			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - defaultAsyncEventTimeout)
 			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
 			if err != nil {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
@@ -391,13 +447,7 @@ func TestDADResolve(t *testing.T) {
 
 			// Wait for DAD to resolve.
 			select {
-			case <-time.After(2 * delta):
-				// We should get a resolution event after 500ms
-				// (delta) since we wait for 500ms less than the
-				// expected resolution time above to make sure
-				// that the address did not yet resolve. Waiting
-				// for 1s (2x delta) without a resolution event
-				// means something is wrong.
+			case <-time.After(2 * defaultAsyncEventTimeout):
 				t.Fatal("timed out waiting for DAD resolution")
 			case e := <-ndpDisp.dadC:
 				if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" {
@@ -437,7 +487,7 @@ func TestDADResolve(t *testing.T) {
 				// As per RFC 4861 section 4.3, a possible option is the Source Link
 				// Layer option, but this option MUST NOT be included when the source
 				// address of the packet is the unspecified address.
-				checker.IPv6(t, p.Pkt.Header.View().ToVectorisedView().First(),
+				checker.IPv6(t, p.Pkt.Header.View(),
 					checker.SrcAddr(header.IPv6Any),
 					checker.DstAddr(snmc),
 					checker.TTL(header.NDPHopLimit),
@@ -445,6 +495,10 @@ func TestDADResolve(t *testing.T) {
 						checker.NDPNSTargetAddress(addr1),
 						checker.NDPNSOptions(nil),
 					))
+
+				if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
+					t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want)
+				}
 			}
 		})
 	}
@@ -559,7 +613,7 @@ func TestDADFail(t *testing.T) {
 			// Receive a packet to simulate multiple nodes owning or
 			// attempting to own the same address.
 			hdr := test.makeBuf(addr1)
-			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+			e.InjectInbound(header.IPv6ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -588,74 +642,121 @@ func TestDADFail(t *testing.T) {
 			if want := (tcpip.AddressWithPrefix{}); addr != want {
 				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
 			}
+
+			// Attempting to add the address again should not fail if the address's
+			// state was cleaned up when DAD failed.
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
 		})
 	}
 }
 
-// TestDADStop tests to make sure that the DAD process stops when an address is
-// removed.
 func TestDADStop(t *testing.T) {
 	const nicID = 1
 
-	ndpDisp := ndpDispatcher{
-		dadC: make(chan ndpDADEvent, 1),
-	}
-	ndpConfigs := stack.NDPConfigurations{
-		RetransmitTimer:        time.Second,
-		DupAddrDetectTransmits: 2,
-	}
-	opts := stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPDisp:          &ndpDisp,
-		NDPConfigs:       ndpConfigs,
-	}
+	tests := []struct {
+		name               string
+		stopFn             func(t *testing.T, s *stack.Stack)
+		skipFinalAddrCheck bool
+	}{
+		// Tests to make sure that DAD stops when an address is removed.
+		{
+			name: "Remove address",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.RemoveAddress(nicID, addr1); err != nil {
+					t.Fatalf("RemoveAddress(%d, %s): %s", nicID, addr1, err)
+				}
+			},
+		},
 
-	e := channel.New(0, 1280, linkAddr1)
-	s := stack.New(opts)
-	if err := s.CreateNIC(nicID, e); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
-	}
+		// Tests to make sure that DAD stops when the NIC is disabled.
+		{
+			name: "Disable NIC",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.DisableNIC(nicID); err != nil {
+					t.Fatalf("DisableNIC(%d): %s", nicID, err)
+				}
+			},
+		},
 
-	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
-		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+		// Tests to make sure that DAD stops when the NIC is removed.
+		{
+			name: "Remove NIC",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.RemoveNIC(nicID); err != nil {
+					t.Fatalf("RemoveNIC(%d): %s", nicID, err)
+				}
+			},
+			// The NIC is removed so we can't check its addresses after calling
+			// stopFn.
+			skipFinalAddrCheck: true,
+		},
 	}
 
-	// Address should not be considered bound to the NIC yet (DAD ongoing).
-	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
-	}
-	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
-	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent, 1),
+			}
+			ndpConfigs := stack.NDPConfigurations{
+				RetransmitTimer:        time.Second,
+				DupAddrDetectTransmits: 2,
+			}
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &ndpDisp,
+				NDPConfigs:       ndpConfigs,
+			}
 
-	// Remove the address. This should stop DAD.
-	if err := s.RemoveAddress(nicID, addr1); err != nil {
-		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
-	}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(opts)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
 
-	// Wait for DAD to fail (since the address was removed during DAD).
-	select {
-	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
-		// If we don't get a failure event after the expected resolution
-		// time + extra 1s buffer, something is wrong.
-		t.Fatal("timed out waiting for DAD failure")
-	case e := <-ndpDisp.dadC:
-		if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
-			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
-		}
-	}
-	addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
-	if err != nil {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
-	}
-	if want := (tcpip.AddressWithPrefix{}); addr != want {
-		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
-	}
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
+
+			// Address should not be considered bound to the NIC yet (DAD ongoing).
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
+
+			test.stopFn(t, s)
+
+			// Wait for DAD to fail (since the address was removed during DAD).
+			select {
+			case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+				// If we don't get a failure event after the expected resolution
+				// time + extra 1s buffer, something is wrong.
+				t.Fatal("timed out waiting for DAD failure")
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			}
+
+			if !test.skipFinalAddrCheck {
+				addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+				if err != nil {
+					t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+				}
+				if want := (tcpip.AddressWithPrefix{}); addr != want {
+					t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+				}
+			}
 
-	// Should not have sent more than 1 NS message.
-	if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
-		t.Fatalf("got NeighborSolicit = %d, want <= 1", got)
+			// Should not have sent more than 1 NS message.
+			if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
+				t.Errorf("got NeighborSolicit = %d, want <= 1", got)
+			}
+		})
 	}
 }
 
@@ -834,7 +935,7 @@ func TestSetNDPConfigurations(t *testing.T) {
 
 // raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options
 // and DHCPv6 configurations specified.
-func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) *stack.PacketBuffer {
 	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length())
 	hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
 	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
@@ -869,14 +970,14 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo
 		DstAddr:       header.IPv6AllNodesMulticastAddress,
 	})
 
-	return tcpip.PacketBuffer{Data: hdr.View().ToVectorisedView()}
+	return &stack.PacketBuffer{Data: hdr.View().ToVectorisedView()}
 }
 
 // raBufWithOpts returns a valid NDP Router Advertisement with options.
 //
 // Note, raBufWithOpts does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) *stack.PacketBuffer {
 	return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer)
 }
 
@@ -885,7 +986,7 @@ func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializ
 //
 // Note, raBufWithDHCPv6 does not populate any of the RA fields other than the
 // DHCPv6 related ones.
-func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) tcpip.PacketBuffer {
+func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) *stack.PacketBuffer {
 	return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfiguratiosns, header.NDPOptionsSerializer{})
 }
 
@@ -893,7 +994,7 @@ func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bo
 //
 // Note, raBuf does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
+func raBuf(ip tcpip.Address, rl uint16) *stack.PacketBuffer {
 	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{})
 }
 
@@ -902,7 +1003,7 @@ func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
 //
 // Note, raBufWithPI does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) tcpip.PacketBuffer {
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) *stack.PacketBuffer {
 	flags := uint8(0)
 	if onLink {
 		// The OnLink flag is the 7th bit in the flags byte.
@@ -950,8 +1051,6 @@ func TestNoRouterDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				routerC: make(chan ndpRouterEvent, 1),
 			}
@@ -990,8 +1089,6 @@ func checkRouterEvent(e ndpRouterEvent, addr tcpip.Address, discovered bool) str
 // TestRouterDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered router when the dispatcher asks it not to.
 func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		routerC: make(chan ndpRouterEvent, 1),
 	}
@@ -1032,8 +1129,6 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 }
 
 func TestRouterDiscovery(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 1),
 		rememberRouter: true,
@@ -1135,8 +1230,6 @@ func TestRouterDiscovery(t *testing.T) {
 // TestRouterDiscoveryMaxRouters tests that only
 // stack.MaxDiscoveredDefaultRouters discovered routers are remembered.
 func TestRouterDiscoveryMaxRouters(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 1),
 		rememberRouter: true,
@@ -1203,8 +1296,6 @@ func TestNoPrefixDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				prefixC: make(chan ndpPrefixEvent, 1),
 			}
@@ -1244,8 +1335,6 @@ func checkPrefixEvent(e ndpPrefixEvent, prefix tcpip.Subnet, discovered bool) st
 // TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered on-link prefix when the dispatcher asks it not to.
 func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
-	t.Parallel()
-
 	prefix, subnet, _ := prefixSubnetAddr(0, "")
 
 	ndpDisp := ndpDispatcher{
@@ -1289,8 +1378,6 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 }
 
 func TestPrefixDiscovery(t *testing.T) {
-	t.Parallel()
-
 	prefix1, subnet1, _ := prefixSubnetAddr(0, "")
 	prefix2, subnet2, _ := prefixSubnetAddr(1, "")
 	prefix3, subnet3, _ := prefixSubnetAddr(2, "")
@@ -1479,8 +1566,6 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 // TestPrefixDiscoveryMaxRouters tests that only
 // stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
 func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
-	t.Parallel()
-
 	ndpDisp := ndpDispatcher{
 		prefixC:        make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
 		rememberPrefix: true,
@@ -1575,8 +1660,6 @@ func TestNoAutoGenAddr(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
 			}
@@ -1718,6 +1801,935 @@ func TestAutoGenAddr(t *testing.T) {
 	}
 }
 
+func addressCheck(addrs []tcpip.ProtocolAddress, containList, notContainList []tcpip.AddressWithPrefix) string {
+	ret := ""
+	for _, c := range containList {
+		if !containsV6Addr(addrs, c) {
+			ret += fmt.Sprintf("should have %s in the list of addresses\n", c)
+		}
+	}
+	for _, c := range notContainList {
+		if containsV6Addr(addrs, c) {
+			ret += fmt.Sprintf("should not have %s in the list of addresses\n", c)
+		}
+	}
+	return ret
+}
+
+// TestAutoGenTempAddr tests that temporary SLAAC addresses are generated when
+// configured to do so as part of IPv6 Privacy Extensions.
+func TestAutoGenTempAddr(t *testing.T) {
+	const (
+		nicID            = 1
+		newMinVL         = 5
+		newMinVLDuration = newMinVL * time.Second
+	)
+
+	savedMinPrefixInformationValidLifetimeForUpdate := stack.MinPrefixInformationValidLifetimeForUpdate
+	savedMaxDesync := stack.MaxDesyncFactor
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = savedMinPrefixInformationValidLifetimeForUpdate
+		stack.MaxDesyncFactor = savedMaxDesync
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+	stack.MaxDesyncFactor = time.Nanosecond
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	tests := []struct {
+		name             string
+		dupAddrTransmits uint8
+		retransmitTimer  time.Duration
+	}{
+		{
+			name: "DAD disabled",
+		},
+		{
+			name:             "DAD enabled",
+			dupAddrTransmits: 1,
+			retransmitTimer:  time.Second,
+		},
+	}
+
+	// This Run will not return until the parallel tests finish.
+	//
+	// We need this because we need to do some teardown work after the
+	// parallel tests complete.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for i, test := range tests {
+			i := i
+			test := test
+
+			t.Run(test.name, func(t *testing.T) {
+				t.Parallel()
+
+				seed := []byte{uint8(i)}
+				var tempIIDHistory [header.IIDSize]byte
+				header.InitialTempIID(tempIIDHistory[:], seed, nicID)
+				newTempAddr := func(stableAddr tcpip.Address) tcpip.AddressWithPrefix {
+					return header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], stableAddr)
+				}
+
+				ndpDisp := ndpDispatcher{
+					dadC:         make(chan ndpDADEvent, 2),
+					autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+				}
+				e := channel.New(0, 1280, linkAddr1)
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						DupAddrDetectTransmits:     test.dupAddrTransmits,
+						RetransmitTimer:            test.retransmitTimer,
+						HandleRAs:                  true,
+						AutoGenGlobalAddresses:     true,
+						AutoGenTempGlobalAddresses: true,
+					},
+					NDPDisp:     &ndpDisp,
+					TempIIDSeed: seed,
+				})
+
+				if err := s.CreateNIC(nicID, e); err != nil {
+					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+				}
+
+				expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+					t.Helper()
+
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+					default:
+						t.Fatal("expected addr auto gen event")
+					}
+				}
+
+				expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+					t.Helper()
+
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+					case <-time.After(defaultAsyncEventTimeout):
+						t.Fatal("timed out waiting for addr auto gen event")
+					}
+				}
+
+				expectDADEventAsync := func(addr tcpip.Address) {
+					t.Helper()
+
+					select {
+					case e := <-ndpDisp.dadC:
+						if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" {
+							t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+						}
+					case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncEventTimeout):
+						t.Fatal("timed out waiting for DAD event")
+					}
+				}
+
+				// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+				// with zero valid lifetime.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0))
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly auto-generated an address with 0 lifetime; event = %+v", e)
+				default:
+				}
+
+				// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+				// with non-zero valid lifetime.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+				expectAutoGenAddrEvent(addr1, newAddr)
+				expectDADEventAsync(addr1.Address)
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly got an auto gen addr event = %+v", e)
+				default:
+				}
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+				// with non-zero valid & preferred lifetimes.
+				tempAddr1 := newTempAddr(addr1.Address)
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+				expectAutoGenAddrEvent(tempAddr1, newAddr)
+				expectDADEventAsync(tempAddr1.Address)
+				if mismatch := addressCheck(s.NICInfo()[1].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Receive an RA with prefix2 in an NDP Prefix Information option (PI)
+				// with preferred lifetime > valid lifetime
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6))
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly auto-generated an address with preferred lifetime > valid lifetime; event = %+v", e)
+				default:
+				}
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Receive an RA with prefix2 in a PI w/ non-zero valid and preferred
+				// lifetimes.
+				tempAddr2 := newTempAddr(addr2.Address)
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+				expectAutoGenAddrEvent(addr2, newAddr)
+				expectDADEventAsync(addr2.Address)
+				expectAutoGenAddrEventAsync(tempAddr2, newAddr)
+				expectDADEventAsync(tempAddr2.Address)
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Deprecate prefix1.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+				expectAutoGenAddrEvent(addr1, deprecatedAddr)
+				expectAutoGenAddrEvent(tempAddr1, deprecatedAddr)
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Refresh lifetimes for prefix1.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Reduce valid lifetime and deprecate addresses of prefix1.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+				expectAutoGenAddrEvent(addr1, deprecatedAddr)
+				expectAutoGenAddrEvent(tempAddr1, deprecatedAddr)
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Wait for addrs of prefix1 to be invalidated. They should be
+				// invalidated at the same time.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					var nextAddr tcpip.AddressWithPrefix
+					if e.addr == addr1 {
+						if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+						nextAddr = tempAddr1
+					} else {
+						if diff := checkAutoGenAddrEvent(e, tempAddr1, invalidatedAddr); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+						nextAddr = addr1
+					}
+
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						if diff := checkAutoGenAddrEvent(e, nextAddr, invalidatedAddr); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+					case <-time.After(defaultTimeout):
+						t.Fatal("timed out waiting for addr auto gen event")
+					}
+				case <-time.After(newMinVLDuration + defaultTimeout):
+					t.Fatal("timed out waiting for addr auto gen event")
+				}
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Receive an RA with prefix2 in a PI w/ 0 lifetimes.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 0, 0))
+				expectAutoGenAddrEvent(addr2, deprecatedAddr)
+				expectAutoGenAddrEvent(tempAddr2, deprecatedAddr)
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Errorf("got unexpected auto gen addr event = %+v", e)
+				default:
+				}
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+			})
+		}
+	})
+}
+
+// TestNoAutoGenTempAddrForLinkLocal test that temporary SLAAC addresses are not
+// generated for auto generated link-local addresses.
+func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) {
+	const nicID = 1
+
+	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesyncFactor
+	}()
+	stack.MaxDesyncFactor = time.Nanosecond
+
+	tests := []struct {
+		name             string
+		dupAddrTransmits uint8
+		retransmitTimer  time.Duration
+	}{
+		{
+			name: "DAD disabled",
+		},
+		{
+			name:             "DAD enabled",
+			dupAddrTransmits: 1,
+			retransmitTimer:  time.Second,
+		},
+	}
+
+	// This Run will not return until the parallel tests finish.
+	//
+	// We need this because we need to do some teardown work after the
+	// parallel tests complete.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for _, test := range tests {
+			test := test
+
+			t.Run(test.name, func(t *testing.T) {
+				t.Parallel()
+
+				ndpDisp := ndpDispatcher{
+					dadC:         make(chan ndpDADEvent, 1),
+					autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+				}
+				e := channel.New(0, 1280, linkAddr1)
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						AutoGenTempGlobalAddresses: true,
+					},
+					NDPDisp:              &ndpDisp,
+					AutoGenIPv6LinkLocal: true,
+				})
+
+				if err := s.CreateNIC(nicID, e); err != nil {
+					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+				}
+
+				// The stable link-local address should auto-generate and resolve DAD.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, tcpip.AddressWithPrefix{Address: llAddr1, PrefixLen: header.IIDOffsetInIPv6Address * 8}, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+				select {
+				case e := <-ndpDisp.dadC:
+					if diff := checkDADEvent(e, nicID, llAddr1, true, nil); diff != "" {
+						t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncEventTimeout):
+					t.Fatal("timed out waiting for DAD event")
+				}
+
+				// No new addresses should be generated.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Errorf("got unxpected auto gen addr event = %+v", e)
+				case <-time.After(defaultAsyncEventTimeout):
+				}
+			})
+		}
+	})
+}
+
+// TestNoAutoGenTempAddrWithoutStableAddr tests that a temporary SLAAC address
+// will not be generated until after DAD completes, even if a new Router
+// Advertisement is received to refresh lifetimes.
+func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) {
+	const (
+		nicID           = 1
+		dadTransmits    = 1
+		retransmitTimer = 2 * time.Second
+	)
+
+	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesyncFactor
+	}()
+	stack.MaxDesyncFactor = 0
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+	var tempIIDHistory [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistory[:], nil, nicID)
+	tempAddr := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+
+	ndpDisp := ndpDispatcher{
+		dadC:         make(chan ndpDADEvent, 1),
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			DupAddrDetectTransmits:     dadTransmits,
+			RetransmitTimer:            retransmitTimer,
+			HandleRAs:                  true,
+			AutoGenGlobalAddresses:     true,
+			AutoGenTempGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	// Receive an RA to trigger SLAAC for prefix.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected addr auto gen event")
+	}
+
+	// DAD on the stable address for prefix has not yet completed. Receiving a new
+	// RA that would refresh lifetimes should not generate a temporary SLAAC
+	// address for the prefix.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		t.Fatalf("unexpected auto gen addr event = %+v", e)
+	default:
+	}
+
+	// Wait for DAD to complete for the stable address then expect the temporary
+	// address to be generated.
+	select {
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, addr.Address, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+		t.Fatal("timed out waiting for DAD event")
+	}
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, tempAddr, newAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(defaultAsyncEventTimeout):
+		t.Fatal("timed out waiting for addr auto gen event")
+	}
+}
+
+// TestAutoGenTempAddrRegen tests that temporary SLAAC addresses are
+// regenerated.
+func TestAutoGenTempAddrRegen(t *testing.T) {
+	const (
+		nicID            = 1
+		regenAfter       = 2 * time.Second
+		newMinVL         = 10
+		newMinVLDuration = newMinVL * time.Second
+	)
+
+	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	savedMinMaxTempAddrPreferredLifetime := stack.MinMaxTempAddrPreferredLifetime
+	savedMinMaxTempAddrValidLifetime := stack.MinMaxTempAddrValidLifetime
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesyncFactor
+		stack.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime
+		stack.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime
+	}()
+	stack.MaxDesyncFactor = 0
+	stack.MinMaxTempAddrPreferredLifetime = newMinVLDuration
+	stack.MinMaxTempAddrValidLifetime = newMinVLDuration
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+	var tempIIDHistory [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistory[:], nil, nicID)
+	tempAddr1 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+	tempAddr2 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+	tempAddr3 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	ndpConfigs := stack.NDPConfigurations{
+		HandleRAs:                  true,
+		AutoGenGlobalAddresses:     true,
+		AutoGenTempGlobalAddresses: true,
+		RegenAdvanceDuration:       newMinVLDuration - regenAfter,
+	}
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:       ndpConfigs,
+		NDPDisp:          &ndpDisp,
+	})
+
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with non-zero valid & preferred lifetimes.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr, newAddr)
+	expectAutoGenAddrEvent(tempAddr1, newAddr)
+	if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1}, nil); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+
+	// Wait for regeneration
+	expectAutoGenAddrEventAsync(tempAddr2, newAddr, regenAfter+defaultAsyncEventTimeout)
+	if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1, tempAddr2}, nil); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+
+	// Wait for regeneration
+	expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncEventTimeout)
+	if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1, tempAddr2, tempAddr3}, nil); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+
+	// Stop generating temporary addresses
+	ndpConfigs.AutoGenTempGlobalAddresses = false
+	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
+		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+	}
+
+	// Wait for all the temporary addresses to get invalidated.
+	tempAddrs := []tcpip.AddressWithPrefix{tempAddr1, tempAddr2, tempAddr3}
+	invalidateAfter := newMinVLDuration - 2*regenAfter
+	for _, addr := range tempAddrs {
+		// Wait for a deprecation then invalidation event, or just an invalidation
+		// event. We need to cover both cases but cannot deterministically hit both
+		// cases because the deprecation and invalidation timers could fire in any
+		// order.
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, deprecatedAddr); diff == "" {
+				// If we get a deprecation event first, we should get an invalidation
+				// event almost immediately after.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(defaultAsyncEventTimeout):
+					t.Fatal("timed out waiting for addr auto gen event")
+				}
+			} else if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff == "" {
+				// If we get an invalidation event first, we shouldn't get a deprecation
+				// event after.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly got an auto-generated event = %+v", e)
+				case <-time.After(defaultTimeout):
+				}
+			} else {
+				t.Fatalf("got unexpected auto-generated event = %+v", e)
+			}
+		case <-time.After(invalidateAfter + defaultAsyncEventTimeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+
+		invalidateAfter = regenAfter
+	}
+	if mismatch := addressCheck(s.NICInfo()[1].ProtocolAddresses, []tcpip.AddressWithPrefix{addr}, tempAddrs); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+}
+
+// TestAutoGenTempAddrRegenTimerUpdates tests that a temporary address's
+// regeneration timer gets updated when refreshing the address's lifetimes.
+func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) {
+	const (
+		nicID            = 1
+		regenAfter       = 2 * time.Second
+		newMinVL         = 10
+		newMinVLDuration = newMinVL * time.Second
+	)
+
+	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	savedMinMaxTempAddrPreferredLifetime := stack.MinMaxTempAddrPreferredLifetime
+	savedMinMaxTempAddrValidLifetime := stack.MinMaxTempAddrValidLifetime
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesyncFactor
+		stack.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime
+		stack.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime
+	}()
+	stack.MaxDesyncFactor = 0
+	stack.MinMaxTempAddrPreferredLifetime = newMinVLDuration
+	stack.MinMaxTempAddrValidLifetime = newMinVLDuration
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+	var tempIIDHistory [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistory[:], nil, nicID)
+	tempAddr1 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+	tempAddr2 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+	tempAddr3 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	ndpConfigs := stack.NDPConfigurations{
+		HandleRAs:                  true,
+		AutoGenGlobalAddresses:     true,
+		AutoGenTempGlobalAddresses: true,
+		RegenAdvanceDuration:       newMinVLDuration - regenAfter,
+	}
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:       ndpConfigs,
+		NDPDisp:          &ndpDisp,
+	})
+
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with non-zero valid & preferred lifetimes.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr, newAddr)
+	expectAutoGenAddrEvent(tempAddr1, newAddr)
+	if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1}, nil); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+
+	// Deprecate the prefix.
+	//
+	// A new temporary address should be generated after the regeneration
+	// time has passed since the prefix is deprecated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr, deprecatedAddr)
+	expectAutoGenAddrEvent(tempAddr1, deprecatedAddr)
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		t.Fatalf("unexpected auto gen addr event = %+v", e)
+	case <-time.After(regenAfter + defaultAsyncEventTimeout):
+	}
+
+	// Prefer the prefix again.
+	//
+	// A new temporary address should immediately be generated since the
+	// regeneration time has already passed since the last address was generated
+	// - this regeneration does not depend on a timer.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	expectAutoGenAddrEvent(tempAddr2, newAddr)
+
+	// Increase the maximum lifetimes for temporary addresses to large values
+	// then refresh the lifetimes of the prefix.
+	//
+	// A new address should not be generated after the regeneration time that was
+	// expected for the previous check. This is because the preferred lifetime for
+	// the temporary addresses has increased, so it will take more time to
+	// regenerate a new temporary address. Note, new addresses are only
+	// regenerated after the preferred lifetime - the regenerate advance duration
+	// as paased.
+	ndpConfigs.MaxTempAddrValidLifetime = 100 * time.Second
+	ndpConfigs.MaxTempAddrPreferredLifetime = 100 * time.Second
+	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
+		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+	}
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		t.Fatalf("unexpected auto gen addr event = %+v", e)
+	case <-time.After(regenAfter + defaultAsyncEventTimeout):
+	}
+
+	// Set the maximum lifetimes for temporary addresses such that on the next
+	// RA, the regeneration timer gets reset.
+	//
+	// The maximum lifetime is the sum of the minimum lifetimes for temporary
+	// addresses + the time that has already passed since the last address was
+	// generated so that the regeneration timer is needed to generate the next
+	// address.
+	newLifetimes := newMinVLDuration + regenAfter + defaultAsyncEventTimeout
+	ndpConfigs.MaxTempAddrValidLifetime = newLifetimes
+	ndpConfigs.MaxTempAddrPreferredLifetime = newLifetimes
+	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
+		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+	}
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncEventTimeout)
+}
+
+// TestMixedSLAACAddrConflictRegen tests SLAAC address regeneration in response
+// to a mix of DAD conflicts and NIC-local conflicts.
+func TestMixedSLAACAddrConflictRegen(t *testing.T) {
+	const (
+		nicID           = 1
+		nicName         = "nic"
+		lifetimeSeconds = 9999
+		// From stack.maxSLAACAddrLocalRegenAttempts
+		maxSLAACAddrLocalRegenAttempts = 10
+		// We use 2 more addreses than the maximum local regeneration attempts
+		// because we want to also trigger regeneration in response to a DAD
+		// conflicts for this test.
+		maxAddrs         = maxSLAACAddrLocalRegenAttempts + 2
+		dupAddrTransmits = 1
+		retransmitTimer  = time.Second
+	)
+
+	var tempIIDHistoryWithModifiedEUI64 [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistoryWithModifiedEUI64[:], nil, nicID)
+
+	var tempIIDHistoryWithOpaqueIID [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistoryWithOpaqueIID[:], nil, nicID)
+
+	prefix, subnet, stableAddrWithModifiedEUI64 := prefixSubnetAddr(0, linkAddr1)
+	var stableAddrsWithOpaqueIID [maxAddrs]tcpip.AddressWithPrefix
+	var tempAddrsWithOpaqueIID [maxAddrs]tcpip.AddressWithPrefix
+	var tempAddrsWithModifiedEUI64 [maxAddrs]tcpip.AddressWithPrefix
+	addrBytes := []byte(subnet.ID())
+	for i := 0; i < maxAddrs; i++ {
+		stableAddrsWithOpaqueIID[i] = tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, uint8(i), nil)),
+			PrefixLen: header.IIDOffsetInIPv6Address * 8,
+		}
+		// When generating temporary addresses, the resolved stable address for the
+		// SLAAC prefix will be the first address stable address generated for the
+		// prefix as we will not simulate address conflicts for the stable addresses
+		// in tests involving temporary addresses. Address conflicts for stable
+		// addresses will be done in their own tests.
+		tempAddrsWithOpaqueIID[i] = header.GenerateTempIPv6SLAACAddr(tempIIDHistoryWithOpaqueIID[:], stableAddrsWithOpaqueIID[0].Address)
+		tempAddrsWithModifiedEUI64[i] = header.GenerateTempIPv6SLAACAddr(tempIIDHistoryWithModifiedEUI64[:], stableAddrWithModifiedEUI64.Address)
+	}
+
+	tests := []struct {
+		name          string
+		addrs         []tcpip.AddressWithPrefix
+		tempAddrs     bool
+		initialExpect tcpip.AddressWithPrefix
+		nicNameFromID func(tcpip.NICID, string) string
+	}{
+		{
+			name:  "Stable addresses with opaque IIDs",
+			addrs: stableAddrsWithOpaqueIID[:],
+			nicNameFromID: func(tcpip.NICID, string) string {
+				return nicName
+			},
+		},
+		{
+			name:          "Temporary addresses with opaque IIDs",
+			addrs:         tempAddrsWithOpaqueIID[:],
+			tempAddrs:     true,
+			initialExpect: stableAddrsWithOpaqueIID[0],
+			nicNameFromID: func(tcpip.NICID, string) string {
+				return nicName
+			},
+		},
+		{
+			name:          "Temporary addresses with modified EUI64",
+			addrs:         tempAddrsWithModifiedEUI64[:],
+			tempAddrs:     true,
+			initialExpect: stableAddrWithModifiedEUI64,
+		},
+	}
+
+	for _, test := range tests {
+		test := test
+
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			ndpConfigs := stack.NDPConfigurations{
+				HandleRAs:                     true,
+				AutoGenGlobalAddresses:        true,
+				AutoGenTempGlobalAddresses:    test.tempAddrs,
+				AutoGenAddressConflictRetries: 1,
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NDPConfigs:         ndpConfigs,
+				NDPDisp:            &ndpDisp,
+				OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+					NICNameFromID: test.nicNameFromID,
+				},
+			})
+
+			s.SetRouteTable([]tcpip.Route{{
+				Destination: header.IPv6EmptySubnet,
+				Gateway:     llAddr2,
+				NIC:         nicID,
+			}})
+
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			for j := 0; j < len(test.addrs)-1; j++ {
+				// The NIC will not attempt to generate an address in response to a
+				// NIC-local conflict after some maximum number of attempts. We skip
+				// creating a conflict for the address that would be generated as part
+				// of the last attempt so we can simulate a DAD conflict for this
+				// address and restart the NIC-local generation process.
+				if j == maxSLAACAddrLocalRegenAttempts-1 {
+					continue
+				}
+
+				if err := s.AddAddress(nicID, ipv6.ProtocolNumber, test.addrs[j].Address); err != nil {
+					t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, test.addrs[j].Address, err)
+				}
+			}
+
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+			}
+
+			expectAutoGenAddrAsyncEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(defaultAsyncEventTimeout):
+					t.Fatal("timed out waiting for addr auto gen event")
+				}
+			}
+
+			expectDADEventAsync := func(addr tcpip.Address) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.dadC:
+					if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" {
+						t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(dupAddrTransmits*retransmitTimer + defaultAsyncEventTimeout):
+					t.Fatal("timed out waiting for DAD event")
+				}
+			}
+
+			// Enable DAD.
+			ndpDisp.dadC = make(chan ndpDADEvent, 2)
+			ndpConfigs.DupAddrDetectTransmits = dupAddrTransmits
+			ndpConfigs.RetransmitTimer = retransmitTimer
+			if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
+				t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+			}
+
+			// Do SLAAC for prefix.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+			if test.initialExpect != (tcpip.AddressWithPrefix{}) {
+				expectAutoGenAddrEvent(test.initialExpect, newAddr)
+				expectDADEventAsync(test.initialExpect.Address)
+			}
+
+			// The last local generation attempt should succeed, but we introduce a
+			// DAD failure to restart the local generation process.
+			addr := test.addrs[maxSLAACAddrLocalRegenAttempts-1]
+			expectAutoGenAddrAsyncEvent(addr, newAddr)
+			if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+				t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+			}
+			select {
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			default:
+				t.Fatal("expected DAD event")
+			}
+			expectAutoGenAddrEvent(addr, invalidatedAddr)
+
+			// The last address generated should resolve DAD.
+			addr = test.addrs[len(test.addrs)-1]
+			expectAutoGenAddrAsyncEvent(addr, newAddr)
+			expectDADEventAsync(addr.Address)
+
+			select {
+			case e := <-ndpDisp.autoGenAddrC:
+				t.Fatalf("unexpected auto gen addr event = %+v", e)
+			default:
+			}
+		})
+	}
+}
+
 // stackAndNdpDispatcherWithDefaultRoute returns an ndpDispatcher,
 // channel.Endpoint and stack.Stack.
 //
@@ -1901,7 +2913,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	// addr2 is deprecated but if explicitly requested, it should be used.
 	fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
 	}
 
 	// Another PI w/ 0 preferred lifetime should not result in a deprecation
@@ -1914,7 +2926,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
 	}
 	expectPrimaryAddr(addr1)
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
 	}
 
 	// Refresh lifetimes of addr generated from prefix2.
@@ -2026,7 +3038,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	// addr1 is deprecated but if explicitly requested, it should be used.
 	fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
 	}
 
 	// Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
@@ -2039,7 +3051,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	}
 	expectPrimaryAddr(addr2)
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
 	}
 
 	// Refresh lifetimes for addr of prefix1.
@@ -2063,7 +3075,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 	// addr2 should be the primary endpoint now since it is not deprecated.
 	expectPrimaryAddr(addr2)
 	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
-		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address)
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
 	}
 
 	// Wait for addr of prefix1 to be invalidated.
@@ -2113,7 +3125,6 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) {
 		} else {
 			t.Fatalf("got unexpected auto-generated event")
 		}
-
 	case <-time.After(newMinVLDuration + defaultAsyncEventTimeout):
 		t.Fatal("timed out waiting for addr auto gen event")
 	}
@@ -2326,8 +3337,6 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 		},
 	}
 
-	const delta = 500 * time.Millisecond
-
 	// This Run will not return until the parallel tests finish.
 	//
 	// We need this because we need to do some teardown work after the
@@ -2380,24 +3389,21 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 				// to test.evl.
 				//
 
-				// Make sure we do not get any invalidation
-				// events until atleast 500ms (delta) before
-				// test.evl.
+				// The address should not be invalidated until the effective valid
+				// lifetime has passed.
 				select {
 				case <-ndpDisp.autoGenAddrC:
 					t.Fatal("unexpectedly received an auto gen addr event")
-				case <-time.After(time.Duration(test.evl)*time.Second - delta):
+				case <-time.After(time.Duration(test.evl)*time.Second - defaultAsyncEventTimeout):
 				}
 
-				// Wait for another second (2x delta), but now
-				// we expect the invalidation event.
+				// Wait for the invalidation event.
 				select {
 				case e := <-ndpDisp.autoGenAddrC:
 					if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
 						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
 					}
-
-				case <-time.After(2 * delta):
+				case <-time.After(2 * defaultAsyncEventTimeout):
 					t.Fatal("timeout waiting for addr auto gen event")
 				}
 			})
@@ -2409,8 +3415,6 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
 // by the user, its resources will be cleaned up and an invalidation event will
 // be sent to the integrator.
 func TestAutoGenAddrRemoval(t *testing.T) {
-	t.Parallel()
-
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 
 	ndpDisp := ndpDispatcher{
@@ -2467,8 +3471,6 @@ func TestAutoGenAddrRemoval(t *testing.T) {
 // TestAutoGenAddrAfterRemoval tests adding a SLAAC address that was previously
 // assigned to the NIC but is in the permanentExpired state.
 func TestAutoGenAddrAfterRemoval(t *testing.T) {
-	t.Parallel()
-
 	const nicID = 1
 
 	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
@@ -2515,7 +3517,7 @@ func TestAutoGenAddrAfterRemoval(t *testing.T) {
 		AddressWithPrefix: addr2,
 	}
 	if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
-		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d, %s) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
+		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
 	}
 	// addr2 should be more preferred now since it is at the front of the primary
 	// list.
@@ -2580,8 +3582,6 @@ func TestAutoGenAddrAfterRemoval(t *testing.T) {
 // TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
 // is already assigned to the NIC, the static address remains.
 func TestAutoGenAddrStaticConflict(t *testing.T) {
-	t.Parallel()
-
 	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
 
 	ndpDisp := ndpDispatcher{
@@ -2637,8 +3637,6 @@ func TestAutoGenAddrStaticConflict(t *testing.T) {
 // TestAutoGenAddrWithOpaqueIID tests that SLAAC generated addresses will use
 // opaque interface identifiers when configured to do so.
 func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
-	t.Parallel()
-
 	const nicID = 1
 	const nicName = "nic1"
 	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
@@ -2738,12 +3736,519 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
 	}
 }
 
+func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
+	const nicID = 1
+	const nicName = "nic"
+	const dadTransmits = 1
+	const retransmitTimer = time.Second
+	const maxMaxRetries = 3
+	const lifetimeSeconds = 10
+
+	// Needed for the temporary address sub test.
+	savedMaxDesync := stack.MaxDesyncFactor
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesync
+	}()
+	stack.MaxDesyncFactor = time.Nanosecond
+
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
+	secretKey := secretKeyBuf[:]
+	n, err := rand.Read(secretKey)
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes)
+	}
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	addrForSubnet := func(subnet tcpip.Subnet, dadCounter uint8) tcpip.AddressWithPrefix {
+		addrBytes := []byte(subnet.ID())
+		return tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, dadCounter, secretKey)),
+			PrefixLen: 64,
+		}
+	}
+
+	expectAutoGenAddrEvent := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectAutoGenAddrEventAsync := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(defaultAsyncEventTimeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+	}
+
+	expectDADEvent := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.Address, resolved bool) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.dadC:
+			if diff := checkDADEvent(e, nicID, addr, resolved, nil); diff != "" {
+				t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected DAD event")
+		}
+	}
+
+	expectDADEventAsync := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.Address, resolved bool) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.dadC:
+			if diff := checkDADEvent(e, nicID, addr, resolved, nil); diff != "" {
+				t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+			t.Fatal("timed out waiting for DAD event")
+		}
+	}
+
+	stableAddrForTempAddrTest := addrForSubnet(subnet, 0)
+
+	addrTypes := []struct {
+		name             string
+		ndpConfigs       stack.NDPConfigurations
+		autoGenLinkLocal bool
+		prepareFn        func(t *testing.T, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix
+		addrGenFn        func(dadCounter uint8, tempIIDHistory []byte) tcpip.AddressWithPrefix
+	}{
+		{
+			name: "Global address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits: dadTransmits,
+				RetransmitTimer:        retransmitTimer,
+				HandleRAs:              true,
+				AutoGenGlobalAddresses: true,
+			},
+			prepareFn: func(_ *testing.T, _ *ndpDispatcher, e *channel.Endpoint, _ []byte) []tcpip.AddressWithPrefix {
+				// Receive an RA with prefix1 in a PI.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+				return nil
+
+			},
+			addrGenFn: func(dadCounter uint8, _ []byte) tcpip.AddressWithPrefix {
+				return addrForSubnet(subnet, dadCounter)
+			},
+		},
+		{
+			name: "LinkLocal address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits: dadTransmits,
+				RetransmitTimer:        retransmitTimer,
+			},
+			autoGenLinkLocal: true,
+			prepareFn: func(*testing.T, *ndpDispatcher, *channel.Endpoint, []byte) []tcpip.AddressWithPrefix {
+				return nil
+			},
+			addrGenFn: func(dadCounter uint8, _ []byte) tcpip.AddressWithPrefix {
+				return addrForSubnet(header.IPv6LinkLocalPrefix.Subnet(), dadCounter)
+			},
+		},
+		{
+			name: "Temporary address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits:     dadTransmits,
+				RetransmitTimer:            retransmitTimer,
+				HandleRAs:                  true,
+				AutoGenGlobalAddresses:     true,
+				AutoGenTempGlobalAddresses: true,
+			},
+			prepareFn: func(t *testing.T, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix {
+				header.InitialTempIID(tempIIDHistory, nil, nicID)
+
+				// Generate a stable SLAAC address so temporary addresses will be
+				// generated.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+				expectAutoGenAddrEvent(t, ndpDisp, stableAddrForTempAddrTest, newAddr)
+				expectDADEventAsync(t, ndpDisp, stableAddrForTempAddrTest.Address, true)
+
+				// The stable address will be assigned throughout the test.
+				return []tcpip.AddressWithPrefix{stableAddrForTempAddrTest}
+			},
+			addrGenFn: func(_ uint8, tempIIDHistory []byte) tcpip.AddressWithPrefix {
+				return header.GenerateTempIPv6SLAACAddr(tempIIDHistory, stableAddrForTempAddrTest.Address)
+			},
+		},
+	}
+
+	for _, addrType := range addrTypes {
+		// This Run will not return until the parallel tests finish.
+		//
+		// We need this because we need to do some teardown work after the parallel
+		// tests complete and limit the number of parallel tests running at the same
+		// time to reduce flakes.
+		//
+		// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+		// more details.
+		t.Run(addrType.name, func(t *testing.T) {
+			for maxRetries := uint8(0); maxRetries <= maxMaxRetries; maxRetries++ {
+				for numFailures := uint8(0); numFailures <= maxRetries+1; numFailures++ {
+					maxRetries := maxRetries
+					numFailures := numFailures
+					addrType := addrType
+
+					t.Run(fmt.Sprintf("%d max retries and %d failures", maxRetries, numFailures), func(t *testing.T) {
+						t.Parallel()
+
+						ndpDisp := ndpDispatcher{
+							dadC:         make(chan ndpDADEvent, 1),
+							autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+						}
+						e := channel.New(0, 1280, linkAddr1)
+						ndpConfigs := addrType.ndpConfigs
+						ndpConfigs.AutoGenAddressConflictRetries = maxRetries
+						s := stack.New(stack.Options{
+							NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+							AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
+							NDPConfigs:           ndpConfigs,
+							NDPDisp:              &ndpDisp,
+							OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+								NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+									return nicName
+								},
+								SecretKey: secretKey,
+							},
+						})
+						opts := stack.NICOptions{Name: nicName}
+						if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
+							t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
+						}
+
+						var tempIIDHistory [header.IIDSize]byte
+						stableAddrs := addrType.prepareFn(t, &ndpDisp, e, tempIIDHistory[:])
+
+						// Simulate DAD conflicts so the address is regenerated.
+						for i := uint8(0); i < numFailures; i++ {
+							addr := addrType.addrGenFn(i, tempIIDHistory[:])
+							expectAutoGenAddrEventAsync(t, &ndpDisp, addr, newAddr)
+
+							// Should not have any new addresses assigned to the NIC.
+							if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, stableAddrs, nil); mismatch != "" {
+								t.Fatal(mismatch)
+							}
+
+							// Simulate a DAD conflict.
+							if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+								t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+							}
+							expectAutoGenAddrEvent(t, &ndpDisp, addr, invalidatedAddr)
+							expectDADEvent(t, &ndpDisp, addr.Address, false)
+
+							// Attempting to add the address manually should not fail if the
+							// address's state was cleaned up when DAD failed.
+							if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr.Address); err != nil {
+								t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr.Address, err)
+							}
+							if err := s.RemoveAddress(nicID, addr.Address); err != nil {
+								t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr.Address, err)
+							}
+							expectDADEvent(t, &ndpDisp, addr.Address, false)
+						}
+
+						// Should not have any new addresses assigned to the NIC.
+						if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, stableAddrs, nil); mismatch != "" {
+							t.Fatal(mismatch)
+						}
+
+						// If we had less failures than generation attempts, we should have
+						// an address after DAD resolves.
+						if maxRetries+1 > numFailures {
+							addr := addrType.addrGenFn(numFailures, tempIIDHistory[:])
+							expectAutoGenAddrEventAsync(t, &ndpDisp, addr, newAddr)
+							expectDADEventAsync(t, &ndpDisp, addr.Address, true)
+							if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, append(stableAddrs, addr), nil); mismatch != "" {
+								t.Fatal(mismatch)
+							}
+						}
+
+						// Should not attempt address generation again.
+						select {
+						case e := <-ndpDisp.autoGenAddrC:
+							t.Fatalf("unexpectedly got an auto-generated address event = %+v", e)
+						case <-time.After(defaultAsyncEventTimeout):
+						}
+					})
+				}
+			}
+		})
+	}
+}
+
+// TestAutoGenAddrWithEUI64IIDNoDADRetries tests that a regeneration attempt is
+// not made for SLAAC addresses generated with an IID based on the NIC's link
+// address.
+func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) {
+	const nicID = 1
+	const dadTransmits = 1
+	const retransmitTimer = time.Second
+	const maxRetries = 3
+	const lifetimeSeconds = 10
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	addrTypes := []struct {
+		name             string
+		ndpConfigs       stack.NDPConfigurations
+		autoGenLinkLocal bool
+		subnet           tcpip.Subnet
+		triggerSLAACFn   func(e *channel.Endpoint)
+	}{
+		{
+			name: "Global address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits:        dadTransmits,
+				RetransmitTimer:               retransmitTimer,
+				HandleRAs:                     true,
+				AutoGenGlobalAddresses:        true,
+				AutoGenAddressConflictRetries: maxRetries,
+			},
+			subnet: subnet,
+			triggerSLAACFn: func(e *channel.Endpoint) {
+				// Receive an RA with prefix1 in a PI.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+
+			},
+		},
+		{
+			name: "LinkLocal address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits:        dadTransmits,
+				RetransmitTimer:               retransmitTimer,
+				AutoGenAddressConflictRetries: maxRetries,
+			},
+			autoGenLinkLocal: true,
+			subnet:           header.IPv6LinkLocalPrefix.Subnet(),
+			triggerSLAACFn:   func(e *channel.Endpoint) {},
+		},
+	}
+
+	for _, addrType := range addrTypes {
+		addrType := addrType
+
+		t.Run(addrType.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				dadC:         make(chan ndpDADEvent, 1),
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
+				NDPConfigs:           addrType.ndpConfigs,
+				NDPDisp:              &ndpDisp,
+			})
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+			}
+
+			addrType.triggerSLAACFn(e)
+
+			addrBytes := []byte(addrType.subnet.ID())
+			header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr1, addrBytes[header.IIDOffsetInIPv6Address:])
+			addr := tcpip.AddressWithPrefix{
+				Address:   tcpip.Address(addrBytes),
+				PrefixLen: 64,
+			}
+			expectAutoGenAddrEvent(addr, newAddr)
+
+			// Simulate a DAD conflict.
+			if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+				t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+			}
+			expectAutoGenAddrEvent(addr, invalidatedAddr)
+			select {
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			default:
+				t.Fatal("expected DAD event")
+			}
+
+			// Should not attempt address regeneration.
+			select {
+			case e := <-ndpDisp.autoGenAddrC:
+				t.Fatalf("unexpectedly got an auto-generated address event = %+v", e)
+			case <-time.After(defaultAsyncEventTimeout):
+			}
+		})
+	}
+}
+
+// TestAutoGenAddrContinuesLifetimesAfterRetry tests that retrying address
+// generation in response to DAD conflicts does not refresh the lifetimes.
+func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) {
+	const nicID = 1
+	const nicName = "nic"
+	const dadTransmits = 1
+	const retransmitTimer = 2 * time.Second
+	const failureTimer = time.Second
+	const maxRetries = 1
+	const lifetimeSeconds = 5
+
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
+	secretKey := secretKeyBuf[:]
+	n, err := rand.Read(secretKey)
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes)
+	}
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		dadC:         make(chan ndpDADEvent, 1),
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			DupAddrDetectTransmits:        dadTransmits,
+			RetransmitTimer:               retransmitTimer,
+			HandleRAs:                     true,
+			AutoGenGlobalAddresses:        true,
+			AutoGenAddressConflictRetries: maxRetries,
+		},
+		NDPDisp: &ndpDisp,
+		OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+			NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+				return nicName
+			},
+			SecretKey: secretKey,
+		},
+	})
+	opts := stack.NICOptions{Name: nicName}
+	if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	// Receive an RA with prefix in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+
+	addrBytes := []byte(subnet.ID())
+	addr := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, 0, secretKey)),
+		PrefixLen: 64,
+	}
+	expectAutoGenAddrEvent(addr, newAddr)
+
+	// Simulate a DAD conflict after some time has passed.
+	time.Sleep(failureTimer)
+	if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+		t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+	}
+	expectAutoGenAddrEvent(addr, invalidatedAddr)
+	select {
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected DAD event")
+	}
+
+	// Let the next address resolve.
+	addr.Address = tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, 1, secretKey))
+	expectAutoGenAddrEvent(addr, newAddr)
+	select {
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, addr.Address, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+		t.Fatal("timed out waiting for DAD event")
+	}
+
+	// Address should be deprecated/invalidated after the lifetime expires.
+	//
+	// Note, the remaining lifetime is calculated from when the PI was first
+	// processed. Since we wait for some time before simulating a DAD conflict
+	// and more time for the new address to resolve, the new address is only
+	// expected to be valid for the remaining time. The DAD conflict should
+	// not have reset the lifetimes.
+	//
+	// We expect either just the invalidation event or the deprecation event
+	// followed by the invalidation event.
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if e.eventType == deprecatedAddr {
+			if diff := checkAutoGenAddrEvent(e, addr, deprecatedAddr); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+
+			select {
+			case e := <-ndpDisp.autoGenAddrC:
+				if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+					t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+				}
+			case <-time.After(defaultAsyncEventTimeout):
+				t.Fatal("timed out waiting for invalidated auto gen addr event after deprecation")
+			}
+		} else {
+			if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		}
+	case <-time.After(lifetimeSeconds*time.Second - failureTimer - dadTransmits*retransmitTimer + defaultAsyncEventTimeout):
+		t.Fatal("timed out waiting for auto gen addr event")
+	}
+}
+
 // TestNDPRecursiveDNSServerDispatch tests that we properly dispatch an event
 // to the integrator when an RA is received with the NDP Recursive DNS Server
 // option with at least one valid address.
 func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
-	t.Parallel()
-
 	tests := []struct {
 		name     string
 		opt      header.NDPRecursiveDNSServer
@@ -2835,11 +4340,7 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		test := test
-
 		t.Run(test.name, func(t *testing.T) {
-			t.Parallel()
-
 			ndpDisp := ndpDispatcher{
 				// We do not expect more than a single RDNSS
 				// event at any time for this test.
@@ -2886,17 +4387,120 @@ func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
 	}
 }
 
-// TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers
-// and prefixes, and non-linklocal auto-generated addresses are invalidated when
-// a NIC becomes a router.
-func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
-	t.Parallel()
+// TestNDPDNSSearchListDispatch tests that the integrator is informed when an
+// NDP DNS Search List option is received with at least one domain name in the
+// search list.
+func TestNDPDNSSearchListDispatch(t *testing.T) {
+	const nicID = 1
+
+	ndpDisp := ndpDispatcher{
+		dnsslC: make(chan ndpDNSSLEvent, 3),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	optSer := header.NDPOptionsSerializer{
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 0, 0,
+			2, 'h', 'i',
+			0,
+		}),
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 0, 1,
+			1, 'i',
+			0,
+			2, 'a', 'm',
+			2, 'm', 'e',
+			0,
+		}),
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 1, 0,
+			3, 'x', 'y', 'z',
+			0,
+			5, 'h', 'e', 'l', 'l', 'o',
+			5, 'w', 'o', 'r', 'l', 'd',
+			0,
+			4, 't', 'h', 'i', 's',
+			2, 'i', 's',
+			1, 'a',
+			4, 't', 'e', 's', 't',
+			0,
+		}),
+	}
+	expected := []struct {
+		domainNames []string
+		lifetime    time.Duration
+	}{
+		{
+			domainNames: []string{
+				"hi",
+			},
+			lifetime: 0,
+		},
+		{
+			domainNames: []string{
+				"i",
+				"am.me",
+			},
+			lifetime: time.Second,
+		},
+		{
+			domainNames: []string{
+				"xyz",
+				"hello.world",
+				"this.is.a.test",
+			},
+			lifetime: 256 * time.Second,
+		},
+	}
+
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer))
 
+	for i, expected := range expected {
+		select {
+		case dnssl := <-ndpDisp.dnsslC:
+			if dnssl.nicID != nicID {
+				t.Errorf("got %d-th dnssl nicID = %d, want = %d", i, dnssl.nicID, nicID)
+			}
+			if diff := cmp.Diff(dnssl.domainNames, expected.domainNames); diff != "" {
+				t.Errorf("%d-th dnssl domain names mismatch (-want +got):\n%s", i, diff)
+			}
+			if dnssl.lifetime != expected.lifetime {
+				t.Errorf("got %d-th dnssl lifetime = %s, want = %s", i, dnssl.lifetime, expected.lifetime)
+			}
+		default:
+			t.Fatal("expected a DNSSL event")
+		}
+	}
+
+	// Should have no more DNSSL options.
+	select {
+	case <-ndpDisp.dnsslC:
+		t.Fatal("unexpectedly got a DNSSL event")
+	default:
+	}
+}
+
+// TestCleanupNDPState tests that all discovered routers and prefixes, and
+// auto-generated addresses are invalidated when a NIC becomes a router.
+func TestCleanupNDPState(t *testing.T) {
 	const (
-		lifetimeSeconds = 5
-		maxEvents       = 4
-		nicID1          = 1
-		nicID2          = 2
+		lifetimeSeconds          = 5
+		maxRouterAndPrefixEvents = 4
+		nicID1                   = 1
+		nicID2                   = 2
 	)
 
 	prefix1, subnet1, e1Addr1 := prefixSubnetAddr(0, linkAddr1)
@@ -2912,254 +4516,331 @@ func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
 		PrefixLen: 64,
 	}
 
-	ndpDisp := ndpDispatcher{
-		routerC:        make(chan ndpRouterEvent, maxEvents),
-		rememberRouter: true,
-		prefixC:        make(chan ndpPrefixEvent, maxEvents),
-		rememberPrefix: true,
-		autoGenAddrC:   make(chan ndpAutoGenAddrEvent, maxEvents),
-	}
-	s := stack.New(stack.Options{
-		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
-		AutoGenIPv6LinkLocal: true,
-		NDPConfigs: stack.NDPConfigurations{
-			HandleRAs:              true,
-			DiscoverDefaultRouters: true,
-			DiscoverOnLinkPrefixes: true,
-			AutoGenGlobalAddresses: true,
+	tests := []struct {
+		name                 string
+		cleanupFn            func(t *testing.T, s *stack.Stack)
+		keepAutoGenLinkLocal bool
+		maxAutoGenAddrEvents int
+		skipFinalAddrCheck   bool
+	}{
+		// A NIC should still keep its auto-generated link-local address when
+		// becoming a router.
+		{
+			name: "Enable forwarding",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(true)
+			},
+			keepAutoGenLinkLocal: true,
+			maxAutoGenAddrEvents: 4,
 		},
-		NDPDisp: &ndpDisp,
-	})
 
-	expectRouterEvent := func() (bool, ndpRouterEvent) {
-		select {
-		case e := <-ndpDisp.routerC:
-			return true, e
-		default:
-		}
+		// A NIC should cleanup all NDP state when it is disabled.
+		{
+			name: "Disable NIC",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-		return false, ndpRouterEvent{}
-	}
+				if err := s.DisableNIC(nicID1); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
+				}
+				if err := s.DisableNIC(nicID2); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
+				}
+			},
+			keepAutoGenLinkLocal: false,
+			maxAutoGenAddrEvents: 6,
+		},
 
-	expectPrefixEvent := func() (bool, ndpPrefixEvent) {
-		select {
-		case e := <-ndpDisp.prefixC:
-			return true, e
-		default:
-		}
+		// A NIC should cleanup all NDP state when it is removed.
+		{
+			name: "Remove NIC",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-		return false, ndpPrefixEvent{}
+				if err := s.RemoveNIC(nicID1); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID1, err)
+				}
+				if err := s.RemoveNIC(nicID2); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID2, err)
+				}
+			},
+			keepAutoGenLinkLocal: false,
+			maxAutoGenAddrEvents: 6,
+			// The NICs are removed so we can't check their addresses after calling
+			// stopFn.
+			skipFinalAddrCheck: true,
+		},
 	}
 
-	expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
-		select {
-		case e := <-ndpDisp.autoGenAddrC:
-			return true, e
-		default:
-		}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				routerC:        make(chan ndpRouterEvent, maxRouterAndPrefixEvents),
+				rememberRouter: true,
+				prefixC:        make(chan ndpPrefixEvent, maxRouterAndPrefixEvents),
+				rememberPrefix: true,
+				autoGenAddrC:   make(chan ndpAutoGenAddrEvent, test.maxAutoGenAddrEvents),
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: true,
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              true,
+					DiscoverDefaultRouters: true,
+					DiscoverOnLinkPrefixes: true,
+					AutoGenGlobalAddresses: true,
+				},
+				NDPDisp: &ndpDisp,
+			})
 
-		return false, ndpAutoGenAddrEvent{}
-	}
+			expectRouterEvent := func() (bool, ndpRouterEvent) {
+				select {
+				case e := <-ndpDisp.routerC:
+					return true, e
+				default:
+				}
 
-	e1 := channel.New(0, 1280, linkAddr1)
-	if err := s.CreateNIC(nicID1, e1); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
-	}
-	// We have other tests that make sure we receive the *correct* events
-	// on normal discovery of routers/prefixes, and auto-generated
-	// addresses. Here we just make sure we get an event and let other tests
-	// handle the correctness check.
-	expectAutoGenAddrEvent()
+				return false, ndpRouterEvent{}
+			}
 
-	e2 := channel.New(0, 1280, linkAddr2)
-	if err := s.CreateNIC(nicID2, e2); err != nil {
-		t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
-	}
-	expectAutoGenAddrEvent()
+			expectPrefixEvent := func() (bool, ndpPrefixEvent) {
+				select {
+				case e := <-ndpDisp.prefixC:
+					return true, e
+				default:
+				}
 
-	// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
-	// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
-	// llAddr4) to discover multiple routers and prefixes, and auto-gen
-	// multiple addresses.
+				return false, ndpPrefixEvent{}
+			}
 
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
-	}
+			expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					return true, e
+				default:
+				}
 
-	e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
-	}
+				return false, ndpAutoGenAddrEvent{}
+			}
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
-	}
+			e1 := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID1, e1); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+			}
+			// We have other tests that make sure we receive the *correct* events
+			// on normal discovery of routers/prefixes, and auto-generated
+			// addresses. Here we just make sure we get an event and let other tests
+			// handle the correctness check.
+			expectAutoGenAddrEvent()
+
+			e2 := channel.New(0, 1280, linkAddr2)
+			if err := s.CreateNIC(nicID2, e2); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+			}
+			expectAutoGenAddrEvent()
 
-	e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
-	if ok, _ := expectRouterEvent(); !ok {
-		t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
-	}
-	if ok, _ := expectPrefixEvent(); !ok {
-		t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
-	}
-	if ok, _ := expectAutoGenAddrEvent(); !ok {
-		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
-	}
+			// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
+			// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
+			// llAddr4) to discover multiple routers and prefixes, and auto-gen
+			// multiple addresses.
 
-	// We should have the auto-generated addresses added.
-	nicinfo := s.NICInfo()
-	nic1Addrs := nicinfo[nicID1].ProtocolAddresses
-	nic2Addrs := nicinfo[nicID2].ProtocolAddresses
-	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic1Addrs, e1Addr1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic1Addrs, e1Addr2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, e2Addr1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, e2Addr2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
-	}
+			e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
+			}
 
-	// We can't proceed any further if we already failed the test (missing
-	// some discovery/auto-generated address events or addresses).
-	if t.Failed() {
-		t.FailNow()
-	}
+			e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
+			}
 
-	s.SetForwarding(true)
+			e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID2)
+			}
 
-	// Collect invalidation events after becoming a router
-	gotRouterEvents := make(map[ndpRouterEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectRouterEvent()
-		if !ok {
-			t.Errorf("expected %d router events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotRouterEvents[e]++
-	}
-	gotPrefixEvents := make(map[ndpPrefixEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectPrefixEvent()
-		if !ok {
-			t.Errorf("expected %d prefix events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotPrefixEvents[e]++
-	}
-	gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
-	for i := 0; i < maxEvents; i++ {
-		ok, e := expectAutoGenAddrEvent()
-		if !ok {
-			t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", maxEvents, i)
-			break
-		}
-		gotAutoGenAddrEvents[e]++
-	}
+			e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
+			}
 
-	// No need to proceed any further if we already failed the test (missing
-	// some invalidation events).
-	if t.Failed() {
-		t.FailNow()
-	}
+			// We should have the auto-generated addresses added.
+			nicinfo := s.NICInfo()
+			nic1Addrs := nicinfo[nicID1].ProtocolAddresses
+			nic2Addrs := nicinfo[nicID2].ProtocolAddresses
+			if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic1Addrs, e1Addr1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic1Addrs, e1Addr2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, e2Addr1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, e2Addr2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+			}
 
-	expectedRouterEvents := map[ndpRouterEvent]int{
-		{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
-		{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
-		{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
-	}
-	if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
-		t.Errorf("router events mismatch (-want +got):\n%s", diff)
-	}
-	expectedPrefixEvents := map[ndpPrefixEvent]int{
-		{nicID: nicID1, prefix: subnet1, discovered: false}: 1,
-		{nicID: nicID1, prefix: subnet2, discovered: false}: 1,
-		{nicID: nicID2, prefix: subnet1, discovered: false}: 1,
-		{nicID: nicID2, prefix: subnet2, discovered: false}: 1,
-	}
-	if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
-		t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
-	}
-	expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
-		{nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
-		{nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
-		{nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
-		{nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
-	}
-	if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
-		t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
-	}
+			// We can't proceed any further if we already failed the test (missing
+			// some discovery/auto-generated address events or addresses).
+			if t.Failed() {
+				t.FailNow()
+			}
 
-	// Make sure the auto-generated addresses got removed.
-	nicinfo = s.NICInfo()
-	nic1Addrs = nicinfo[nicID1].ProtocolAddresses
-	nic2Addrs = nicinfo[nicID2].ProtocolAddresses
-	if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
-	}
-	if containsV6Addr(nic1Addrs, e1Addr1) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
-	}
-	if containsV6Addr(nic1Addrs, e1Addr2) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
-	}
-	if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
-		t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
-	}
-	if containsV6Addr(nic2Addrs, e2Addr1) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
-	}
-	if containsV6Addr(nic2Addrs, e2Addr2) {
-		t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
-	}
+			test.cleanupFn(t, s)
 
-	// Should not get any more events (invalidation timers should have been
-	// cancelled when we transitioned into a router).
-	time.Sleep(lifetimeSeconds*time.Second + defaultTimeout)
-	select {
-	case <-ndpDisp.routerC:
-		t.Error("unexpected router event")
-	default:
-	}
-	select {
-	case <-ndpDisp.prefixC:
-		t.Error("unexpected prefix event")
-	default:
-	}
-	select {
-	case <-ndpDisp.autoGenAddrC:
-		t.Error("unexpected auto-generated address event")
-	default:
+			// Collect invalidation events after having NDP state cleaned up.
+			gotRouterEvents := make(map[ndpRouterEvent]int)
+			for i := 0; i < maxRouterAndPrefixEvents; i++ {
+				ok, e := expectRouterEvent()
+				if !ok {
+					t.Errorf("expected %d router events after becoming a router; got = %d", maxRouterAndPrefixEvents, i)
+					break
+				}
+				gotRouterEvents[e]++
+			}
+			gotPrefixEvents := make(map[ndpPrefixEvent]int)
+			for i := 0; i < maxRouterAndPrefixEvents; i++ {
+				ok, e := expectPrefixEvent()
+				if !ok {
+					t.Errorf("expected %d prefix events after becoming a router; got = %d", maxRouterAndPrefixEvents, i)
+					break
+				}
+				gotPrefixEvents[e]++
+			}
+			gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
+			for i := 0; i < test.maxAutoGenAddrEvents; i++ {
+				ok, e := expectAutoGenAddrEvent()
+				if !ok {
+					t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", test.maxAutoGenAddrEvents, i)
+					break
+				}
+				gotAutoGenAddrEvents[e]++
+			}
+
+			// No need to proceed any further if we already failed the test (missing
+			// some invalidation events).
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			expectedRouterEvents := map[ndpRouterEvent]int{
+				{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
+				{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
+				{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
+				{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
+			}
+			if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
+				t.Errorf("router events mismatch (-want +got):\n%s", diff)
+			}
+			expectedPrefixEvents := map[ndpPrefixEvent]int{
+				{nicID: nicID1, prefix: subnet1, discovered: false}: 1,
+				{nicID: nicID1, prefix: subnet2, discovered: false}: 1,
+				{nicID: nicID2, prefix: subnet1, discovered: false}: 1,
+				{nicID: nicID2, prefix: subnet2, discovered: false}: 1,
+			}
+			if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
+				t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
+			}
+			expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
+				{nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
+				{nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
+				{nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
+				{nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
+			}
+
+			if !test.keepAutoGenLinkLocal {
+				expectedAutoGenAddrEvents[ndpAutoGenAddrEvent{nicID: nicID1, addr: llAddrWithPrefix1, eventType: invalidatedAddr}] = 1
+				expectedAutoGenAddrEvents[ndpAutoGenAddrEvent{nicID: nicID2, addr: llAddrWithPrefix2, eventType: invalidatedAddr}] = 1
+			}
+
+			if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
+				t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
+			}
+
+			if !test.skipFinalAddrCheck {
+				// Make sure the auto-generated addresses got removed.
+				nicinfo = s.NICInfo()
+				nic1Addrs = nicinfo[nicID1].ProtocolAddresses
+				nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+				if containsV6Addr(nic1Addrs, llAddrWithPrefix1) != test.keepAutoGenLinkLocal {
+					if test.keepAutoGenLinkLocal {
+						t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+					} else {
+						t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+					}
+				}
+				if containsV6Addr(nic1Addrs, e1Addr1) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+				}
+				if containsV6Addr(nic1Addrs, e1Addr2) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+				}
+				if containsV6Addr(nic2Addrs, llAddrWithPrefix2) != test.keepAutoGenLinkLocal {
+					if test.keepAutoGenLinkLocal {
+						t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+					} else {
+						t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+					}
+				}
+				if containsV6Addr(nic2Addrs, e2Addr1) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+				}
+				if containsV6Addr(nic2Addrs, e2Addr2) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+				}
+			}
+
+			// Should not get any more events (invalidation timers should have been
+			// cancelled when the NDP state was cleaned up).
+			time.Sleep(lifetimeSeconds*time.Second + defaultTimeout)
+			select {
+			case <-ndpDisp.routerC:
+				t.Error("unexpected router event")
+			default:
+			}
+			select {
+			case <-ndpDisp.prefixC:
+				t.Error("unexpected prefix event")
+			default:
+			}
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Error("unexpected auto-generated address event")
+			default:
+			}
+		})
 	}
 }
 
@@ -3207,7 +4888,12 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 		}
 	}
 
-	// The initial DHCPv6 configuration should be stack.DHCPv6NoConfiguration.
+	// Even if the first RA reports no DHCPv6 configurations are available, the
+	// dispatcher should get an event.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
+	expectDHCPv6Event(stack.DHCPv6NoConfiguration)
+	// Receiving the same update again should not result in an event to the
+	// dispatcher.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
 	expectNoDHCPv6Event()
 
@@ -3215,8 +4901,6 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	// Configurations.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
 	expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
-	// Receiving the same update again should not result in an event to the
-	// NDPDispatcher.
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
 	expectNoDHCPv6Event()
 
@@ -3252,15 +4936,35 @@ func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
 	expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
 	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
 	expectNoDHCPv6Event()
+
+	// Cycling the NIC should cause the last DHCPv6 configuration to be cleared.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+
+	// Receive an RA that updates the DHCPv6 configuration to Other
+	// Configurations.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+	expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+	expectNoDHCPv6Event()
 }
 
 // TestRouterSolicitation tests the initial Router Solicitations that are sent
 // when a NIC newly becomes enabled.
 func TestRouterSolicitation(t *testing.T) {
-	t.Parallel()
+	const nicID = 1
 
 	tests := []struct {
 		name                        string
+		linkHeaderLen               uint16
+		linkAddr                    tcpip.LinkAddress
+		nicAddr                     tcpip.Address
+		expectedSrcAddr             tcpip.Address
+		expectedNDPOpts             []header.NDPOption
 		maxRtrSolicit               uint8
 		rtrSolicitInt               time.Duration
 		effectiveRtrSolicitInt      time.Duration
@@ -3268,31 +4972,54 @@ func TestRouterSolicitation(t *testing.T) {
 		effectiveMaxRtrSolicitDelay time.Duration
 	}{
 		{
-			name:                        "Single RS with delay",
+			name:                        "Single RS with 2s delay and interval",
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               1,
-			rtrSolicitInt:               time.Second,
-			effectiveRtrSolicitInt:      time.Second,
-			maxRtrSolicitDelay:          time.Second,
-			effectiveMaxRtrSolicitDelay: time.Second,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
+			maxRtrSolicitDelay:          2 * time.Second,
+			effectiveMaxRtrSolicitDelay: 2 * time.Second,
+		},
+		{
+			name:                        "Single RS with 4s delay and interval",
+			expectedSrcAddr:             header.IPv6Any,
+			maxRtrSolicit:               1,
+			rtrSolicitInt:               4 * time.Second,
+			effectiveRtrSolicitInt:      4 * time.Second,
+			maxRtrSolicitDelay:          4 * time.Second,
+			effectiveMaxRtrSolicitDelay: 4 * time.Second,
 		},
 		{
 			name:                        "Two RS with delay",
+			linkHeaderLen:               1,
+			nicAddr:                     llAddr1,
+			expectedSrcAddr:             llAddr1,
 			maxRtrSolicit:               2,
-			rtrSolicitInt:               time.Second,
-			effectiveRtrSolicitInt:      time.Second,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
 			maxRtrSolicitDelay:          500 * time.Millisecond,
 			effectiveMaxRtrSolicitDelay: 500 * time.Millisecond,
 		},
 		{
-			name:                        "Single RS without delay",
+			name:            "Single RS without delay",
+			linkHeaderLen:   2,
+			linkAddr:        linkAddr1,
+			nicAddr:         llAddr1,
+			expectedSrcAddr: llAddr1,
+			expectedNDPOpts: []header.NDPOption{
+				header.NDPSourceLinkLayerAddressOption(linkAddr1),
+			},
 			maxRtrSolicit:               1,
-			rtrSolicitInt:               time.Second,
-			effectiveRtrSolicitInt:      time.Second,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
 			maxRtrSolicitDelay:          0,
 			effectiveMaxRtrSolicitDelay: 0,
 		},
 		{
 			name:                        "Two RS without delay and invalid zero interval",
+			linkHeaderLen:               3,
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               0,
 			effectiveRtrSolicitInt:      4 * time.Second,
@@ -3301,6 +5028,8 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Three RS without delay",
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               3,
 			rtrSolicitInt:               500 * time.Millisecond,
 			effectiveRtrSolicitInt:      500 * time.Millisecond,
@@ -3309,6 +5038,8 @@ func TestRouterSolicitation(t *testing.T) {
 		},
 		{
 			name:                        "Two RS with invalid negative delay",
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
 			maxRtrSolicit:               2,
 			rtrSolicitInt:               time.Second,
 			effectiveRtrSolicitInt:      time.Second,
@@ -3330,11 +5061,16 @@ func TestRouterSolicitation(t *testing.T) {
 
 			t.Run(test.name, func(t *testing.T) {
 				t.Parallel()
-				e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1)
-				e.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+
+				e := channelLinkWithHeaderLength{
+					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr),
+					headerLength: test.linkHeaderLen,
+				}
+				e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
 				waitForPkt := func(timeout time.Duration) {
 					t.Helper()
-					ctx, _ := context.WithTimeout(context.Background(), timeout)
+					ctx, cancel := context.WithTimeout(context.Background(), timeout)
+					defer cancel()
 					p, ok := e.ReadContext(ctx)
 					if !ok {
 						t.Fatal("timed out waiting for packet")
@@ -3352,15 +5088,20 @@ func TestRouterSolicitation(t *testing.T) {
 
 					checker.IPv6(t,
 						p.Pkt.Header.View(),
-						checker.SrcAddr(header.IPv6Any),
+						checker.SrcAddr(test.expectedSrcAddr),
 						checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
 						checker.TTL(header.NDPHopLimit),
-						checker.NDPRS(),
+						checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)),
 					)
+
+					if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
+						t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want)
+					}
 				}
 				waitForNothing := func(timeout time.Duration) {
 					t.Helper()
-					ctx, _ := context.WithTimeout(context.Background(), timeout)
+					ctx, cancel := context.WithTimeout(context.Background(), timeout)
+					defer cancel()
 					if _, ok := e.ReadContext(ctx); ok {
 						t.Fatal("unexpectedly got a packet")
 					}
@@ -3373,27 +5114,37 @@ func TestRouterSolicitation(t *testing.T) {
 						MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
 					},
 				})
-				if err := s.CreateNIC(1, e); err != nil {
-					t.Fatalf("CreateNIC(1) = %s", err)
+				if err := s.CreateNIC(nicID, &e); err != nil {
+					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+				}
+
+				if addr := test.nicAddr; addr != "" {
+					if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil {
+						t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err)
+					}
 				}
 
-				// Make sure each RS got sent at the right
-				// times.
+				// Make sure each RS is sent at the right time.
 				remaining := test.maxRtrSolicit
 				if remaining > 0 {
 					waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncEventTimeout)
 					remaining--
 				}
+
 				for ; remaining > 0; remaining-- {
-					waitForNothing(test.effectiveRtrSolicitInt - defaultTimeout)
-					waitForPkt(defaultAsyncEventTimeout)
+					if test.effectiveRtrSolicitInt > defaultAsyncEventTimeout {
+						waitForNothing(test.effectiveRtrSolicitInt - defaultAsyncEventTimeout)
+						waitForPkt(2 * defaultAsyncEventTimeout)
+					} else {
+						waitForPkt(test.effectiveRtrSolicitInt * defaultAsyncEventTimeout)
+					}
 				}
 
 				// Make sure no more RS.
 				if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay {
-					waitForNothing(test.effectiveRtrSolicitInt + defaultTimeout)
+					waitForNothing(test.effectiveRtrSolicitInt + defaultAsyncEventTimeout)
 				} else {
-					waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultTimeout)
+					waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultAsyncEventTimeout)
 				}
 
 				// Make sure the counter got properly
@@ -3406,77 +5157,153 @@ func TestRouterSolicitation(t *testing.T) {
 	})
 }
 
-// TestStopStartSolicitingRouters tests that when forwarding is enabled or
-// disabled, router solicitations are stopped or started, respecitively.
 func TestStopStartSolicitingRouters(t *testing.T) {
-	t.Parallel()
-
+	const nicID = 1
+	const delay = 0
 	const interval = 500 * time.Millisecond
-	const delay = time.Second
 	const maxRtrSolicitations = 3
-	e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
-	waitForPkt := func(timeout time.Duration) {
-		t.Helper()
-		ctx, _ := context.WithTimeout(context.Background(), timeout)
-		p, ok := e.ReadContext(ctx)
-		if !ok {
-			t.Fatal("timed out waiting for packet")
-			return
-		}
 
-		if p.Proto != header.IPv6ProtocolNumber {
-			t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
-		}
-		checker.IPv6(t, p.Pkt.Header.View(),
-			checker.SrcAddr(header.IPv6Any),
-			checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
-			checker.TTL(header.NDPHopLimit),
-			checker.NDPRS())
-	}
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
-		NDPConfigs: stack.NDPConfigurations{
-			MaxRtrSolicitations:     maxRtrSolicitations,
-			RtrSolicitationInterval: interval,
-			MaxRtrSolicitationDelay: delay,
+	tests := []struct {
+		name    string
+		startFn func(t *testing.T, s *stack.Stack)
+		// first is used to tell stopFn that it is being called for the first time
+		// after router solicitations were last enabled.
+		stopFn func(t *testing.T, s *stack.Stack, first bool)
+	}{
+		// Tests that when forwarding is enabled or disabled, router solicitations
+		// are stopped or started, respectively.
+		{
+			name: "Enable and disable forwarding",
+			startFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(false)
+			},
+			stopFn: func(t *testing.T, s *stack.Stack, _ bool) {
+				t.Helper()
+				s.SetForwarding(true)
+			},
 		},
-	})
-	if err := s.CreateNIC(1, e); err != nil {
-		t.Fatalf("CreateNIC(1) = %s", err)
-	}
 
-	// Enable forwarding which should stop router solicitations.
-	s.SetForwarding(true)
-	ctx, _ := context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		// A single RS may have been sent before forwarding was enabled.
-		ctx, _ = context.WithTimeout(context.Background(), interval+defaultTimeout)
-		if _, ok = e.ReadContext(ctx); ok {
-			t.Fatal("Should not have sent more than one RS message")
-		}
-	}
+		// Tests that when a NIC is enabled or disabled, router solicitations
+		// are started or stopped, respectively.
+		{
+			name: "Enable and disable NIC",
+			startFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
 
-	// Enabling forwarding again should do nothing.
-	s.SetForwarding(true)
-	ctx, _ = context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got a packet after becoming a router")
-	}
+				if err := s.EnableNIC(nicID); err != nil {
+					t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+				}
+			},
+			stopFn: func(t *testing.T, s *stack.Stack, _ bool) {
+				t.Helper()
+
+				if err := s.DisableNIC(nicID); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+				}
+			},
+		},
 
-	// Disable forwarding which should start router solicitations.
-	s.SetForwarding(false)
-	waitForPkt(delay + defaultAsyncEventTimeout)
-	waitForPkt(interval + defaultAsyncEventTimeout)
-	waitForPkt(interval + defaultAsyncEventTimeout)
-	ctx, _ = context.WithTimeout(context.Background(), interval+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+		// Tests that when a NIC is removed, router solicitations are stopped. We
+		// cannot start router solications on a removed NIC.
+		{
+			name: "Remove NIC",
+			stopFn: func(t *testing.T, s *stack.Stack, first bool) {
+				t.Helper()
+
+				// Only try to remove the NIC the first time stopFn is called since it's
+				// impossible to remove an already removed NIC.
+				if !first {
+					return
+				}
+
+				if err := s.RemoveNIC(nicID); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID, err)
+				}
+			},
+		},
 	}
 
-	// Disabling forwarding again should do nothing.
-	s.SetForwarding(false)
-	ctx, _ = context.WithTimeout(context.Background(), delay+defaultTimeout)
-	if _, ok := e.ReadContext(ctx); ok {
-		t.Fatal("unexpectedly got a packet after becoming a router")
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
+			waitForPkt := func(timeout time.Duration) {
+				t.Helper()
+
+				ctx, cancel := context.WithTimeout(context.Background(), timeout)
+				defer cancel()
+				p, ok := e.ReadContext(ctx)
+				if !ok {
+					t.Fatal("timed out waiting for packet")
+				}
+
+				if p.Proto != header.IPv6ProtocolNumber {
+					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+				}
+				checker.IPv6(t, p.Pkt.Header.View(),
+					checker.SrcAddr(header.IPv6Any),
+					checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+					checker.TTL(header.NDPHopLimit),
+					checker.NDPRS())
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					MaxRtrSolicitations:     maxRtrSolicitations,
+					RtrSolicitationInterval: interval,
+					MaxRtrSolicitationDelay: delay,
+				},
+			})
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			// Stop soliciting routers.
+			test.stopFn(t, s, true /* first */)
+			ctx, cancel := context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				// A single RS may have been sent before solicitations were stopped.
+				ctx, cancel := context.WithTimeout(context.Background(), interval+defaultAsyncEventTimeout)
+				defer cancel()
+				if _, ok = e.ReadContext(ctx); ok {
+					t.Fatal("should not have sent more than one RS message")
+				}
+			}
+
+			// Stopping router solicitations after it has already been stopped should
+			// do nothing.
+			test.stopFn(t, s, false /* first */)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got a packet after router solicitation has been stopepd")
+			}
+
+			// If test.startFn is nil, there is no way to restart router solications.
+			if test.startFn == nil {
+				return
+			}
+
+			// Start soliciting routers.
+			test.startFn(t, s)
+			waitForPkt(delay + defaultAsyncEventTimeout)
+			waitForPkt(interval + defaultAsyncEventTimeout)
+			waitForPkt(interval + defaultAsyncEventTimeout)
+			ctx, cancel = context.WithTimeout(context.Background(), interval+defaultAsyncEventTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+			}
+
+			// Starting router solicitations after it has already completed should do
+			// nothing.
+			test.startFn(t, s)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got a packet after finishing router solicitations")
+			}
+		})
 	}
 }
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index ca3a7a07e..d756ae6f5 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -15,7 +15,7 @@
 package stack
 
 import (
-	"log"
+	"fmt"
 	"reflect"
 	"sort"
 	"strings"
@@ -27,6 +27,14 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+var ipv4BroadcastAddr = tcpip.ProtocolAddress{
+	Protocol: header.IPv4ProtocolNumber,
+	AddressWithPrefix: tcpip.AddressWithPrefix{
+		Address:   header.IPv4Broadcast,
+		PrefixLen: 8 * header.IPv4AddressSize,
+	},
+}
+
 // NIC represents a "network interface card" to which the networking stack is
 // attached.
 type NIC struct {
@@ -46,7 +54,7 @@ type NIC struct {
 		primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
 		endpoints     map[NetworkEndpointID]*referencedNetworkEndpoint
 		addressRanges []tcpip.Subnet
-		mcastJoins    map[NetworkEndpointID]int32
+		mcastJoins    map[NetworkEndpointID]uint32
 		// packetEPs is protected by mu, but the contained PacketEndpoint
 		// values are not.
 		packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
@@ -113,16 +121,17 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 	}
 	nic.mu.primary = make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint)
 	nic.mu.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint)
-	nic.mu.mcastJoins = make(map[NetworkEndpointID]int32)
+	nic.mu.mcastJoins = make(map[NetworkEndpointID]uint32)
 	nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint)
 	nic.mu.ndp = ndpState{
-		nic:              nic,
-		configs:          stack.ndpConfigs,
-		dad:              make(map[tcpip.Address]dadState),
-		defaultRouters:   make(map[tcpip.Address]defaultRouterState),
-		onLinkPrefixes:   make(map[tcpip.Subnet]onLinkPrefixState),
-		autoGenAddresses: make(map[tcpip.Address]autoGenAddressState),
+		nic:            nic,
+		configs:        stack.ndpConfigs,
+		dad:            make(map[tcpip.Address]dadState),
+		defaultRouters: make(map[tcpip.Address]defaultRouterState),
+		onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
+		slaacPrefixes:  make(map[tcpip.Subnet]slaacPrefixState),
 	}
+	nic.mu.ndp.initializeTempAddrState()
 
 	// Register supported packet endpoint protocols.
 	for _, netProto := range header.Ethertypes {
@@ -132,11 +141,86 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC
 		nic.mu.packetEPs[netProto.Number()] = []PacketEndpoint{}
 	}
 
+	nic.linkEP.Attach(nic)
+
 	return nic
 }
 
-// enable enables the NIC. enable will attach the link to its LinkEndpoint and
-// join the IPv6 All-Nodes Multicast address (ff02::1).
+// enabled returns true if n is enabled.
+func (n *NIC) enabled() bool {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	return enabled
+}
+
+// disable disables n.
+//
+// It undoes the work done by enable.
+func (n *NIC) disable() *tcpip.Error {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	if !enabled {
+		return nil
+	}
+
+	n.mu.Lock()
+	err := n.disableLocked()
+	n.mu.Unlock()
+	return err
+}
+
+// disableLocked disables n.
+//
+// It undoes the work done by enable.
+//
+// n MUST be locked.
+func (n *NIC) disableLocked() *tcpip.Error {
+	if !n.mu.enabled {
+		return nil
+	}
+
+	// TODO(b/147015577): Should Routes that are currently bound to n be
+	// invalidated? Currently, Routes will continue to work when a NIC is enabled
+	// again, and applications may not know that the underlying NIC was ever
+	// disabled.
+
+	if _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]; ok {
+		n.mu.ndp.stopSolicitingRouters()
+		n.mu.ndp.cleanupState(false /* hostOnly */)
+
+		// Stop DAD for all the unicast IPv6 endpoints that are in the
+		// permanentTentative state.
+		for _, r := range n.mu.endpoints {
+			if addr := r.ep.ID().LocalAddress; r.getKind() == permanentTentative && header.IsV6UnicastAddress(addr) {
+				n.mu.ndp.stopDuplicateAddressDetection(addr)
+			}
+		}
+
+		// The NIC may have already left the multicast group.
+		if err := n.leaveGroupLocked(header.IPv6AllNodesMulticastAddress, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
+		// The address may have already been removed.
+		if err := n.removePermanentAddressLocked(ipv4BroadcastAddr.AddressWithPrefix.Address); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	n.mu.enabled = false
+	return nil
+}
+
+// enable enables n.
+//
+// If the stack has IPv6 enabled, enable will join the IPv6 All-Nodes Multicast
+// address (ff02::1), start DAD for permanent addresses, and start soliciting
+// routers if the stack is not operating as a router. If the stack is also
+// configured to auto-generate a link-local address, one will be generated.
 func (n *NIC) enable() *tcpip.Error {
 	n.mu.RLock()
 	enabled := n.mu.enabled
@@ -154,14 +238,9 @@ func (n *NIC) enable() *tcpip.Error {
 
 	n.mu.enabled = true
 
-	n.attachLinkEndpoint()
-
 	// Create an endpoint to receive broadcast packets on this interface.
 	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
-		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
-			Protocol:          header.IPv4ProtocolNumber,
-			AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Broadcast, 8 * header.IPv4AddressSize},
-		}, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
+		if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
 			return err
 		}
 	}
@@ -183,6 +262,14 @@ func (n *NIC) enable() *tcpip.Error {
 		return nil
 	}
 
+	// Join the All-Nodes multicast group before starting DAD as responses to DAD
+	// (NDP NS) messages may be sent to the All-Nodes multicast group if the
+	// source address of the NDP NS is the unspecified address, as per RFC 4861
+	// section 7.2.4.
+	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
+		return err
+	}
+
 	// Perform DAD on the all the unicast IPv6 endpoints that are in the permanent
 	// state.
 	//
@@ -200,10 +287,6 @@ func (n *NIC) enable() *tcpip.Error {
 		}
 	}
 
-	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
-		return err
-	}
-
 	// Do not auto-generate an IPv6 link-local address for loopback devices.
 	if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
 		// The valid and preferred lifetime is infinite for the auto-generated
@@ -225,6 +308,42 @@ func (n *NIC) enable() *tcpip.Error {
 	return nil
 }
 
+// remove detaches NIC from the link endpoint, and marks existing referenced
+// network endpoints expired. This guarantees no packets between this NIC and
+// the network stack.
+func (n *NIC) remove() *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	n.disableLocked()
+
+	// TODO(b/151378115): come up with a better way to pick an error than the
+	// first one.
+	var err *tcpip.Error
+
+	// Forcefully leave multicast groups.
+	for nid := range n.mu.mcastJoins {
+		if tempErr := n.leaveGroupLocked(nid.LocalAddress, true /* force */); tempErr != nil && err == nil {
+			err = tempErr
+		}
+	}
+
+	// Remove permanent and permanentTentative addresses, so no packet goes out.
+	for nid, ref := range n.mu.endpoints {
+		switch ref.getKind() {
+		case permanentTentative, permanent:
+			if tempErr := n.removePermanentAddressLocked(nid.LocalAddress); tempErr != nil && err == nil {
+				err = tempErr
+			}
+		}
+	}
+
+	// Detach from link endpoint, so no packet comes in.
+	n.linkEP.Attach(nil)
+
+	return err
+}
+
 // becomeIPv6Router transitions n into an IPv6 router.
 //
 // When transitioning into an IPv6 router, host-only state (NDP discovered
@@ -234,7 +353,7 @@ func (n *NIC) becomeIPv6Router() {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	n.mu.ndp.cleanupHostOnlyState()
+	n.mu.ndp.cleanupState(true /* hostOnly */)
 	n.mu.ndp.stopSolicitingRouters()
 }
 
@@ -249,12 +368,6 @@ func (n *NIC) becomeIPv6Host() {
 	n.mu.ndp.startSolicitingRouters()
 }
 
-// attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
-// to start delivering packets.
-func (n *NIC) attachLinkEndpoint() {
-	n.linkEP.Attach(n)
-}
-
 // setPromiscuousMode enables or disables promiscuous mode.
 func (n *NIC) setPromiscuousMode(enable bool) {
 	n.mu.Lock()
@@ -339,7 +452,7 @@ type ipv6AddrCandidate struct {
 // primaryIPv6Endpoint returns an IPv6 endpoint following Source Address
 // Selection (RFC 6724 section 5).
 //
-// Note, only rules 1-3 are followed.
+// Note, only rules 1-3 and 7 are followed.
 //
 // remoteAddr must be a valid IPv6 address.
 func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
@@ -357,7 +470,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 	cs := make([]ipv6AddrCandidate, 0, len(primaryAddrs))
 	for _, r := range primaryAddrs {
 		// If r is not valid for outgoing connections, it is not a valid endpoint.
-		if !r.isValidForOutgoing() {
+		if !r.isValidForOutgoingRLocked() {
 			continue
 		}
 
@@ -367,7 +480,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 			// Should never happen as we got r from the primary IPv6 endpoint list and
 			// ScopeForIPv6Address only returns an error if addr is not an IPv6
 			// address.
-			log.Fatalf("header.ScopeForIPv6Address(%s): %s", addr, err)
+			panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err))
 		}
 
 		cs = append(cs, ipv6AddrCandidate{
@@ -379,7 +492,7 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 	remoteScope, err := header.ScopeForIPv6Address(remoteAddr)
 	if err != nil {
 		// primaryIPv6Endpoint should never be called with an invalid IPv6 address.
-		log.Fatalf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err)
+		panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err))
 	}
 
 	// Sort the addresses as per RFC 6724 section 5 rules 1-3.
@@ -410,6 +523,11 @@ func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEn
 			return sbDep
 		}
 
+		// Prefer temporary addresses as per RFC 6724 section 5 rule 7.
+		if saTemp, sbTemp := sa.ref.configType == slaacTemp, sb.ref.configType == slaacTemp; saTemp != sbTemp {
+			return saTemp
+		}
+
 		// sa and sb are equal, return the endpoint that is closest to the front of
 		// the primary endpoint list.
 		return i < j
@@ -712,6 +830,7 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
 		case permanentExpired, temporary:
 			continue
 		}
+
 		addrs = append(addrs, tcpip.ProtocolAddress{
 			Protocol: ref.protocol,
 			AddressWithPrefix: tcpip.AddressWithPrefix{
@@ -874,6 +993,7 @@ func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
 	for i, ref := range refs {
 		if ref == r {
 			n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+			refs[len(refs)-1] = nil
 			break
 		}
 	}
@@ -898,35 +1018,45 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 		return tcpip.ErrBadLocalAddress
 	}
 
-	isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr)
+	switch r.protocol {
+	case header.IPv6ProtocolNumber:
+		return n.removePermanentIPv6EndpointLocked(r, true /* allowSLAACInvalidation */)
+	default:
+		r.expireLocked()
+		return nil
+	}
+}
+
+func (n *NIC) removePermanentIPv6EndpointLocked(r *referencedNetworkEndpoint, allowSLAACInvalidation bool) *tcpip.Error {
+	addr := r.addrWithPrefix()
+
+	isIPv6Unicast := header.IsV6UnicastAddress(addr.Address)
 
 	if isIPv6Unicast {
-		// If we are removing a tentative IPv6 unicast address, stop
-		// DAD.
-		if kind == permanentTentative {
-			n.mu.ndp.stopDuplicateAddressDetection(addr)
-		}
+		n.mu.ndp.stopDuplicateAddressDetection(addr.Address)
 
 		// If we are removing an address generated via SLAAC, cleanup
 		// its SLAAC resources and notify the integrator.
-		if r.configType == slaac {
-			n.mu.ndp.cleanupAutoGenAddrResourcesAndNotify(addr)
+		switch r.configType {
+		case slaac:
+			n.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
+		case slaacTemp:
+			n.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
 		}
 	}
 
-	r.setKind(permanentExpired)
-	if !r.decRefLocked() {
-		// The endpoint still has references to it.
-		return nil
-	}
+	r.expireLocked()
 
 	// At this point the endpoint is deleted.
 
 	// If we are removing an IPv6 unicast address, leave the solicited-node
 	// multicast address.
+	//
+	// We ignore the tcpip.ErrBadLocalAddress error because the solicited-node
+	// multicast group may be left by user action.
 	if isIPv6Unicast {
-		snmc := header.SolicitedNodeAddr(addr)
-		if err := n.leaveGroupLocked(snmc); err != nil {
+		snmc := header.SolicitedNodeAddr(addr.Address)
+		if err := n.leaveGroupLocked(snmc, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
 			return err
 		}
 	}
@@ -986,32 +1116,47 @@ func (n *NIC) leaveGroup(addr tcpip.Address) *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	return n.leaveGroupLocked(addr)
+	return n.leaveGroupLocked(addr, false /* force */)
 }
 
 // leaveGroupLocked decrements the count for the given multicast address, and
 // when it reaches zero removes the endpoint for this address. n MUST be locked
 // before leaveGroupLocked is called.
-func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
+//
+// If force is true, then the count for the multicast addres is ignored and the
+// endpoint will be removed immediately.
+func (n *NIC) leaveGroupLocked(addr tcpip.Address, force bool) *tcpip.Error {
 	id := NetworkEndpointID{addr}
-	joins := n.mu.mcastJoins[id]
-	switch joins {
-	case 0:
+	joins, ok := n.mu.mcastJoins[id]
+	if !ok {
 		// There are no joins with this address on this NIC.
 		return tcpip.ErrBadLocalAddress
-	case 1:
-		// This is the last one, clean up.
-		if err := n.removePermanentAddressLocked(addr); err != nil {
-			return err
-		}
 	}
-	n.mu.mcastJoins[id] = joins - 1
+
+	joins--
+	if force || joins == 0 {
+		// There are no outstanding joins or we are forced to leave, clean up.
+		delete(n.mu.mcastJoins, id)
+		return n.removePermanentAddressLocked(addr)
+	}
+
+	n.mu.mcastJoins[id] = joins
 	return nil
 }
 
-func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt tcpip.PacketBuffer) {
+// isInGroup returns true if n has joined the multicast group addr.
+func (n *NIC) isInGroup(addr tcpip.Address) bool {
+	n.mu.RLock()
+	joins := n.mu.mcastJoins[NetworkEndpointID{addr}]
+	n.mu.RUnlock()
+
+	return joins != 0
+}
+
+func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt *PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
+
 	ref.ep.HandlePacket(&r, pkt)
 	ref.decRef()
 }
@@ -1022,7 +1167,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // Note that the ownership of the slice backing vv is retained by the caller.
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
 	n.mu.RLock()
 	enabled := n.mu.enabled
 	// If the NIC is not yet enabled, don't receive any packets.
@@ -1067,12 +1212,21 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		n.stack.stats.IP.PacketsReceived.Increment()
 	}
 
-	if len(pkt.Data.First()) < netProto.MinimumPacketSize() {
+	// Parse headers.
+	transProtoNum, hasTransportHdr, ok := netProto.Parse(pkt)
+	if !ok {
+		// The packet is too small to contain a network header.
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
+	if hasTransportHdr {
+		// Parse the transport header if present.
+		if state, ok := n.stack.transportProtocols[transProtoNum]; ok {
+			state.proto.Parse(pkt)
+		}
+	}
 
-	src, dst := netProto.ParseAddresses(pkt.Data.First())
+	src, dst := netProto.ParseAddresses(pkt.NetworkHeader)
 
 	if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil {
 		// The source address is one of our own, so we never should have gotten a
@@ -1082,8 +1236,21 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		n.stack.stats.IP.InvalidSourceAddressesReceived.Increment()
 		return
 	}
+
+	// TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
+	// Loopback traffic skips the prerouting chain.
+	if protocol == header.IPv4ProtocolNumber && !n.isLoopback() {
+		// iptables filtering.
+		ipt := n.stack.IPTables()
+		address := n.primaryAddress(protocol)
+		if ok := ipt.Check(Prerouting, pkt, nil, nil, address.Address, ""); !ok {
+			// iptables is telling us to drop the packet.
+			return
+		}
+	}
+
 	if ref := n.getRef(protocol, dst); ref != nil {
-		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt)
+		handlePacket(protocol, dst, src, n.linkEP.LinkAddress(), remote, ref, pkt)
 		return
 	}
 
@@ -1097,10 +1264,6 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
 			return
 		}
-		defer r.Release()
-
-		r.LocalLinkAddress = n.linkEP.LinkAddress()
-		r.RemoteLinkAddress = remote
 
 		// Found a NIC.
 		n := r.ref.nic
@@ -1109,24 +1272,33 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		ok = ok && ref.isValidForOutgoingRLocked() && ref.tryIncRef()
 		n.mu.RUnlock()
 		if ok {
+			r.LocalLinkAddress = n.linkEP.LinkAddress()
+			r.RemoteLinkAddress = remote
 			r.RemoteAddress = src
 			// TODO(b/123449044): Update the source NIC as well.
 			ref.ep.HandlePacket(&r, pkt)
 			ref.decRef()
-		} else {
-			// n doesn't have a destination endpoint.
-			// Send the packet out of n.
-			pkt.Header = buffer.NewPrependableFromView(pkt.Data.First())
-			pkt.Data.RemoveFirst()
-
-			// TODO(b/128629022): use route.WritePacket.
-			if err := n.linkEP.WritePacket(&r, nil /* gso */, protocol, pkt); err != nil {
-				r.Stats().IP.OutgoingPacketErrors.Increment()
-			} else {
-				n.stats.Tx.Packets.Increment()
-				n.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size()))
+			r.Release()
+			return
+		}
+
+		// n doesn't have a destination endpoint.
+		// Send the packet out of n.
+		// TODO(b/128629022): move this logic to route.WritePacket.
+		if ch, err := r.Resolve(nil); err != nil {
+			if err == tcpip.ErrWouldBlock {
+				n.stack.forwarder.enqueue(ch, n, &r, protocol, pkt)
+				// forwarder will release route.
+				return
 			}
+			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
+			r.Release()
+			return
 		}
+
+		// The link-address resolution finished immediately.
+		n.forwardPacket(&r, protocol, pkt)
+		r.Release()
 		return
 	}
 
@@ -1136,9 +1308,37 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 }
 
+func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
+	// TODO(b/151227689): Avoid copying the packet when forwarding. We can do this
+	// by having lower layers explicity write each header instead of just
+	// pkt.Header.
+
+	// pkt may have set its NetworkHeader and TransportHeader. If we're
+	// forwarding, we'll have to copy them into pkt.Header.
+	pkt.Header = buffer.NewPrependable(int(n.linkEP.MaxHeaderLength()) + len(pkt.NetworkHeader) + len(pkt.TransportHeader))
+	if n := copy(pkt.Header.Prepend(len(pkt.TransportHeader)), pkt.TransportHeader); n != len(pkt.TransportHeader) {
+		panic(fmt.Sprintf("copied %d bytes, expected %d", n, len(pkt.TransportHeader)))
+	}
+	if n := copy(pkt.Header.Prepend(len(pkt.NetworkHeader)), pkt.NetworkHeader); n != len(pkt.NetworkHeader) {
+		panic(fmt.Sprintf("copied %d bytes, expected %d", n, len(pkt.NetworkHeader)))
+	}
+
+	// WritePacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Header.UsedLength() + pkt.Data.Size()
+
+	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return
+	}
+
+	n.stats.Tx.Packets.Increment()
+	n.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
+}
+
 // DeliverTransportPacket delivers the packets to the appropriate transport
 // protocol endpoint.
-func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) {
 	state, ok := n.stack.transportProtocols[protocol]
 	if !ok {
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
@@ -1152,12 +1352,31 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// validly formed.
 	n.stack.demux.deliverRawPacket(r, protocol, pkt)
 
-	if len(pkt.Data.First()) < transProto.MinimumPacketSize() {
+	// TransportHeader is nil only when pkt is an ICMP packet or was reassembled
+	// from fragments.
+	if pkt.TransportHeader == nil {
+		// TODO(gvisor.dev/issue/170): ICMP packets don't have their
+		// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+		// full explanation.
+		if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber {
+			transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize())
+			if !ok {
+				n.stack.stats.MalformedRcvdPackets.Increment()
+				return
+			}
+			pkt.TransportHeader = transHeader
+		} else {
+			// This is either a bad packet or was re-assembled from fragments.
+			transProto.Parse(pkt)
+		}
+	}
+
+	if len(pkt.TransportHeader) < transProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.TransportHeader)
 	if err != nil {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
@@ -1184,7 +1403,7 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 
 // DeliverTransportControlPacket delivers control packets to the appropriate
 // transport protocol endpoint.
-func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer) {
 	state, ok := n.stack.transportProtocols[trans]
 	if !ok {
 		return
@@ -1195,11 +1414,12 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp
 	// ICMPv4 only guarantees that 8 bytes of the transport protocol will
 	// be present in the payload. We know that the ports are within the
 	// first 8 bytes for all known transport protocols.
-	if len(pkt.Data.First()) < 8 {
+	transHeader, ok := pkt.Data.PullUp(8)
+	if !ok {
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
+	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
 	if err != nil {
 		return
 	}
@@ -1225,6 +1445,11 @@ func (n *NIC) Stack() *Stack {
 	return n.stack
 }
 
+// LinkEndpoint returns the link endpoint of n.
+func (n *NIC) LinkEndpoint() LinkEndpoint {
+	return n.linkEP
+}
+
 // isAddrTentative returns true if addr is tentative on n.
 //
 // Note that if addr is not associated with n, then this function will return
@@ -1242,10 +1467,12 @@ func (n *NIC) isAddrTentative(addr tcpip.Address) bool {
 	return ref.getKind() == permanentTentative
 }
 
-// dupTentativeAddrDetected attempts to inform n that a tentative addr
-// is a duplicate on a link.
+// dupTentativeAddrDetected attempts to inform n that a tentative addr is a
+// duplicate on a link.
 //
-// dupTentativeAddrDetected will delete the tentative address if it exists.
+// dupTentativeAddrDetected will remove the tentative address if it exists. If
+// the address was generated via SLAAC, an attempt will be made to generate a
+// new address.
 func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
@@ -1259,7 +1486,24 @@ func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	return n.removePermanentAddressLocked(addr)
+	// If the address is a SLAAC address, do not invalidate its SLAAC prefix as a
+	// new address will be generated for it.
+	if err := n.removePermanentIPv6EndpointLocked(ref, false /* allowSLAACInvalidation */); err != nil {
+		return err
+	}
+
+	prefix := ref.addrWithPrefix().Subnet()
+
+	switch ref.configType {
+	case slaac:
+		n.mu.ndp.regenerateSLAACAddr(prefix)
+	case slaacTemp:
+		// Do not reset the generation attempts counter for the prefix as the
+		// temporary address is being regenerated in response to a DAD conflict.
+		n.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */)
+	}
+
+	return nil
 }
 
 // setNDPConfigs sets the NDP configurations for n.
@@ -1355,9 +1599,14 @@ const (
 	// multicast group).
 	static networkEndpointConfigType = iota
 
-	// A slaac configured endpoint is an IPv6 endpoint that was
-	// added by SLAAC as per RFC 4862 section 5.5.3.
+	// A SLAAC configured endpoint is an IPv6 endpoint that was added by
+	// SLAAC as per RFC 4862 section 5.5.3.
 	slaac
+
+	// A temporary SLAAC configured endpoint is an IPv6 endpoint that was added by
+	// SLAAC as per RFC 4941. Temporary SLAAC addresses are short-lived and are
+	// not expected to be valid (or preferred) forever; hence the term temporary.
+	slaacTemp
 )
 
 type referencedNetworkEndpoint struct {
@@ -1387,6 +1636,13 @@ type referencedNetworkEndpoint struct {
 	deprecated bool
 }
 
+func (r *referencedNetworkEndpoint) addrWithPrefix() tcpip.AddressWithPrefix {
+	return tcpip.AddressWithPrefix{
+		Address:   r.ep.ID().LocalAddress,
+		PrefixLen: r.ep.PrefixLen(),
+	}
+}
+
 func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
 	return networkEndpointKind(atomic.LoadInt32((*int32)(&r.kind)))
 }
@@ -1411,7 +1667,14 @@ func (r *referencedNetworkEndpoint) isValidForOutgoing() bool {
 //
 // r's NIC must be read locked.
 func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool {
-	return r.getKind() != permanentExpired || r.nic.mu.spoofing
+	return r.nic.mu.enabled && (r.getKind() != permanentExpired || r.nic.mu.spoofing)
+}
+
+// expireLocked decrements the reference count and marks the permanent endpoint
+// as expired.
+func (r *referencedNetworkEndpoint) expireLocked() {
+	r.setKind(permanentExpired)
+	r.decRefLocked()
 }
 
 // decRef decrements the ref count and cleans up the endpoint once it reaches
@@ -1423,14 +1686,11 @@ func (r *referencedNetworkEndpoint) decRef() {
 }
 
 // decRefLocked is the same as decRef but assumes that the NIC.mu mutex is
-// locked. Returns true if the endpoint was removed.
-func (r *referencedNetworkEndpoint) decRefLocked() bool {
+// locked.
+func (r *referencedNetworkEndpoint) decRefLocked() {
 	if atomic.AddInt32(&r.refs, -1) == 0 {
 		r.nic.removeEndpointLocked(r)
-		return true
 	}
-
-	return false
 }
 
 // incRef increments the ref count. It must only be called when the caller is
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index edaee3b86..fea46158c 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -17,7 +17,6 @@ package stack
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 )
 
@@ -45,7 +44,7 @@ func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
 		t.FailNow()
 	}
 
-	nic.DeliverNetworkPacket(nil, "", "", 0, tcpip.PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
+	nic.DeliverNetworkPacket("", "", 0, &PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
 
 	if got := nic.stats.DisabledRx.Packets.Value(); got != 1 {
 		t.Errorf("got DisabledRx.Packets = %d, want = 1", got)
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index ab24372e7..1b5da6017 100644
--- a/pkg/tcpip/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -11,18 +11,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package tcpip
+package stack
 
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
 
 // A PacketBuffer contains all the data of a network packet.
 //
 // As a PacketBuffer traverses up the stack, it may be necessary to pass it to
 // multiple endpoints. Clone() should be called in such cases so that
 // modifications to the Data field do not affect other copies.
-//
-// +stateify savable
 type PacketBuffer struct {
+	_ noCopy
+
+	// PacketBufferEntry is used to build an intrusive list of
+	// PacketBuffers.
+	PacketBufferEntry
+
 	// Data holds the payload of the packet. For inbound packets, it also
 	// holds the headers, which are consumed as the packet moves up the
 	// stack. Headers are guaranteed not to be split across views.
@@ -31,16 +38,14 @@ type PacketBuffer struct {
 	// or otherwise modified.
 	Data buffer.VectorisedView
 
-	// DataOffset is used for GSO output. It is the offset into the Data
-	// field where the payload of this packet starts.
-	DataOffset int
-
-	// DataSize is used for GSO output. It is the size of this packet's
-	// payload.
-	DataSize int
-
 	// Header holds the headers of outbound packets. As a packet is passed
-	// down the stack, each layer adds to Header.
+	// down the stack, each layer adds to Header. Note that forwarded
+	// packets don't populate Headers on their way out -- their headers and
+	// payload are never parsed out and remain in Data.
+	//
+	// TODO(gvisor.dev/issue/170): Forwarded packets don't currently
+	// populate Header, but should. This will be doable once early parsing
+	// (https://github.com/google/gvisor/pull/1995) is supported.
 	Header buffer.Prependable
 
 	// These fields are used by both inbound and outbound packets. They
@@ -55,13 +60,56 @@ type PacketBuffer struct {
 	LinkHeader      buffer.View
 	NetworkHeader   buffer.View
 	TransportHeader buffer.View
+
+	// Hash is the transport layer hash of this packet. A value of zero
+	// indicates no valid hash has been set.
+	Hash uint32
+
+	// Owner is implemented by task to get the uid and gid.
+	// Only set for locally generated packets.
+	Owner tcpip.PacketOwner
+
+	// The following fields are only set by the qdisc layer when the packet
+	// is added to a queue.
+	EgressRoute           *Route
+	GSOOptions            *GSO
+	NetworkProtocolNumber tcpip.NetworkProtocolNumber
+
+	// NatDone indicates if the packet has been manipulated as per NAT
+	// iptables rule.
+	NatDone bool
 }
 
 // Clone makes a copy of pk. It clones the Data field, which creates a new
 // VectorisedView but does not deep copy the underlying bytes.
 //
 // Clone also does not deep copy any of its other fields.
-func (pk PacketBuffer) Clone() PacketBuffer {
-	pk.Data = pk.Data.Clone(nil)
-	return pk
+//
+// FIXME(b/153685824): Data gets copied but not other header references.
+func (pk *PacketBuffer) Clone() *PacketBuffer {
+	return &PacketBuffer{
+		PacketBufferEntry:     pk.PacketBufferEntry,
+		Data:                  pk.Data.Clone(nil),
+		Header:                pk.Header,
+		LinkHeader:            pk.LinkHeader,
+		NetworkHeader:         pk.NetworkHeader,
+		TransportHeader:       pk.TransportHeader,
+		Hash:                  pk.Hash,
+		Owner:                 pk.Owner,
+		EgressRoute:           pk.EgressRoute,
+		GSOOptions:            pk.GSOOptions,
+		NetworkProtocolNumber: pk.NetworkProtocolNumber,
+		NatDone:               pk.NatDone,
+	}
 }
+
+// noCopy may be embedded into structs which must not be copied
+// after the first use.
+//
+// See https://golang.org/issues/8005#issuecomment-190753527
+// for details.
+type noCopy struct{}
+
+// Lock is a no-op used by -copylocks checker from `go vet`.
+func (*noCopy) Lock()   {}
+func (*noCopy) Unlock() {}
diff --git a/pkg/tcpip/stack/rand.go b/pkg/tcpip/stack/rand.go
new file mode 100644
index 000000000..421fb5c15
--- /dev/null
+++ b/pkg/tcpip/stack/rand.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	mathrand "math/rand"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// lockedRandomSource provides a threadsafe rand.Source.
+type lockedRandomSource struct {
+	mu  sync.Mutex
+	src mathrand.Source
+}
+
+func (r *lockedRandomSource) Int63() (n int64) {
+	r.mu.Lock()
+	n = r.src.Int63()
+	r.mu.Unlock()
+	return n
+}
+
+func (r *lockedRandomSource) Seed(seed int64) {
+	r.mu.Lock()
+	r.src.Seed(seed)
+	r.mu.Unlock()
+}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index ec91f60dd..5cbc946b6 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -67,17 +67,18 @@ type TransportEndpoint interface {
 	// this transport endpoint. It sets pkt.TransportHeader.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer)
 
 	// HandleControlPacket is called by the stack when new control (e.g.
 	// ICMP) packets arrive to this transport endpoint.
 	// HandleControlPacket takes ownership of pkt.
-	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
+	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer)
 
-	// Close puts the endpoint in a closed state and frees all resources
-	// associated with it. This cleanup may happen asynchronously. Wait can
-	// be used to block on this asynchronous cleanup.
-	Close()
+	// Abort initiates an expedited endpoint teardown. It puts the endpoint
+	// in a closed state and frees all resources associated with it. This
+	// cleanup may happen asynchronously. Wait can be used to block on this
+	// asynchronous cleanup.
+	Abort()
 
 	// Wait waits for any worker goroutines owned by the endpoint to stop.
 	//
@@ -99,7 +100,7 @@ type RawTransportEndpoint interface {
 	// layer up.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, pkt *PacketBuffer)
 }
 
 // PacketEndpoint is the interface that needs to be implemented by packet
@@ -117,7 +118,7 @@ type PacketEndpoint interface {
 	// should construct its own ethernet header for applications.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
 }
 
 // TransportProtocol is the interface that needs to be implemented by transport
@@ -149,7 +150,7 @@ type TransportProtocol interface {
 	// stats purposes only).
 	//
 	// HandleUnknownDestinationPacket takes ownership of pkt.
-	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
@@ -160,6 +161,18 @@ type TransportProtocol interface {
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
 	Option(option interface{}) *tcpip.Error
+
+	// Close requests that any worker goroutines owned by the protocol
+	// stop.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the protocol to stop.
+	Wait()
+
+	// Parse sets pkt.TransportHeader and trims pkt.Data appropriately. It does
+	// neither and returns false if pkt.Data is too small, i.e. pkt.Data.Size() <
+	// MinimumPacketSize()
+	Parse(pkt *PacketBuffer) (ok bool)
 }
 
 // TransportDispatcher contains the methods used by the network stack to deliver
@@ -172,7 +185,7 @@ type TransportDispatcher interface {
 	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
 	//
 	// DeliverTransportPacket takes ownership of pkt.
-	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer)
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer)
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
@@ -181,7 +194,7 @@ type TransportDispatcher interface {
 	// DeliverTransportControlPacket.
 	//
 	// DeliverTransportControlPacket takes ownership of pkt.
-	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
+	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer)
 }
 
 // PacketLooping specifies where an outbound packet should be sent.
@@ -232,17 +245,18 @@ type NetworkEndpoint interface {
 	MaxHeaderLength() uint16
 
 	// WritePacket writes a packet to the given destination address and
-	// protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have
-	// already been set.
-	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error
+	// protocol. It takes ownership of pkt. pkt.TransportHeader must have already
+	// been set.
+	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets to the given destination address and
-	// protocol. pkts must not be zero length.
-	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error)
+	// protocol. pkts must not be zero length. It takes ownership of pkts and
+	// underlying packets.
+	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error)
 
 	// WriteHeaderIncludedPacket writes a packet that includes a network
-	// header to the given destination address.
-	WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error
+	// header to the given destination address. It takes ownership of pkt.
+	WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error
 
 	// ID returns the network protocol endpoint ID.
 	ID() *NetworkEndpointID
@@ -257,10 +271,14 @@ type NetworkEndpoint interface {
 	// this network endpoint. It sets pkt.NetworkHeader.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
+	HandlePacket(r *Route, pkt *PacketBuffer)
 
 	// Close is called when the endpoint is reomved from a stack.
 	Close()
+
+	// NetworkProtocolNumber returns the tcpip.NetworkProtocolNumber for
+	// this endpoint.
+	NetworkProtocolNumber() tcpip.NetworkProtocolNumber
 }
 
 // NetworkProtocol is the interface that needs to be implemented by network
@@ -277,7 +295,7 @@ type NetworkProtocol interface {
 	// DefaultPrefixLen returns the protocol's default prefix length.
 	DefaultPrefixLen() int
 
-	// ParsePorts returns the source and destination addresses stored in a
+	// ParseAddresses returns the source and destination addresses stored in a
 	// packet of this protocol.
 	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
 
@@ -293,6 +311,21 @@ type NetworkProtocol interface {
 	// Option returns an error if the option is not supported or the
 	// provided option value is invalid.
 	Option(option interface{}) *tcpip.Error
+
+	// Close requests that any worker goroutines owned by the protocol
+	// stop.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the protocol to stop.
+	Wait()
+
+	// Parse sets pkt.NetworkHeader and trims pkt.Data appropriately. It
+	// returns:
+	// - The encapsulated protocol, if present.
+	// - Whether there is an encapsulated transport protocol payload (e.g. ARP
+	//   does not encapsulate anything).
+	// - Whether pkt.Data was large enough to parse and set pkt.NetworkHeader.
+	Parse(pkt *PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool)
 }
 
 // NetworkDispatcher contains the methods used by the network stack to deliver
@@ -307,7 +340,7 @@ type NetworkDispatcher interface {
 	// packets sent via loopback), and won't have the field set.
 	//
 	// DeliverNetworkPacket takes ownership of pkt.
-	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
 }
 
 // LinkEndpointCapabilities is the type associated with the capabilities
@@ -339,7 +372,7 @@ const (
 // LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
 // ethernet, loopback, raw) and used by network layer protocols to send packets
 // out through the implementer's data link endpoint. When a link header exists,
-// it sets each tcpip.PacketBuffer's LinkHeader field before passing it up the
+// it sets each PacketBuffer's LinkHeader field before passing it up the
 // stack.
 type LinkEndpoint interface {
 	// MTU is the maximum transmission unit for this endpoint. This is
@@ -363,29 +396,32 @@ type LinkEndpoint interface {
 	LinkAddress() tcpip.LinkAddress
 
 	// WritePacket writes a packet with the given protocol through the
-	// given route. It sets pkt.LinkHeader if a link layer header exists.
-	// pkt.NetworkHeader and pkt.TransportHeader must have already been
-	// set.
+	// given route. It takes ownership of pkt. pkt.NetworkHeader and
+	// pkt.TransportHeader must have already been set.
 	//
 	// To participate in transparent bridging, a LinkEndpoint implementation
 	// should call eth.Encode with header.EthernetFields.SrcAddr set to
 	// r.LocalLinkAddress if it is provided.
-	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) *tcpip.Error
+	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets with the given protocol through the
-	// given route. pkts must not be zero length.
+	// given route. pkts must not be zero length. It takes ownership of pkts and
+	// underlying packets.
 	//
 	// Right now, WritePackets is used only when the software segmentation
 	// offload is enabled. If it will be used for something else, it may
 	// require to change syscall filters.
-	WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
 
 	// WriteRawPacket writes a packet directly to the link. The packet
-	// should already have an ethernet header.
+	// should already have an ethernet header. It takes ownership of vv.
 	WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error
 
 	// Attach attaches the data link layer endpoint to the network-layer
 	// dispatcher of the stack.
+	//
+	// Attach will be called with a nil dispatcher if the receiver's associated
+	// NIC is being removed.
 	Attach(dispatcher NetworkDispatcher)
 
 	// IsAttached returns whether a NetworkDispatcher is attached to the
@@ -408,7 +444,7 @@ type InjectableLinkEndpoint interface {
 	LinkEndpoint
 
 	// InjectInbound injects an inbound packet.
-	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
+	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
 
 	// InjectOutbound writes a fully formed outbound packet directly to the
 	// link.
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index f565aafb2..f5b6ca0b9 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -153,54 +153,66 @@ func (r *Route) IsResolutionRequired() bool {
 }
 
 // WritePacket writes the packet through the given route.
-func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
 
+	// WritePacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Header.UsedLength() + pkt.Data.Size()
+
 	err := r.ref.ep.WritePacket(r, gso, params, pkt)
 	if err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 	} else {
 		r.ref.nic.stats.Tx.Packets.Increment()
-		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size()))
+		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
 	}
 	return err
 }
 
-// WritePackets writes the set of packets through the given route.
-func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error) {
+// WritePackets writes a list of n packets through the given route and returns
+// the number of packets written.
+func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
 	if !r.ref.isValidForOutgoing() {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
+	// WritePackets takes ownership of pkt, calculate length first.
+	numPkts := pkts.Len()
+
 	n, err := r.ref.ep.WritePackets(r, gso, pkts, params)
 	if err != nil {
-		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(pkts) - n))
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(numPkts - n))
 	}
 	r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
-	payloadSize := 0
-	for i := 0; i < n; i++ {
-		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkts[i].Header.UsedLength()))
-		payloadSize += pkts[i].DataSize
+
+	writtenBytes := 0
+	for i, pb := 0, pkts.Front(); i < n && pb != nil; i, pb = i+1, pb.Next() {
+		writtenBytes += pb.Header.UsedLength()
+		writtenBytes += pb.Data.Size()
 	}
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payloadSize))
+
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes))
 	return n, err
 }
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (r *Route) WriteHeaderIncludedPacket(pkt tcpip.PacketBuffer) *tcpip.Error {
+func (r *Route) WriteHeaderIncludedPacket(pkt *PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
 
+	// WriteHeaderIncludedPacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Data.Size()
+
 	if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return err
 	}
 	r.ref.nic.stats.Tx.Packets.Increment()
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
 	return nil
 }
 
@@ -214,6 +226,12 @@ func (r *Route) MTU() uint32 {
 	return r.ref.ep.MTU()
 }
 
+// NetworkProtocolNumber returns the NetworkProtocolNumber of the underlying
+// network endpoint.
+func (r *Route) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return r.ref.ep.NetworkProtocolNumber()
+}
+
 // Release frees all resources associated with the route.
 func (r *Route) Release() {
 	if r.ref != nil {
@@ -252,3 +270,16 @@ func (r *Route) MakeLoopedRoute() Route {
 func (r *Route) Stack() *Stack {
 	return r.ref.stack()
 }
+
+// ReverseRoute returns new route with given source and destination address.
+func (r *Route) ReverseRoute(src tcpip.Address, dst tcpip.Address) Route {
+	return Route{
+		NetProto:          r.NetProto,
+		LocalAddress:      dst,
+		LocalLinkAddress:  r.RemoteLinkAddress,
+		RemoteAddress:     src,
+		RemoteLinkAddress: r.LocalLinkAddress,
+		ref:               r.ref,
+		Loop:              r.Loop,
+	}
+}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6eac16e16..294ce8775 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -20,7 +20,9 @@
 package stack
 
 import (
+	"bytes"
 	"encoding/binary"
+	mathrand "math/rand"
 	"sync/atomic"
 	"time"
 
@@ -31,7 +33,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -51,7 +52,7 @@ const (
 
 type transportProtocolState struct {
 	proto          TransportProtocol
-	defaultHandler func(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
+	defaultHandler func(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool
 }
 
 // TCPProbeFunc is the expected function type for a TCP probe function to be
@@ -234,11 +235,11 @@ type RcvBufAutoTuneParams struct {
 	// was started.
 	MeasureTime time.Time
 
-	// CopiedBytes is the number of bytes copied to user space since
+	// CopiedBytes is the number of bytes copied to userspace since
 	// this measure began.
 	CopiedBytes int
 
-	// PrevCopiedBytes is the number of bytes copied to user space in
+	// PrevCopiedBytes is the number of bytes copied to userspace in
 	// the previous RTT period.
 	PrevCopiedBytes int
 
@@ -423,12 +424,8 @@ type Stack struct {
 	// handleLocal allows non-loopback interfaces to loop packets.
 	handleLocal bool
 
-	// tablesMu protects iptables.
-	tablesMu sync.RWMutex
-
-	// tables are the iptables packet filtering and manipulation rules. The are
-	// protected by tablesMu.`
-	tables iptables.IPTables
+	// tables are the iptables packet filtering and manipulation rules.
+	tables *IPTables
 
 	// resumableEndpoints is a list of endpoints that need to be resumed if the
 	// stack is being restored.
@@ -462,6 +459,18 @@ type Stack struct {
 	// opaqueIIDOpts hold the options for generating opaque interface identifiers
 	// (IIDs) as outlined by RFC 7217.
 	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// tempIIDSeed is used to seed the initial temporary interface identifier
+	// history value used to generate IIDs for temporary SLAAC addresses.
+	tempIIDSeed []byte
+
+	// forwarder holds the packets that wait for their link-address resolutions
+	// to complete, and forwards them when each resolution is done.
+	forwarder *forwardQueue
+
+	// randomGenerator is an injectable pseudo random generator that can be
+	// used when a random number is required.
+	randomGenerator *mathrand.Rand
 }
 
 // UniqueID is an abstract generator of unique identifiers.
@@ -522,9 +531,31 @@ type Options struct {
 	// this is non-nil.
 	RawFactory RawFactory
 
-	// OpaqueIIDOpts hold the options for generating opaque interface identifiers
-	// (IIDs) as outlined by RFC 7217.
+	// OpaqueIIDOpts hold the options for generating opaque interface
+	// identifiers (IIDs) as outlined by RFC 7217.
 	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// RandSource is an optional source to use to generate random
+	// numbers. If omitted it defaults to a Source seeded by the data
+	// returned by rand.Read().
+	//
+	// RandSource must be thread-safe.
+	RandSource mathrand.Source
+
+	// TempIIDSeed is used to seed the initial temporary interface identifier
+	// history value used to generate IIDs for temporary SLAAC addresses.
+	//
+	// Temporary SLAAC adresses are short-lived addresses which are unpredictable
+	// and random from the perspective of other nodes on the network. It is
+	// recommended that the seed be a random byte buffer of at least
+	// header.IIDSize bytes to make sure that temporary SLAAC addresses are
+	// sufficiently random. It should follow minimum randomness requirements for
+	// security as outlined by RFC 4086.
+	//
+	// Note: using a nil value, the same seed across netstack program runs, or a
+	// seed that is too small would reduce randomness and increase predictability,
+	// defeating the purpose of temporary SLAAC addresses.
+	TempIIDSeed []byte
 }
 
 // TransportEndpointInfo holds useful information about a transport endpoint
@@ -551,11 +582,13 @@ type TransportEndpointInfo struct {
 	RegisterNICID tcpip.NICID
 }
 
-// AddrNetProto unwraps the specified address if it is a V4-mapped V6 address
-// and returns the network protocol number to be used to communicate with the
-// specified address. It returns an error if the passed address is incompatible
-// with the receiver.
-func (e *TransportEndpointInfo) AddrNetProto(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+// AddrNetProtoLocked unwraps the specified address if it is a V4-mapped V6
+// address and returns the network protocol number to be used to communicate
+// with the specified address. It returns an error if the passed address is
+// incompatible with the receiver.
+//
+// Preconditon: the parent endpoint mu must be held while calling this method.
+func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
 	netProto := e.NetProto
 	switch len(addr.Addr) {
 	case header.IPv4AddressSize:
@@ -618,6 +651,13 @@ func New(opts Options) *Stack {
 		opts.UniqueID = new(uniqueIDGenerator)
 	}
 
+	randSrc := opts.RandSource
+	if randSrc == nil {
+		// Source provided by mathrand.NewSource is not thread-safe so
+		// we wrap it in a simple thread-safe version.
+		randSrc = &lockedRandomSource{src: mathrand.NewSource(generateRandInt64())}
+	}
+
 	// Make sure opts.NDPConfigs contains valid values only.
 	opts.NDPConfigs.validate()
 
@@ -632,6 +672,7 @@ func New(opts Options) *Stack {
 		clock:                clock,
 		stats:                opts.Stats.FillIn(),
 		handleLocal:          opts.HandleLocal,
+		tables:               DefaultTables(),
 		icmpRateLimiter:      NewICMPRateLimiter(),
 		seed:                 generateRandUint32(),
 		ndpConfigs:           opts.NDPConfigs,
@@ -639,6 +680,9 @@ func New(opts Options) *Stack {
 		uniqueIDGenerator:    opts.UniqueID,
 		ndpDisp:              opts.NDPDisp,
 		opaqueIIDOpts:        opts.OpaqueIIDOpts,
+		tempIIDSeed:          opts.TempIIDSeed,
+		forwarder:            newForwardQueue(),
+		randomGenerator:      mathrand.New(randSrc),
 	}
 
 	// Add specified network protocols.
@@ -731,7 +775,7 @@ func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber,
 //
 // It must be called only during initialization of the stack. Changing it as the
 // stack is operating is not supported.
-func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, tcpip.PacketBuffer) bool) {
+func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, *PacketBuffer) bool) {
 	state := s.transportProtocols[p]
 	if state != nil {
 		state.defaultHandler = h
@@ -881,6 +925,8 @@ type NICOptions struct {
 // CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and
 // NICOptions. See the documentation on type NICOptions for details on how
 // NICs can be configured.
+//
+// LinkEndpoint.Attach will be called to bind ep with a NetworkDispatcher.
 func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *tcpip.Error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -900,7 +946,6 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 	}
 
 	n := newNIC(s, id, opts.Name, ep, opts.Context)
-
 	s.nics[id] = n
 	if !opts.Disabled {
 		return n.enable()
@@ -910,34 +955,88 @@ func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOp
 }
 
 // CreateNIC creates a NIC with the provided id and LinkEndpoint and calls
-// `LinkEndpoint.Attach` to start delivering packets to it.
+// LinkEndpoint.Attach to bind ep with a NetworkDispatcher.
 func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
 
+// GetNICByName gets the NIC specified by name.
+func (s *Stack) GetNICByName(name string) (*NIC, bool) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	for _, nic := range s.nics {
+		if nic.Name() == name {
+			return nic, true
+		}
+	}
+	return nil, false
+}
+
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
 // delivering packets to it.
 func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 
-	nic := s.nics[id]
-	if nic == nil {
+	nic, ok := s.nics[id]
+	if !ok {
 		return tcpip.ErrUnknownNICID
 	}
 
 	return nic.enable()
 }
 
+// DisableNIC disables the given NIC.
+func (s *Stack) DisableNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+
+	return nic.disable()
+}
+
 // CheckNIC checks if a NIC is usable.
 func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	s.mu.RLock()
+	defer s.mu.RUnlock()
+
 	nic, ok := s.nics[id]
-	s.mu.RUnlock()
-	if ok {
-		return nic.linkEP.IsAttached()
+	if !ok {
+		return false
+	}
+
+	return nic.enabled()
+}
+
+// RemoveNIC removes NIC and all related routes from the network stack.
+func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+	delete(s.nics, id)
+
+	// Remove routes in-place. n tracks the number of routes written.
+	n := 0
+	for i, r := range s.routeTable {
+		if r.NIC != id {
+			// Keep this route.
+			if i > n {
+				s.routeTable[n] = r
+			}
+			n++
+		}
 	}
-	return false
+	s.routeTable = s.routeTable[:n]
+
+	return nic.remove()
 }
 
 // NICAddressRanges returns a map of NICIDs to their associated subnets.
@@ -989,7 +1088,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
 	for id, nic := range s.nics {
 		flags := NICStateFlags{
 			Up:          true, // Netstack interfaces are always up.
-			Running:     nic.linkEP.IsAttached(),
+			Running:     nic.enabled(),
 			Promiscuous: nic.isPromiscuousMode(),
 			Loopback:    nic.isLoopback(),
 		}
@@ -1151,7 +1250,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
 	needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
 	if id != 0 && !needRoute {
-		if nic, ok := s.nics[id]; ok {
+		if nic, ok := s.nics[id]; ok && nic.enabled() {
 			if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
 				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
 			}
@@ -1161,7 +1260,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
 			if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr)) {
 				continue
 			}
-			if nic, ok := s.nics[route.NIC]; ok {
+			if nic, ok := s.nics[route.NIC]; ok && nic.enabled() {
 				if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
 					if len(remoteAddr) == 0 {
 						// If no remote address was provided, then the route
@@ -1391,7 +1490,13 @@ func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
 // Endpoints created or modified during this call may not get closed.
 func (s *Stack) Close() {
 	for _, e := range s.RegisteredEndpoints() {
-		e.Close()
+		e.Abort()
+	}
+	for _, p := range s.transportProtocols {
+		p.proto.Close()
+	}
+	for _, p := range s.networkProtocols {
+		p.Close()
 	}
 }
 
@@ -1409,6 +1514,12 @@ func (s *Stack) Wait() {
 	for _, e := range s.CleanupEndpoints() {
 		e.Wait()
 	}
+	for _, p := range s.transportProtocols {
+		p.proto.Wait()
+	}
+	for _, p := range s.networkProtocols {
+		p.Wait()
+	}
 
 	s.mu.RLock()
 	defer s.mu.RUnlock()
@@ -1614,19 +1725,21 @@ func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NIC
 	return tcpip.ErrUnknownNICID
 }
 
-// IPTables returns the stack's iptables.
-func (s *Stack) IPTables() iptables.IPTables {
-	s.tablesMu.RLock()
-	t := s.tables
-	s.tablesMu.RUnlock()
-	return t
+// IsInGroup returns true if the NIC with ID nicID has joined the multicast
+// group multicastAddr.
+func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic, ok := s.nics[nicID]; ok {
+		return nic.isInGroup(multicastAddr), nil
+	}
+	return false, tcpip.ErrUnknownNICID
 }
 
-// SetIPTables sets the stack's iptables.
-func (s *Stack) SetIPTables(ipt iptables.IPTables) {
-	s.tablesMu.Lock()
-	s.tables = ipt
-	s.tablesMu.Unlock()
+// IPTables returns the stack's iptables.
+func (s *Stack) IPTables() *IPTables {
+	return s.tables
 }
 
 // ICMPLimit returns the maximum number of ICMP messages that can be sent
@@ -1733,6 +1846,12 @@ func (s *Stack) Seed() uint32 {
 	return s.seed
 }
 
+// Rand returns a reference to a pseudo random generator that can be used
+// to generate random numbers as required.
+func (s *Stack) Rand() *mathrand.Rand {
+	return s.randomGenerator
+}
+
 func generateRandUint32() uint32 {
 	b := make([]byte, 4)
 	if _, err := rand.Read(b); err != nil {
@@ -1740,3 +1859,49 @@ func generateRandUint32() uint32 {
 	}
 	return binary.LittleEndian.Uint32(b)
 }
+
+func generateRandInt64() int64 {
+	b := make([]byte, 8)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	buf := bytes.NewReader(b)
+	var v int64
+	if err := binary.Read(buf, binary.LittleEndian, &v); err != nil {
+		panic(err)
+	}
+	return v
+}
+
+// FindNetworkEndpoint returns the network endpoint for the given address.
+func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, address tcpip.Address) (NetworkEndpoint, *tcpip.Error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	for _, nic := range s.nics {
+		id := NetworkEndpointID{address}
+
+		if ref, ok := nic.mu.endpoints[id]; ok {
+			nic.mu.RLock()
+			defer nic.mu.RUnlock()
+
+			// An endpoint with this id exists, check if it can be
+			// used and return it.
+			return ref.ep, nil
+		}
+	}
+	return nil, tcpip.ErrBadAddress
+}
+
+// FindNICNameFromID returns the name of the nic for the given NICID.
+func (s *Stack) FindNICNameFromID(id tcpip.NICID) string {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return ""
+	}
+
+	return nic.Name()
+}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 7ba604442..ffef9bc2c 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
@@ -51,6 +52,10 @@ const (
 	// where another value is explicitly used. It is chosen to match the MTU
 	// of loopback interfaces on linux systems.
 	defaultMTU = 65536
+
+	dstAddrOffset        = 0
+	srcAddrOffset        = 1
+	protocolNumberOffset = 2
 )
 
 // fakeNetworkEndpoint is a network-layer protocol endpoint. It counts sent and
@@ -89,28 +94,28 @@ func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 	// Increment the received packet count in the protocol descriptor.
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
-	// Consume the network header.
-	b := pkt.Data.First()
-	pkt.Data.TrimFront(fakeNetHeaderLen)
-
 	// Handle control packets.
-	if b[2] == uint8(fakeControlProtocol) {
-		nb := pkt.Data.First()
-		if len(nb) < fakeNetHeaderLen {
+	if pkt.NetworkHeader[protocolNumberOffset] == uint8(fakeControlProtocol) {
+		nb, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+		if !ok {
 			return
 		}
-
 		pkt.Data.TrimFront(fakeNetHeaderLen)
-		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, pkt)
+		f.dispatcher.DeliverTransportControlPacket(
+			tcpip.Address(nb[srcAddrOffset:srcAddrOffset+1]),
+			tcpip.Address(nb[dstAddrOffset:dstAddrOffset+1]),
+			fakeNetNumber,
+			tcpip.TransportProtocolNumber(nb[protocolNumberOffset]),
+			stack.ControlPortUnreachable, 0, pkt)
 		return
 	}
 
 	// Dispatch the packet to the transport protocol.
-	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), pkt)
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), pkt)
 }
 
 func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 {
@@ -125,24 +130,23 @@ func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
 	return f.ep.Capabilities()
 }
 
-func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (f *fakeNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return f.proto.Number()
+}
+
+func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	// Increment the sent packet count in the protocol descriptor.
 	f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++
 
 	// Add the protocol's header to the packet and send it to the link
 	// endpoint.
-	b := pkt.Header.Prepend(fakeNetHeaderLen)
-	b[0] = r.RemoteAddress[0]
-	b[1] = f.id.LocalAddress[0]
-	b[2] = byte(params.Protocol)
+	pkt.NetworkHeader = pkt.Header.Prepend(fakeNetHeaderLen)
+	pkt.NetworkHeader[dstAddrOffset] = r.RemoteAddress[0]
+	pkt.NetworkHeader[srcAddrOffset] = f.id.LocalAddress[0]
+	pkt.NetworkHeader[protocolNumberOffset] = byte(params.Protocol)
 
 	if r.Loop&stack.PacketLoop != 0 {
-		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
-		views[0] = pkt.Header.View()
-		views = append(views, pkt.Data.Views()...)
-		f.HandlePacket(r, tcpip.PacketBuffer{
-			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
-		})
+		f.HandlePacket(r, pkt)
 	}
 	if r.Loop&stack.PacketOut == 0 {
 		return nil
@@ -152,11 +156,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
 }
 
 // WritePackets implements stack.LinkEndpoint.WritePackets.
-func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
 	panic("not implemented")
 }
 
-func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -198,7 +202,7 @@ func (f *fakeNetworkProtocol) PacketCount(intfAddr byte) int {
 }
 
 func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
-	return tcpip.Address(v[1:2]), tcpip.Address(v[0:1])
+	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
 }
 
 func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
@@ -234,10 +238,44 @@ func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Close implements TransportProtocol.Close.
+func (*fakeNetworkProtocol) Close() {}
+
+// Wait implements TransportProtocol.Wait.
+func (*fakeNetworkProtocol) Wait() {}
+
+// Parse implements TransportProtocol.Parse.
+func (*fakeNetworkProtocol) Parse(pkt *stack.PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	hdr, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+	if !ok {
+		return 0, false, false
+	}
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(fakeNetHeaderLen)
+	return tcpip.TransportProtocolNumber(hdr[protocolNumberOffset]), true, true
+}
+
 func fakeNetFactory() stack.NetworkProtocol {
 	return &fakeNetworkProtocol{}
 }
 
+// linkEPWithMockedAttach is a stack.LinkEndpoint that tests can use to verify
+// that LinkEndpoint.Attach was called.
+type linkEPWithMockedAttach struct {
+	stack.LinkEndpoint
+	attached bool
+}
+
+// Attach implements stack.LinkEndpoint.Attach.
+func (l *linkEPWithMockedAttach) Attach(d stack.NetworkDispatcher) {
+	l.LinkEndpoint.Attach(d)
+	l.attached = d != nil
+}
+
+func (l *linkEPWithMockedAttach) isAttached() bool {
+	return l.attached
+}
+
 func TestNetworkReceive(t *testing.T) {
 	// Create a stack with the fake network protocol, one nic, and two
 	// addresses attached to it: 1 & 2.
@@ -262,8 +300,8 @@ func TestNetworkReceive(t *testing.T) {
 	buf := buffer.NewView(30)
 
 	// Make sure packet with wrong address is not delivered.
-	buf[0] = 3
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	buf[dstAddrOffset] = 3
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 0 {
@@ -274,8 +312,8 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is delivered to first endpoint.
-	buf[0] = 1
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	buf[dstAddrOffset] = 1
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -286,8 +324,8 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is delivered to second endpoint.
-	buf[0] = 2
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	buf[dstAddrOffset] = 2
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -298,7 +336,7 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is not delivered if protocol number is wrong.
-	ep.InjectInbound(fakeNetNumber-1, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber-1, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -310,7 +348,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet that is too small is dropped.
 	buf.CapLength(2)
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -332,7 +370,7 @@ func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Erro
 
 func send(r stack.Route, payload buffer.View) *tcpip.Error {
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()))
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	})
@@ -390,7 +428,7 @@ func testFailingRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte b
 
 func testRecvInternal(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View, want int) {
 	t.Helper()
-	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if got := fakeNet.PacketCount(localAddrByte); got != want {
@@ -509,6 +547,340 @@ func testNoRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr
 	}
 }
 
+// TestAttachToLinkEndpointImmediately tests that a LinkEndpoint is attached to
+// a NetworkDispatcher when the NIC is created.
+func TestAttachToLinkEndpointImmediately(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name    string
+		nicOpts stack.NICOptions
+	}{
+		{
+			name:    "Create enabled NIC",
+			nicOpts: stack.NICOptions{Disabled: false},
+		},
+		{
+			name:    "Create disabled NIC",
+			nicOpts: stack.NICOptions{Disabled: true},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+
+			e := linkEPWithMockedAttach{
+				LinkEndpoint: loopback.New(),
+			}
+
+			if err := s.CreateNICWithOptions(nicID, &e, test.nicOpts); err != nil {
+				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, test.nicOpts, err)
+			}
+			if !e.isAttached() {
+				t.Fatal("link endpoint not attached to a network dispatcher")
+			}
+		})
+	}
+}
+
+func TestDisableUnknownNIC(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	if err := s.DisableNIC(1); err != tcpip.ErrUnknownNICID {
+		t.Fatalf("got s.DisableNIC(1) = %v, want = %s", err, tcpip.ErrUnknownNICID)
+	}
+}
+
+func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	e := loopback.New()
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	checkNIC := func(enabled bool) {
+		t.Helper()
+
+		allNICInfo := s.NICInfo()
+		nicInfo, ok := allNICInfo[nicID]
+		if !ok {
+			t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo)
+		} else if nicInfo.Flags.Running != enabled {
+			t.Errorf("got nicInfo.Flags.Running = %t, want = %t", nicInfo.Flags.Running, enabled)
+		}
+
+		if got := s.CheckNIC(nicID); got != enabled {
+			t.Errorf("got s.CheckNIC(%d) = %t, want = %t", nicID, got, enabled)
+		}
+	}
+
+	// NIC should initially report itself as disabled.
+	checkNIC(false)
+
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	checkNIC(true)
+
+	// If the NIC is not reporting a correct enabled status, we cannot trust the
+	// next check so end the test here.
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	checkNIC(false)
+}
+
+func TestRemoveUnknownNIC(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	if err := s.RemoveNIC(1); err != tcpip.ErrUnknownNICID {
+		t.Fatalf("got s.RemoveNIC(1) = %v, want = %s", err, tcpip.ErrUnknownNICID)
+	}
+}
+
+func TestRemoveNIC(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	e := linkEPWithMockedAttach{
+		LinkEndpoint: loopback.New(),
+	}
+	if err := s.CreateNIC(nicID, &e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	// NIC should be present in NICInfo and attached to a NetworkDispatcher.
+	allNICInfo := s.NICInfo()
+	if _, ok := allNICInfo[nicID]; !ok {
+		t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo)
+	}
+	if !e.isAttached() {
+		t.Fatal("link endpoint not attached to a network dispatcher")
+	}
+
+	// Removing a NIC should remove it from NICInfo and e should be detached from
+	// the NetworkDispatcher.
+	if err := s.RemoveNIC(nicID); err != nil {
+		t.Fatalf("s.RemoveNIC(%d): %s", nicID, err)
+	}
+	if nicInfo, ok := s.NICInfo()[nicID]; ok {
+		t.Errorf("got unexpected NICInfo entry for deleted NIC %d = %+v", nicID, nicInfo)
+	}
+	if e.isAttached() {
+		t.Error("link endpoint for removed NIC still attached to a network dispatcher")
+	}
+}
+
+func TestRouteWithDownNIC(t *testing.T) {
+	tests := []struct {
+		name   string
+		downFn func(s *stack.Stack, nicID tcpip.NICID) *tcpip.Error
+		upFn   func(s *stack.Stack, nicID tcpip.NICID) *tcpip.Error
+	}{
+		{
+			name:   "Disabled NIC",
+			downFn: (*stack.Stack).DisableNIC,
+			upFn:   (*stack.Stack).EnableNIC,
+		},
+
+		// Once a NIC is removed, it cannot be brought up.
+		{
+			name:   "Removed NIC",
+			downFn: (*stack.Stack).RemoveNIC,
+		},
+	}
+
+	const unspecifiedNIC = 0
+	const nicID1 = 1
+	const nicID2 = 2
+	const addr1 = tcpip.Address("\x01")
+	const addr2 = tcpip.Address("\x02")
+	const nic1Dst = tcpip.Address("\x05")
+	const nic2Dst = tcpip.Address("\x06")
+
+	setup := func(t *testing.T) (*stack.Stack, *channel.Endpoint, *channel.Endpoint) {
+		s := stack.New(stack.Options{
+			NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		})
+
+		ep1 := channel.New(1, defaultMTU, "")
+		if err := s.CreateNIC(nicID1, ep1); err != nil {
+			t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+		}
+
+		if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
+			t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
+		}
+
+		ep2 := channel.New(1, defaultMTU, "")
+		if err := s.CreateNIC(nicID2, ep2); err != nil {
+			t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+		}
+
+		if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
+			t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+		}
+
+		// Set a route table that sends all packets with odd destination
+		// addresses through the first NIC, and all even destination address
+		// through the second one.
+		{
+			subnet0, err := tcpip.NewSubnet("\x00", "\x01")
+			if err != nil {
+				t.Fatal(err)
+			}
+			subnet1, err := tcpip.NewSubnet("\x01", "\x01")
+			if err != nil {
+				t.Fatal(err)
+			}
+			s.SetRouteTable([]tcpip.Route{
+				{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
+				{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
+			})
+		}
+
+		return s, ep1, ep2
+	}
+
+	// Tests that routes through a down NIC are not used when looking up a route
+	// for a destination.
+	t.Run("Find", func(t *testing.T) {
+		for _, test := range tests {
+			t.Run(test.name, func(t *testing.T) {
+				s, _, _ := setup(t)
+
+				// Test routes to odd address.
+				testRoute(t, s, unspecifiedNIC, "", "\x05", addr1)
+				testRoute(t, s, unspecifiedNIC, addr1, "\x05", addr1)
+				testRoute(t, s, nicID1, addr1, "\x05", addr1)
+
+				// Test routes to even address.
+				testRoute(t, s, unspecifiedNIC, "", "\x06", addr2)
+				testRoute(t, s, unspecifiedNIC, addr2, "\x06", addr2)
+				testRoute(t, s, nicID2, addr2, "\x06", addr2)
+
+				// Bringing NIC1 down should result in no routes to odd addresses. Routes to
+				// even addresses should continue to be available as NIC2 is still up.
+				if err := test.downFn(s, nicID1); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID1, err)
+				}
+				testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+				testNoRoute(t, s, nicID1, addr1, nic1Dst)
+				testRoute(t, s, unspecifiedNIC, "", nic2Dst, addr2)
+				testRoute(t, s, unspecifiedNIC, addr2, nic2Dst, addr2)
+				testRoute(t, s, nicID2, addr2, nic2Dst, addr2)
+
+				// Bringing NIC2 down should result in no routes to even addresses. No
+				// route should be available to any address as routes to odd addresses
+				// were made unavailable by bringing NIC1 down above.
+				if err := test.downFn(s, nicID2); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID2, err)
+				}
+				testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+				testNoRoute(t, s, nicID1, addr1, nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+				testNoRoute(t, s, nicID2, addr2, nic2Dst)
+
+				if upFn := test.upFn; upFn != nil {
+					// Bringing NIC1 up should make routes to odd addresses available
+					// again. Routes to even addresses should continue to be unavailable
+					// as NIC2 is still down.
+					if err := upFn(s, nicID1); err != nil {
+						t.Fatalf("test.upFn(_, %d): %s", nicID1, err)
+					}
+					testRoute(t, s, unspecifiedNIC, "", nic1Dst, addr1)
+					testRoute(t, s, unspecifiedNIC, addr1, nic1Dst, addr1)
+					testRoute(t, s, nicID1, addr1, nic1Dst, addr1)
+					testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+					testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+					testNoRoute(t, s, nicID2, addr2, nic2Dst)
+				}
+			})
+		}
+	})
+
+	// Tests that writing a packet using a Route through a down NIC fails.
+	t.Run("WritePacket", func(t *testing.T) {
+		for _, test := range tests {
+			t.Run(test.name, func(t *testing.T) {
+				s, ep1, ep2 := setup(t)
+
+				r1, err := s.FindRoute(nicID1, addr1, nic1Dst, fakeNetNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID1, addr1, nic1Dst, fakeNetNumber, err)
+				}
+				defer r1.Release()
+
+				r2, err := s.FindRoute(nicID2, addr2, nic2Dst, fakeNetNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID2, addr2, nic2Dst, fakeNetNumber, err)
+				}
+				defer r2.Release()
+
+				// If we failed to get routes r1 or r2, we cannot proceed with the test.
+				if t.Failed() {
+					t.FailNow()
+				}
+
+				buf := buffer.View([]byte{1})
+				testSend(t, r1, ep1, buf)
+				testSend(t, r2, ep2, buf)
+
+				// Writes with Routes that use NIC1 after being brought down should fail.
+				if err := test.downFn(s, nicID1); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID1, err)
+				}
+				testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+				testSend(t, r2, ep2, buf)
+
+				// Writes with Routes that use NIC2 after being brought down should fail.
+				if err := test.downFn(s, nicID2); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID2, err)
+				}
+				testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+				testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+
+				if upFn := test.upFn; upFn != nil {
+					// Writes with Routes that use NIC1 after being brought up should
+					// succeed.
+					//
+					// TODO(b/147015577): Should we instead completely invalidate all
+					// Routes that were bound to a NIC that was brought down at some
+					// point?
+					if err := upFn(s, nicID1); err != nil {
+						t.Fatalf("test.upFn(_, %d): %s", nicID1, err)
+					}
+					testSend(t, r1, ep1, buf)
+					testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+				}
+			})
+		}
+	})
+}
+
 func TestRoutes(t *testing.T) {
 	// Create a stack with the fake network protocol, two nics, and two
 	// addresses per nic, the first nic has odd address, the second one has
@@ -618,7 +990,7 @@ func TestAddressRemoval(t *testing.T) {
 	buf := buffer.NewView(30)
 
 	// Send and receive packets, and verify they are received.
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	testRecv(t, fakeNet, localAddrByte, ep, buf)
 	testSendTo(t, s, remoteAddr, ep, nil)
 
@@ -668,7 +1040,7 @@ func TestAddressRemovalWithRouteHeld(t *testing.T) {
 	}
 
 	// Send and receive packets, and verify they are received.
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	testRecv(t, fakeNet, localAddrByte, ep, buf)
 	testSend(t, r, ep, nil)
 	testSendTo(t, s, remoteAddr, ep, nil)
@@ -750,7 +1122,7 @@ func TestEndpointExpiration(t *testing.T) {
 
 				fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
 				buf := buffer.NewView(30)
-				buf[0] = localAddrByte
+				buf[dstAddrOffset] = localAddrByte
 
 				if promiscuous {
 					if err := s.SetPromiscuousMode(nicID, true); err != nil {
@@ -913,7 +1285,7 @@ func TestPromiscuousMode(t *testing.T) {
 	// Write a packet, and check that it doesn't get delivered as we don't
 	// have a matching endpoint.
 	const localAddrByte byte = 0x01
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
 
 	// Set promiscuous mode, then check that packet is delivered.
@@ -1087,19 +1459,19 @@ func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) {
 
 	protoAddr := tcpip.ProtocolAddress{Protocol: fakeNetNumber, AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Any, 0}}
 	if err := s.AddProtocolAddress(1, protoAddr); err != nil {
-		t.Fatalf("AddProtocolAddress(1, %s) failed: %s", protoAddr, err)
+		t.Fatalf("AddProtocolAddress(1, %v) failed: %v", protoAddr, err)
 	}
 	r, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */)
 	if err != nil {
-		t.Fatalf("FindRoute(1, %s, %s, %d) failed: %s", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
+		t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 	if err := verifyRoute(r, stack.Route{LocalAddress: header.IPv4Any, RemoteAddress: header.IPv4Broadcast}); err != nil {
-		t.Errorf("FindRoute(1, %s, %s, %d) returned unexpected Route: %s)", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
+		t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 
 	// If the NIC doesn't exist, it won't work.
 	if _, err := s.FindRoute(2, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable {
-		t.Fatalf("got FindRoute(2, %s, %s, %d) = %s want = %s", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable)
+		t.Fatalf("got FindRoute(2, %v, %v, %d) = %v want = %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable)
 	}
 }
 
@@ -1125,12 +1497,12 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
 	}
 	nic1ProtoAddr := tcpip.ProtocolAddress{fakeNetNumber, nic1Addr}
 	if err := s.AddProtocolAddress(1, nic1ProtoAddr); err != nil {
-		t.Fatalf("AddProtocolAddress(1, %s) failed: %s", nic1ProtoAddr, err)
+		t.Fatalf("AddProtocolAddress(1, %v) failed: %v", nic1ProtoAddr, err)
 	}
 
 	nic2ProtoAddr := tcpip.ProtocolAddress{fakeNetNumber, nic2Addr}
 	if err := s.AddProtocolAddress(2, nic2ProtoAddr); err != nil {
-		t.Fatalf("AddAddress(2, %s) failed: %s", nic2ProtoAddr, err)
+		t.Fatalf("AddAddress(2, %v) failed: %v", nic2ProtoAddr, err)
 	}
 
 	// Set the initial route table.
@@ -1145,10 +1517,10 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
 	// When an interface is given, the route for a broadcast goes through it.
 	r, err := s.FindRoute(1, nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */)
 	if err != nil {
-		t.Fatalf("FindRoute(1, %s, %s, %d) failed: %s", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
+		t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 	if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
-		t.Errorf("FindRoute(1, %s, %s, %d) returned unexpected Route: %s)", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
+		t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
 	}
 
 	// When an interface is not given, it consults the route table.
@@ -1294,7 +1666,7 @@ func TestAddressRangeAcceptsMatchingPacket(t *testing.T) {
 	buf := buffer.NewView(30)
 
 	const localAddrByte byte = 0x01
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	subnet, err := tcpip.NewSubnet(tcpip.Address("\x00"), tcpip.AddressMask("\xF0"))
 	if err != nil {
 		t.Fatal("NewSubnet failed:", err)
@@ -1402,7 +1774,7 @@ func TestAddressRangeRejectsNonmatchingPacket(t *testing.T) {
 	buf := buffer.NewView(30)
 
 	const localAddrByte byte = 0x01
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	subnet, err := tcpip.NewSubnet(tcpip.Address("\x10"), tcpip.AddressMask("\xF0"))
 	if err != nil {
 		t.Fatal("NewSubnet failed:", err)
@@ -1899,7 +2271,7 @@ func TestNICStats(t *testing.T) {
 
 	// Send a packet to address 1.
 	buf := buffer.NewView(30)
-	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep1.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if got, want := s.NICInfo()[1].Stats.Rx.Packets.Value(), uint64(1); got != want {
@@ -1926,56 +2298,84 @@ func TestNICStats(t *testing.T) {
 }
 
 func TestNICForwarding(t *testing.T) {
-	// Create a stack with the fake network protocol, two NICs, each with
-	// an address.
-	s := stack.New(stack.Options{
-		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
-	})
-	s.SetForwarding(true)
+	const nicID1 = 1
+	const nicID2 = 2
+	const dstAddr = tcpip.Address("\x03")
 
-	ep1 := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(1, ep1); err != nil {
-		t.Fatal("CreateNIC #1 failed:", err)
-	}
-	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
-		t.Fatal("AddAddress #1 failed:", err)
+	tests := []struct {
+		name      string
+		headerLen uint16
+	}{
+		{
+			name: "Zero header length",
+		},
+		{
+			name:      "Non-zero header length",
+			headerLen: 16,
+		},
 	}
 
-	ep2 := channel.New(10, defaultMTU, "")
-	if err := s.CreateNIC(2, ep2); err != nil {
-		t.Fatal("CreateNIC #2 failed:", err)
-	}
-	if err := s.AddAddress(2, fakeNetNumber, "\x02"); err != nil {
-		t.Fatal("AddAddress #2 failed:", err)
-	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+			s.SetForwarding(true)
 
-	// Route all packets to address 3 to NIC 2.
-	{
-		subnet, err := tcpip.NewSubnet("\x03", "\xff")
-		if err != nil {
-			t.Fatal(err)
-		}
-		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 2}})
-	}
+			ep1 := channel.New(10, defaultMTU, "")
+			if err := s.CreateNIC(nicID1, ep1); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+			}
+			if err := s.AddAddress(nicID1, fakeNetNumber, "\x01"); err != nil {
+				t.Fatalf("AddAddress(%d, %d, 0x01): %s", nicID1, fakeNetNumber, err)
+			}
 
-	// Send a packet to address 3.
-	buf := buffer.NewView(30)
-	buf[0] = 3
-	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
-		Data: buf.ToVectorisedView(),
-	})
+			ep2 := channelLinkWithHeaderLength{
+				Endpoint:     channel.New(10, defaultMTU, ""),
+				headerLength: test.headerLen,
+			}
+			if err := s.CreateNIC(nicID2, &ep2); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+			}
+			if err := s.AddAddress(nicID2, fakeNetNumber, "\x02"); err != nil {
+				t.Fatalf("AddAddress(%d, %d, 0x02): %s", nicID2, fakeNetNumber, err)
+			}
 
-	if _, ok := ep2.Read(); !ok {
-		t.Fatal("Packet not forwarded")
-	}
+			// Route all packets to dstAddr to NIC 2.
+			{
+				subnet, err := tcpip.NewSubnet(dstAddr, "\xff")
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: nicID2}})
+			}
 
-	// Test that forwarding increments Tx stats correctly.
-	if got, want := s.NICInfo()[2].Stats.Tx.Packets.Value(), uint64(1); got != want {
-		t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
-	}
+			// Send a packet to dstAddr.
+			buf := buffer.NewView(30)
+			buf[dstAddrOffset] = dstAddr[0]
+			ep1.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+				Data: buf.ToVectorisedView(),
+			})
 
-	if got, want := s.NICInfo()[2].Stats.Tx.Bytes.Value(), uint64(len(buf)); got != want {
-		t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
+			pkt, ok := ep2.Read()
+			if !ok {
+				t.Fatal("packet not forwarded")
+			}
+
+			// Test that the link's MaxHeaderLength is honoured.
+			if capacity, want := pkt.Pkt.Header.AvailableLength(), int(test.headerLen); capacity != want {
+				t.Errorf("got Header.AvailableLength() = %d, want = %d", capacity, want)
+			}
+
+			// Test that forwarding increments Tx stats correctly.
+			if got, want := s.NICInfo()[nicID2].Stats.Tx.Packets.Value(), uint64(1); got != want {
+				t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
+			}
+
+			if got, want := s.NICInfo()[nicID2].Stats.Tx.Bytes.Value(), uint64(len(buf)); got != want {
+				t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
+			}
+		})
 	}
 }
 
@@ -2013,7 +2413,7 @@ func TestNICContextPreservation(t *testing.T) {
 				t.Fatalf("got nicinfos[%d] = _, %t, want _, true; nicinfos = %+v", id, ok, nicinfos)
 			}
 			if got, want := nicinfo.Context == test.want, true; got != want {
-				t.Fatal("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", got, want, nicinfo.Context, test.want)
+				t.Fatalf("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", got, want, nicinfo.Context, test.want)
 			}
 		})
 	}
@@ -2173,13 +2573,29 @@ func TestNICAutoGenLinkLocalAddr(t *testing.T) {
 
 			e := channel.New(0, 1280, test.linkAddr)
 			s := stack.New(opts)
-			nicOpts := stack.NICOptions{Name: test.nicName}
+			nicOpts := stack.NICOptions{Name: test.nicName, Disabled: true}
 			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
 				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
 			}
 
-			var expectedMainAddr tcpip.AddressWithPrefix
+			// A new disabled NIC should not have any address, even if auto generation
+			// was enabled.
+			allStackAddrs := s.AllAddresses()
+			allNICAddrs, ok := allStackAddrs[nicID]
+			if !ok {
+				t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+			}
+			if l := len(allNICAddrs); l != 0 {
+				t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+			}
+
+			// Enabling the NIC should attempt auto-generation of a link-local
+			// address.
+			if err := s.EnableNIC(nicID); err != nil {
+				t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+			}
 
+			var expectedMainAddr tcpip.AddressWithPrefix
 			if test.shouldGen {
 				expectedMainAddr = tcpip.AddressWithPrefix{
 					Address:   test.expectedAddr,
@@ -2366,7 +2782,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				{
 					subnet, err := tcpip.NewSubnet("\x00", "\x00")
 					if err != nil {
-						t.Fatalf("NewSubnet failed:", err)
+						t.Fatalf("NewSubnet failed: %v", err)
 					}
 					s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
 				}
@@ -2380,11 +2796,11 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				// permanentExpired kind.
 				r, err := s.FindRoute(1, "\x01", "\x02", fakeNetNumber, false)
 				if err != nil {
-					t.Fatal("FindRoute failed:", err)
+					t.Fatalf("FindRoute failed: %v", err)
 				}
 				defer r.Release()
 				if err := s.RemoveAddress(1, "\x01"); err != nil {
-					t.Fatalf("RemoveAddress failed:", err)
+					t.Fatalf("RemoveAddress failed: %v", err)
 				}
 
 				//
@@ -2396,7 +2812,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				// Add some other address with peb set to
 				// FirstPrimaryEndpoint.
 				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x03", stack.FirstPrimaryEndpoint); err != nil {
-					t.Fatal("AddAddressWithOptions failed:", err)
+					t.Fatalf("AddAddressWithOptions failed: %v", err)
 
 				}
 
@@ -2404,7 +2820,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				// make sure the new peb was respected.
 				// (The address should just be promoted now).
 				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", ps); err != nil {
-					t.Fatal("AddAddressWithOptions failed:", err)
+					t.Fatalf("AddAddressWithOptions failed: %v", err)
 				}
 				var primaryAddrs []tcpip.Address
 				for _, pa := range s.NICInfo()[1].ProtocolAddresses {
@@ -2437,11 +2853,11 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 				// GetMainNICAddress; else, our original address
 				// should be returned.
 				if err := s.RemoveAddress(1, "\x03"); err != nil {
-					t.Fatalf("RemoveAddress failed:", err)
+					t.Fatalf("RemoveAddress failed: %v", err)
 				}
 				addr, err = s.GetMainNICAddress(1, fakeNetNumber)
 				if err != nil {
-					t.Fatal("s.GetMainNICAddress failed:", err)
+					t.Fatalf("s.GetMainNICAddress failed: %v", err)
 				}
 				if ps == stack.NeverPrimaryEndpoint {
 					if want := (tcpip.AddressWithPrefix{}); addr != want {
@@ -2460,21 +2876,33 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
 
 func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 	const (
-		linkLocalAddr1   = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		linkLocalAddr2   = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		globalAddr1      = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
-		globalAddr2      = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-		nicID            = 1
+		linkLocalAddr1         = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		linkLocalAddr2         = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr1       = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr2       = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		globalAddr1            = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		globalAddr2            = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		nicID                  = 1
+		lifetimeSeconds        = 9999
 	)
 
+	prefix1, _, stableGlobalAddr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, stableGlobalAddr2 := prefixSubnetAddr(1, linkAddr1)
+
+	var tempIIDHistory [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistory[:], nil, nicID)
+	tempGlobalAddr1 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], stableGlobalAddr1.Address).Address
+	tempGlobalAddr2 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], stableGlobalAddr2.Address).Address
+
 	// Rule 3 is not tested here, and is instead tested by NDP's AutoGenAddr test.
 	tests := []struct {
-		name              string
-		nicAddrs          []tcpip.Address
-		connectAddr       tcpip.Address
-		expectedLocalAddr tcpip.Address
+		name                                   string
+		slaacPrefixForTempAddrBeforeNICAddrAdd tcpip.AddressWithPrefix
+		nicAddrs                               []tcpip.Address
+		slaacPrefixForTempAddrAfterNICAddrAdd  tcpip.AddressWithPrefix
+		connectAddr                            tcpip.Address
+		expectedLocalAddr                      tcpip.Address
 	}{
 		// Test Rule 1 of RFC 6724 section 5.
 		{
@@ -2540,6 +2968,18 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 			expectedLocalAddr: linkLocalAddr1,
 		},
 		{
+			name:              "Link Local most preferred for link local multicast (last address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+			connectAddr:       linkLocalMulticastAddr,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local most preferred for link local multicast (first address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       linkLocalMulticastAddr,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
 			name:              "Unique Local most preferred (last address)",
 			nicAddrs:          []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
 			connectAddr:       uniqueLocalAddr2,
@@ -2552,6 +2992,22 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 			expectedLocalAddr: uniqueLocalAddr1,
 		},
 
+		// Test Rule 7 of RFC 6724 section 5.
+		{
+			name:                                   "Temp Global most preferred (last address)",
+			slaacPrefixForTempAddrBeforeNICAddrAdd: prefix1,
+			nicAddrs:                               []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:                            globalAddr2,
+			expectedLocalAddr:                      tempGlobalAddr1,
+		},
+		{
+			name:                                  "Temp Global most preferred (first address)",
+			nicAddrs:                              []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			slaacPrefixForTempAddrAfterNICAddrAdd: prefix1,
+			connectAddr:                           globalAddr2,
+			expectedLocalAddr:                     tempGlobalAddr1,
+		},
+
 		// Test returning the endpoint that is closest to the front when
 		// candidate addresses are "equal" from the perspective of RFC 6724
 		// section 5.
@@ -2573,6 +3029,13 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 			connectAddr:       uniqueLocalAddr2,
 			expectedLocalAddr: linkLocalAddr1,
 		},
+		{
+			name:                                   "Temp Global for Global",
+			slaacPrefixForTempAddrBeforeNICAddrAdd: prefix1,
+			slaacPrefixForTempAddrAfterNICAddrAdd:  prefix2,
+			connectAddr:                            globalAddr1,
+			expectedLocalAddr:                      tempGlobalAddr2,
+		},
 	}
 
 	for _, test := range tests {
@@ -2581,6 +3044,12 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 			s := stack.New(stack.Options{
 				NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
 				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:                  true,
+					AutoGenGlobalAddresses:     true,
+					AutoGenTempGlobalAddresses: true,
+				},
+				NDPDisp: &ndpDispatcher{},
 			})
 			if err := s.CreateNIC(nicID, e); err != nil {
 				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
@@ -2592,12 +3061,20 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 			}})
 			s.AddLinkAddress(nicID, llAddr3, linkAddr3)
 
+			if test.slaacPrefixForTempAddrBeforeNICAddrAdd != (tcpip.AddressWithPrefix{}) {
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, test.slaacPrefixForTempAddrBeforeNICAddrAdd, true, true, lifetimeSeconds, lifetimeSeconds))
+			}
+
 			for _, a := range test.nicAddrs {
 				if err := s.AddAddress(nicID, ipv6.ProtocolNumber, a); err != nil {
 					t.Errorf("s.AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, a, err)
 				}
 			}
 
+			if test.slaacPrefixForTempAddrAfterNICAddrAdd != (tcpip.AddressWithPrefix{}) {
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, test.slaacPrefixForTempAddrAfterNICAddrAdd, true, true, lifetimeSeconds, lifetimeSeconds))
+			}
+
 			if t.Failed() {
 				t.FailNow()
 			}
@@ -2609,11 +3086,158 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
 	}
 }
 
+func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) {
+	const nicID = 1
+
+	e := loopback.New()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
+	})
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	allStackAddrs := s.AllAddresses()
+	allNICAddrs, ok := allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 0 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+	}
+
+	// Enabling the NIC should add the IPv4 broadcast address.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	allStackAddrs = s.AllAddresses()
+	allNICAddrs, ok = allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 1 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 1", l)
+	}
+	want := tcpip.ProtocolAddress{
+		Protocol: header.IPv4ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   header.IPv4Broadcast,
+			PrefixLen: 32,
+		},
+	}
+	if allNICAddrs[0] != want {
+		t.Fatalf("got allNICAddrs[0] = %+v, want = %+v", allNICAddrs[0], want)
+	}
+
+	// Disabling the NIC should remove the IPv4 broadcast address.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	allStackAddrs = s.AllAddresses()
+	allNICAddrs, ok = allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 0 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+	}
+}
+
+// TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval tests that removing an IPv6
+// address after leaving its solicited node multicast address does not result in
+// an error.
+func TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	})
+	e := channel.New(10, 1280, linkAddr1)
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+	}
+
+	if err := s.AddAddress(nicID, ipv6.ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, addr1, err)
+	}
+
+	// The NIC should have joined addr1's solicited node multicast address.
+	snmc := header.SolicitedNodeAddr(addr1)
+	in, err := s.IsInGroup(nicID, snmc)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, snmc, err)
+	}
+	if !in {
+		t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, snmc)
+	}
+
+	if err := s.LeaveGroup(ipv6.ProtocolNumber, nicID, snmc); err != nil {
+		t.Fatalf("LeaveGroup(%d, %d, %s): %s", ipv6.ProtocolNumber, nicID, snmc, err)
+	}
+	in, err = s.IsInGroup(nicID, snmc)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, snmc, err)
+	}
+	if in {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, snmc)
+	}
+
+	if err := s.RemoveAddress(nicID, addr1); err != nil {
+		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
+	}
+}
+
+func TestJoinLeaveAllNodesMulticastOnNICEnableDisable(t *testing.T) {
+	const nicID = 1
+
+	e := loopback.New()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	})
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	// Should not be in the IPv6 all-nodes multicast group yet because the NIC has
+	// not been enabled yet.
+	isInGroup, err := s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+
+	// The all-nodes multicast group should be joined when the NIC is enabled.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if !isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+
+	// The all-nodes multicast group should be left when the NIC is disabled.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+}
+
 // TestDoDADWhenNICEnabled tests that IPv6 endpoints that were added while a NIC
 // was disabled have DAD performed on them when the NIC is enabled.
 func TestDoDADWhenNICEnabled(t *testing.T) {
-	t.Parallel()
-
 	const dadTransmits = 1
 	const retransmitTimer = time.Second
 	const nicID = 1
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index d686e6eb8..e09866405 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -15,9 +15,9 @@
 package stack
 
 import (
+	"container/heap"
 	"fmt"
 	"math/rand"
-	"sort"
 
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -35,7 +35,7 @@ type protocolIDs struct {
 type transportEndpoints struct {
 	// mu protects all fields of the transportEndpoints.
 	mu        sync.RWMutex
-	endpoints map[TransportEndpointID]*endpointsByNic
+	endpoints map[TransportEndpointID]*endpointsByNIC
 	// rawEndpoints contains endpoints for raw sockets, which receive all
 	// traffic of a given protocol regardless of port.
 	rawEndpoints []RawTransportEndpoint
@@ -46,11 +46,11 @@ type transportEndpoints struct {
 func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
-	epsByNic, ok := eps.endpoints[id]
+	epsByNIC, ok := eps.endpoints[id]
 	if !ok {
 		return
 	}
-	if !epsByNic.unregisterEndpoint(bindToDevice, ep) {
+	if !epsByNIC.unregisterEndpoint(bindToDevice, ep) {
 		return
 	}
 	delete(eps.endpoints, id)
@@ -66,18 +66,85 @@ func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint {
 	return es
 }
 
-type endpointsByNic struct {
+// iterEndpointsLocked yields all endpointsByNIC in eps that match id, in
+// descending order of match quality. If a call to yield returns false,
+// iterEndpointsLocked stops iteration and returns immediately.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield func(*endpointsByNIC) bool) {
+	// Try to find a match with the id as provided.
+	if ep, ok := eps.endpoints[id]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with the id minus the local address.
+	nid := id
+
+	nid.LocalAddress = ""
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with the id minus the remote part.
+	nid.LocalAddress = id.LocalAddress
+	nid.RemoteAddress = ""
+	nid.RemotePort = 0
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+
+	// Try to find a match with only the local port.
+	nid.LocalAddress = ""
+	if ep, ok := eps.endpoints[nid]; ok {
+		if !yield(ep) {
+			return
+		}
+	}
+}
+
+// findAllEndpointsLocked returns all endpointsByNIC in eps that match id, in
+// descending order of match quality.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) []*endpointsByNIC {
+	var matchedEPs []*endpointsByNIC
+	eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool {
+		matchedEPs = append(matchedEPs, ep)
+		return true
+	})
+	return matchedEPs
+}
+
+// findEndpointLocked returns the endpoint that most closely matches the given id.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpointsByNIC {
+	var matchedEP *endpointsByNIC
+	eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool {
+		matchedEP = ep
+		return false
+	})
+	return matchedEP
+}
+
+type endpointsByNIC struct {
 	mu        sync.RWMutex
 	endpoints map[tcpip.NICID]*multiPortEndpoint
 	// seed is a random secret for a jenkins hash.
 	seed uint32
 }
 
-func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
-	epsByNic.mu.RLock()
-	defer epsByNic.mu.RUnlock()
+func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
 	var eps []TransportEndpoint
-	for _, ep := range epsByNic.endpoints {
+	for _, ep := range epsByNIC.endpoints {
 		eps = append(eps, ep.transportEndpoints()...)
 	}
 	return eps
@@ -85,13 +152,13 @@ func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
-	epsByNic.mu.RLock()
+func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
+	epsByNIC.mu.RLock()
 
-	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
 	if !ok {
-		if mpep, ok = epsByNic.endpoints[0]; !ok {
-			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		if mpep, ok = epsByNIC.endpoints[0]; !ok {
+			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 			return
 		}
 	}
@@ -100,29 +167,29 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 	// endpoints bound to the right device.
 	if isMulticastOrBroadcast(id.LocalAddress) {
 		mpep.handlePacketAll(r, id, pkt)
-		epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 		return
 	}
 	// multiPortEndpoints are guaranteed to have at least one element.
-	transEP := selectEndpoint(id, mpep, epsByNic.seed)
+	transEP := selectEndpoint(id, mpep, epsByNIC.seed)
 	if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
 		queuedProtocol.QueuePacket(r, transEP, id, pkt)
-		epsByNic.mu.RUnlock()
+		epsByNIC.mu.RUnlock()
 		return
 	}
 
 	transEP.HandlePacket(r, id, pkt)
-	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+	epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
-	epsByNic.mu.RLock()
-	defer epsByNic.mu.RUnlock()
+func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer) {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
 
-	mpep, ok := epsByNic.endpoints[n.ID()]
+	mpep, ok := epsByNIC.endpoints[n.ID()]
 	if !ok {
-		mpep, ok = epsByNic.endpoints[0]
+		mpep, ok = epsByNIC.endpoints[0]
 	}
 	if !ok {
 		return
@@ -132,40 +199,41 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
 	// broadcast like we are doing with handlePacket above?
 
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, pkt)
+	selectEndpoint(id, mpep, epsByNIC.seed).HandleControlPacket(id, typ, extra, pkt)
 }
 
 // registerEndpoint returns true if it succeeds. It fails and returns
 // false if ep already has an element with the same key.
-func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
-	epsByNic.mu.Lock()
-	defer epsByNic.mu.Unlock()
+func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+	epsByNIC.mu.Lock()
+	defer epsByNIC.mu.Unlock()
 
-	if multiPortEp, ok := epsByNic.endpoints[bindToDevice]; ok {
-		// There was already a bind.
-		return multiPortEp.singleRegisterEndpoint(t, reusePort)
+	multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
+	if !ok {
+		multiPortEp = &multiPortEndpoint{
+			demux:      d,
+			netProto:   netProto,
+			transProto: transProto,
+			reuse:      reusePort,
+		}
+		epsByNIC.endpoints[bindToDevice] = multiPortEp
 	}
 
-	// This is a new binding.
-	multiPortEp := &multiPortEndpoint{demux: d, netProto: netProto, transProto: transProto}
-	multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
-	multiPortEp.reuse = reusePort
-	epsByNic.endpoints[bindToDevice] = multiPortEp
 	return multiPortEp.singleRegisterEndpoint(t, reusePort)
 }
 
-// unregisterEndpoint returns true if endpointsByNic has to be unregistered.
-func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
-	epsByNic.mu.Lock()
-	defer epsByNic.mu.Unlock()
-	multiPortEp, ok := epsByNic.endpoints[bindToDevice]
+// unregisterEndpoint returns true if endpointsByNIC has to be unregistered.
+func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
+	epsByNIC.mu.Lock()
+	defer epsByNIC.mu.Unlock()
+	multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
 	if !ok {
 		return false
 	}
 	if multiPortEp.unregisterEndpoint(t) {
-		delete(epsByNic.endpoints, bindToDevice)
+		delete(epsByNIC.endpoints, bindToDevice)
 	}
-	return len(epsByNic.endpoints) == 0
+	return len(epsByNIC.endpoints) == 0
 }
 
 // transportDemuxer demultiplexes packets targeted at a transport endpoint
@@ -183,7 +251,7 @@ type transportDemuxer struct {
 // the dispatcher to delivery packets to the QueuePacket method instead of
 // calling HandlePacket directly on the endpoint.
 type queuedTransportProtocol interface {
-	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt tcpip.PacketBuffer)
+	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt *PacketBuffer)
 }
 
 func newTransportDemuxer(stack *Stack) *transportDemuxer {
@@ -197,7 +265,7 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer {
 		for proto := range stack.transportProtocols {
 			protoIDs := protocolIDs{netProto, proto}
 			d.protocol[protoIDs] = &transportEndpoints{
-				endpoints: make(map[TransportEndpointID]*endpointsByNic),
+				endpoints: make(map[TransportEndpointID]*endpointsByNIC),
 			}
 			qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol)
 			if isQueued {
@@ -222,6 +290,35 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
 	return nil
 }
 
+type transportEndpointHeap []TransportEndpoint
+
+var _ heap.Interface = (*transportEndpointHeap)(nil)
+
+func (h *transportEndpointHeap) Len() int {
+	return len(*h)
+}
+
+func (h *transportEndpointHeap) Less(i, j int) bool {
+	return (*h)[i].UniqueID() < (*h)[j].UniqueID()
+}
+
+func (h *transportEndpointHeap) Swap(i, j int) {
+	(*h)[i], (*h)[j] = (*h)[j], (*h)[i]
+}
+
+func (h *transportEndpointHeap) Push(x interface{}) {
+	*h = append(*h, x.(TransportEndpoint))
+}
+
+func (h *transportEndpointHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	old[n-1] = nil
+	*h = old[:n-1]
+	return x
+}
+
 // multiPortEndpoint is a container for TransportEndpoints which are bound to
 // the same pair of address and port. endpointsArr always has at least one
 // element.
@@ -237,15 +334,14 @@ type multiPortEndpoint struct {
 	netProto   tcpip.NetworkProtocolNumber
 	transProto tcpip.TransportProtocolNumber
 
-	endpointsArr []TransportEndpoint
-	endpointsMap map[TransportEndpoint]int
+	endpoints transportEndpointHeap
 	// reuse indicates if more than one endpoint is allowed.
 	reuse bool
 }
 
 func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint {
 	ep.mu.RLock()
-	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
+	eps := append([]TransportEndpoint(nil), ep.endpoints...)
 	ep.mu.RUnlock()
 	return eps
 }
@@ -262,8 +358,8 @@ func reciprocalScale(val, n uint32) uint32 {
 // ports then uses it to select a socket. In this case, all packets from one
 // address will be sent to same endpoint.
 func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32) TransportEndpoint {
-	if len(mpep.endpointsArr) == 1 {
-		return mpep.endpointsArr[0]
+	if len(mpep.endpoints) == 1 {
+		return mpep.endpoints[0]
 	}
 
 	payload := []byte{
@@ -279,51 +375,28 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 	h.Write([]byte(id.RemoteAddress))
 	hash := h.Sum32()
 
-	idx := reciprocalScale(hash, uint32(len(mpep.endpointsArr)))
-	return mpep.endpointsArr[idx]
+	idx := reciprocalScale(hash, uint32(len(mpep.endpoints)))
+	return mpep.endpoints[idx]
 }
 
-func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
 	ep.mu.RLock()
 	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
-	for i, endpoint := range ep.endpointsArr {
-		// HandlePacket takes ownership of pkt, so each endpoint needs
-		// its own copy except for the final one.
-		if i == len(ep.endpointsArr)-1 {
-			if mustQueue {
-				queuedProtocol.QueuePacket(r, endpoint, id, pkt)
-				break
-			}
-			endpoint.HandlePacket(r, id, pkt)
-			break
-		}
+	// HandlePacket takes ownership of pkt, so each endpoint needs
+	// its own copy except for the final one.
+	for _, endpoint := range ep.endpoints[:len(ep.endpoints)-1] {
 		if mustQueue {
 			queuedProtocol.QueuePacket(r, endpoint, id, pkt.Clone())
-			continue
+		} else {
+			endpoint.HandlePacket(r, id, pkt.Clone())
 		}
-		endpoint.HandlePacket(r, id, pkt.Clone())
-	}
-	ep.mu.RUnlock() // Don't use defer for performance reasons.
-}
-
-// Close implements stack.TransportEndpoint.Close.
-func (ep *multiPortEndpoint) Close() {
-	ep.mu.RLock()
-	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
-	ep.mu.RUnlock()
-	for _, e := range eps {
-		e.Close()
 	}
-}
-
-// Wait implements stack.TransportEndpoint.Wait.
-func (ep *multiPortEndpoint) Wait() {
-	ep.mu.RLock()
-	eps := append([]TransportEndpoint(nil), ep.endpointsArr...)
-	ep.mu.RUnlock()
-	for _, e := range eps {
-		e.Wait()
+	if endpoint := ep.endpoints[len(ep.endpoints)-1]; mustQueue {
+		queuedProtocol.QueuePacket(r, endpoint, id, pkt)
+	} else {
+		endpoint.HandlePacket(r, id, pkt)
 	}
+	ep.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
 // singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint
@@ -332,26 +405,15 @@ func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePo
 	ep.mu.Lock()
 	defer ep.mu.Unlock()
 
-	if len(ep.endpointsArr) > 0 {
+	if len(ep.endpoints) != 0 {
 		// If it was previously bound, we need to check if we can bind again.
 		if !ep.reuse || !reusePort {
 			return tcpip.ErrPortInUse
 		}
 	}
 
-	// A new endpoint is added into endpointsArr and its index there is saved in
-	// endpointsMap. This will allow us to remove endpoint from the array fast.
-	ep.endpointsMap[t] = len(ep.endpointsArr)
-	ep.endpointsArr = append(ep.endpointsArr, t)
+	heap.Push(&ep.endpoints, t)
 
-	// ep.endpointsArr is sorted by endpoint unique IDs, so that endpoints
-	// can be restored in the same order.
-	sort.Slice(ep.endpointsArr, func(i, j int) bool {
-		return ep.endpointsArr[i].UniqueID() < ep.endpointsArr[j].UniqueID()
-	})
-	for i, e := range ep.endpointsArr {
-		ep.endpointsMap[e] = i
-	}
 	return nil
 }
 
@@ -360,21 +422,13 @@ func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint) bool {
 	ep.mu.Lock()
 	defer ep.mu.Unlock()
 
-	idx, ok := ep.endpointsMap[t]
-	if !ok {
-		return false
-	}
-	delete(ep.endpointsMap, t)
-	l := len(ep.endpointsArr)
-	if l > 1 {
-		// The last endpoint in endpointsArr is moved instead of the deleted one.
-		lastEp := ep.endpointsArr[l-1]
-		ep.endpointsArr[idx] = lastEp
-		ep.endpointsMap[lastEp] = idx
-		ep.endpointsArr = ep.endpointsArr[0 : l-1]
-		return false
+	for i, endpoint := range ep.endpoints {
+		if endpoint == t {
+			heap.Remove(&ep.endpoints, i)
+			break
+		}
 	}
-	return true
+	return len(ep.endpoints) == 0
 }
 
 func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
@@ -391,19 +445,16 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
 
-	if epsByNic, ok := eps.endpoints[id]; ok {
-		// There was already a binding.
-		return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
-	}
-
-	// This is a new binding.
-	epsByNic := &endpointsByNic{
-		endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
-		seed:      rand.Uint32(),
+	epsByNIC, ok := eps.endpoints[id]
+	if !ok {
+		epsByNIC = &endpointsByNIC{
+			endpoints: make(map[tcpip.NICID]*multiPortEndpoint),
+			seed:      rand.Uint32(),
+		}
+		eps.endpoints[id] = epsByNIC
 	}
-	eps.endpoints[id] = epsByNic
 
-	return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
+	return epsByNIC.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 }
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
@@ -416,84 +467,60 @@ func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolN
 	}
 }
 
-var loopbackSubnet = func() tcpip.Subnet {
-	sn, err := tcpip.NewSubnet("\x7f\x00\x00\x00", "\xff\x00\x00\x00")
-	if err != nil {
-		panic(err)
-	}
-	return sn
-}()
-
 // deliverPacket attempts to find one or more matching transport endpoints, and
 // then, if matches are found, delivers the packet to them. Returns true if
 // the packet no longer needs to be handled.
-func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
 	}
 
-	eps.mu.RLock()
-
-	// Determine which transport endpoint or endpoints to deliver this packet to.
 	// If the packet is a UDP broadcast or multicast, then find all matching
-	// transport endpoints. If the packet is a TCP packet with a non-unicast
-	// source or destination address, then do nothing further and instruct
-	// the caller to do the same.
-	var destEps []*endpointsByNic
-	switch protocol {
-	case header.UDPProtocolNumber:
-		if isMulticastOrBroadcast(id.LocalAddress) {
-			destEps = d.findAllEndpointsLocked(eps, id)
-			break
-		}
-
-		if ep := d.findEndpointLocked(eps, id); ep != nil {
-			destEps = append(destEps, ep)
+	// transport endpoints.
+	if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
+		eps.mu.RLock()
+		destEPs := eps.findAllEndpointsLocked(id)
+		eps.mu.RUnlock()
+		// Fail if we didn't find at least one matching transport endpoint.
+		if len(destEPs) == 0 {
+			r.Stats().UDP.UnknownPortErrors.Increment()
+			return false
 		}
-
-	case header.TCPProtocolNumber:
-		if !(isUnicast(r.LocalAddress) && isUnicast(r.RemoteAddress)) {
-			// TCP can only be used to communicate between a single
-			// source and a single destination; the addresses must
-			// be unicast.
-			eps.mu.RUnlock()
-			r.Stats().TCP.InvalidSegmentsReceived.Increment()
-			return true
+		// handlePacket takes ownership of pkt, so each endpoint needs its own
+		// copy except for the final one.
+		for _, ep := range destEPs[:len(destEPs)-1] {
+			ep.handlePacket(r, id, pkt.Clone())
 		}
+		destEPs[len(destEPs)-1].handlePacket(r, id, pkt)
+		return true
+	}
 
-		fallthrough
-
-	default:
-		if ep := d.findEndpointLocked(eps, id); ep != nil {
-			destEps = append(destEps, ep)
-		}
+	// If the packet is a TCP packet with a non-unicast source or destination
+	// address, then do nothing further and instruct the caller to do the same.
+	if protocol == header.TCPProtocolNumber && (!isUnicast(r.LocalAddress) || !isUnicast(r.RemoteAddress)) {
+		// TCP can only be used to communicate between a single source and a
+		// single destination; the addresses must be unicast.
+		r.Stats().TCP.InvalidSegmentsReceived.Increment()
+		return true
 	}
 
+	eps.mu.RLock()
+	ep := eps.findEndpointLocked(id)
 	eps.mu.RUnlock()
-
-	// Fail if we didn't find at least one matching transport endpoint.
-	if len(destEps) == 0 {
-		// UDP packet could not be delivered to an unknown destination port.
+	if ep == nil {
 		if protocol == header.UDPProtocolNumber {
 			r.Stats().UDP.UnknownPortErrors.Increment()
 		}
 		return false
 	}
-
-	// HandlePacket takes ownership of pkt, so each endpoint needs its own
-	// copy except for the final one.
-	for _, ep := range destEps[:len(destEps)-1] {
-		ep.handlePacket(r, id, pkt.Clone())
-	}
-	destEps[len(destEps)-1].handlePacket(r, id, pkt)
-
+	ep.handlePacket(r, id, pkt)
 	return true
 }
 
 // deliverRawPacket attempts to deliver the given packet and returns whether it
 // was delivered successfully.
-func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) bool {
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -517,99 +544,53 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 
 // deliverControlPacket attempts to deliver the given control packet. Returns
 // true if it found an endpoint, false otherwise.
-func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{net, trans}]
 	if !ok {
 		return false
 	}
 
-	// Try to find the endpoint.
 	eps.mu.RLock()
-	ep := d.findEndpointLocked(eps, id)
+	ep := eps.findEndpointLocked(id)
 	eps.mu.RUnlock()
-
-	// Fail if we didn't find one.
 	if ep == nil {
 		return false
 	}
 
-	// Deliver the packet.
 	ep.handleControlPacket(n, id, typ, extra, pkt)
-
 	return true
 }
 
-func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id TransportEndpointID) []*endpointsByNic {
-	var matchedEPs []*endpointsByNic
-	// Try to find a match with the id as provided.
-	if ep, ok := eps.endpoints[id]; ok {
-		matchedEPs = append(matchedEPs, ep)
-	}
-
-	// Try to find a match with the id minus the local address.
-	nid := id
-
-	nid.LocalAddress = ""
-	if ep, ok := eps.endpoints[nid]; ok {
-		matchedEPs = append(matchedEPs, ep)
-	}
-
-	// Try to find a match with the id minus the remote part.
-	nid.LocalAddress = id.LocalAddress
-	nid.RemoteAddress = ""
-	nid.RemotePort = 0
-	if ep, ok := eps.endpoints[nid]; ok {
-		matchedEPs = append(matchedEPs, ep)
-	}
-
-	// Try to find a match with only the local port.
-	nid.LocalAddress = ""
-	if ep, ok := eps.endpoints[nid]; ok {
-		matchedEPs = append(matchedEPs, ep)
-	}
-	return matchedEPs
-}
-
 // findTransportEndpoint find a single endpoint that most closely matches the provided id.
 func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
 	eps, ok := d.protocol[protocolIDs{netProto, transProto}]
 	if !ok {
 		return nil
 	}
-	// Try to find the endpoint.
+
 	eps.mu.RLock()
-	epsByNic := d.findEndpointLocked(eps, id)
-	// Fail if we didn't find one.
-	if epsByNic == nil {
+	epsByNIC := eps.findEndpointLocked(id)
+	if epsByNIC == nil {
 		eps.mu.RUnlock()
 		return nil
 	}
 
-	epsByNic.mu.RLock()
+	epsByNIC.mu.RLock()
 	eps.mu.RUnlock()
 
-	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
 	if !ok {
-		if mpep, ok = epsByNic.endpoints[0]; !ok {
-			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+		if mpep, ok = epsByNIC.endpoints[0]; !ok {
+			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
 			return nil
 		}
 	}
 
-	ep := selectEndpoint(id, mpep, epsByNic.seed)
-	epsByNic.mu.RUnlock()
+	ep := selectEndpoint(id, mpep, epsByNIC.seed)
+	epsByNIC.mu.RUnlock()
 	return ep
 }
 
-// findEndpointLocked returns the endpoint that most closely matches the given
-// id.
-func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, id TransportEndpointID) *endpointsByNic {
-	if matchedEPs := d.findAllEndpointsLocked(eps, id); len(matchedEPs) > 0 {
-		return matchedEPs[0]
-	}
-	return nil
-}
-
 // registerRawEndpoint registers the given endpoint with the dispatcher such
 // that packets of the appropriate protocol are delivered to it. A single
 // packet can be sent to one or more raw endpoints along with a non-raw
@@ -621,8 +602,8 @@ func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNum
 	}
 
 	eps.mu.Lock()
-	defer eps.mu.Unlock()
 	eps.rawEndpoints = append(eps.rawEndpoints, ep)
+	eps.mu.Unlock()
 
 	return nil
 }
@@ -636,13 +617,16 @@ func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolN
 	}
 
 	eps.mu.Lock()
-	defer eps.mu.Unlock()
 	for i, rawEP := range eps.rawEndpoints {
 		if rawEP == ep {
-			eps.rawEndpoints = append(eps.rawEndpoints[:i], eps.rawEndpoints[i+1:]...)
-			return
+			lastIdx := len(eps.rawEndpoints) - 1
+			eps.rawEndpoints[i] = eps.rawEndpoints[lastIdx]
+			eps.rawEndpoints[lastIdx] = nil
+			eps.rawEndpoints = eps.rawEndpoints[:lastIdx]
+			break
 		}
 	}
+	eps.mu.Unlock()
 }
 
 func isMulticastOrBroadcast(addr tcpip.Address) bool {
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 5e9237de9..67d778137 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -31,84 +31,58 @@ import (
 )
 
 const (
-	stackV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
-	testV6Addr  = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	testSrcAddrV6 = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	testDstAddrV6 = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
 
-	stackAddr = "\x0a\x00\x00\x01"
-	stackPort = 1234
-	testPort  = 4096
+	testSrcAddrV4 = "\x0a\x00\x00\x01"
+	testDstAddrV4 = "\x0a\x00\x00\x02"
+
+	testDstPort = 1234
+	testSrcPort = 4096
 )
 
 type testContext struct {
-	t       *testing.T
 	linkEps map[tcpip.NICID]*channel.Endpoint
 	s       *stack.Stack
-
-	ep tcpip.Endpoint
-	wq waiter.Queue
-}
-
-func (c *testContext) cleanup() {
-	if c.ep != nil {
-		c.ep.Close()
-	}
-}
-
-func (c *testContext) createV6Endpoint(v6only bool) {
-	var err *tcpip.Error
-	c.ep, err = c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &c.wq)
-	if err != nil {
-		c.t.Fatalf("NewEndpoint failed: %v", err)
-	}
-
-	if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
-		c.t.Fatalf("SetSockOpt failed: %v", err)
-	}
+	wq      waiter.Queue
 }
 
 // newDualTestContextMultiNIC creates the testing context and also linkEpIDs NICs.
 func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICID) *testContext {
 	s := stack.New(stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
-		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+	})
 	linkEps := make(map[tcpip.NICID]*channel.Endpoint)
 	for _, linkEpID := range linkEpIDs {
 		channelEp := channel.New(256, mtu, "")
 		if err := s.CreateNIC(linkEpID, channelEp); err != nil {
-			t.Fatalf("CreateNIC failed: %v", err)
+			t.Fatalf("CreateNIC failed: %s", err)
 		}
 		linkEps[linkEpID] = channelEp
 
-		if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, stackAddr); err != nil {
-			t.Fatalf("AddAddress IPv4 failed: %v", err)
+		if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, testDstAddrV4); err != nil {
+			t.Fatalf("AddAddress IPv4 failed: %s", err)
 		}
 
-		if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
-			t.Fatalf("AddAddress IPv6 failed: %v", err)
+		if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, testDstAddrV6); err != nil {
+			t.Fatalf("AddAddress IPv6 failed: %s", err)
 		}
 	}
 
 	s.SetRouteTable([]tcpip.Route{
-		{
-			Destination: header.IPv4EmptySubnet,
-			NIC:         1,
-		},
-		{
-			Destination: header.IPv6EmptySubnet,
-			NIC:         1,
-		},
+		{Destination: header.IPv4EmptySubnet, NIC: 1},
+		{Destination: header.IPv6EmptySubnet, NIC: 1},
 	})
 
 	return &testContext{
-		t:       t,
 		s:       s,
 		linkEps: linkEps,
 	}
 }
 
 type headers struct {
-	srcPort uint16
-	dstPort uint16
+	srcPort, dstPort uint16
 }
 
 func newPayload() []byte {
@@ -119,6 +93,47 @@ func newPayload() []byte {
 	return b
 }
 
+func (c *testContext) sendV4Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
+	buf := buffer.NewView(header.UDPMinimumSize + header.IPv4MinimumSize + len(payload))
+	payloadStart := len(buf) - len(payload)
+	copy(buf[payloadStart:], payload)
+
+	// Initialize the IP header.
+	ip := header.IPv4(buf)
+	ip.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TOS:         0x80,
+		TotalLength: uint16(len(buf)),
+		TTL:         65,
+		Protocol:    uint8(udp.ProtocolNumber),
+		SrcAddr:     testSrcAddrV4,
+		DstAddr:     testDstAddrV4,
+	})
+	ip.SetChecksum(^ip.CalculateChecksum())
+
+	// Initialize the UDP header.
+	u := header.UDP(buf[header.IPv4MinimumSize:])
+	u.Encode(&header.UDPFields{
+		SrcPort: h.srcPort,
+		DstPort: h.dstPort,
+		Length:  uint16(header.UDPMinimumSize + len(payload)),
+	})
+
+	// Calculate the UDP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testSrcAddrV4, testDstAddrV4, uint16(len(u)))
+
+	// Calculate the UDP checksum and set it.
+	xsum = header.Checksum(payload, xsum)
+	u.SetChecksum(^u.CalculateChecksum(xsum))
+
+	// Inject packet.
+	c.linkEps[linkEpID].InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
+	})
+}
+
 func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
@@ -130,8 +145,8 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
-		SrcAddr:       testV6Addr,
-		DstAddr:       stackV6Addr,
+		SrcAddr:       testSrcAddrV6,
+		DstAddr:       testDstAddrV6,
 	})
 
 	// Initialize the UDP header.
@@ -143,15 +158,17 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 	})
 
 	// Calculate the UDP pseudo-header checksum.
-	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testV6Addr, stackV6Addr, uint16(len(u)))
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testSrcAddrV6, testDstAddrV6, uint16(len(u)))
 
 	// Calculate the UDP checksum and set it.
 	xsum = header.Checksum(payload, xsum)
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
-		Data: buf.ToVectorisedView(),
+	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
 	})
 }
 
@@ -167,38 +184,48 @@ func TestTransportDemuxerRegister(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			s := stack.New(stack.Options{
 				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
-				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
-			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, nil, false, 0), test.want; got != want {
-				t.Fatalf("s.RegisterTransportEndpoint(...) = %v, want %v", got, want)
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			var wq waiter.Queue
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatal(err)
+			}
+			tEP, ok := ep.(stack.TransportEndpoint)
+			if !ok {
+				t.Fatalf("%T does not implement stack.TransportEndpoint", ep)
+			}
+			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, false, 0), test.want; got != want {
+				t.Fatalf("s.RegisterTransportEndpoint(...) = %s, want %s", got, want)
 			}
 		})
 	}
 }
 
-// TestReuseBindToDevice injects varied packets on input devices and checks that
+// TestBindToDeviceDistribution injects varied packets on input devices and checks that
 // the distribution of packets received matches expectations.
-func TestDistribution(t *testing.T) {
+func TestBindToDeviceDistribution(t *testing.T) {
 	type endpointSockopts struct {
-		reuse        int
+		reuse        bool
 		bindToDevice tcpip.NICID
 	}
 	for _, test := range []struct {
 		name string
 		// endpoints will received the inject packets.
 		endpoints []endpointSockopts
-		// wantedDistribution is the wanted ratio of packets received on each
+		// wantDistributions is the want ratio of packets received on each
 		// endpoint for each NIC on which packets are injected.
-		wantedDistributions map[tcpip.NICID][]float64
+		wantDistributions map[tcpip.NICID][]float64
 	}{
 		{
 			"BindPortReuse",
 			// 5 endpoints that all have reuse set.
 			[]endpointSockopts{
-				{1, 0},
-				{1, 0},
-				{1, 0},
-				{1, 0},
-				{1, 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed evenly.
@@ -209,9 +236,9 @@ func TestDistribution(t *testing.T) {
 			"BindToDevice",
 			// 3 endpoints with various bindings.
 			[]endpointSockopts{
-				{0, 1},
-				{0, 2},
-				{0, 3},
+				{reuse: false, bindToDevice: 1},
+				{reuse: false, bindToDevice: 2},
+				{reuse: false, bindToDevice: 3},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 go only to the endpoint bound to dev0.
@@ -226,12 +253,12 @@ func TestDistribution(t *testing.T) {
 			"ReuseAndBindToDevice",
 			// 6 endpoints with various bindings.
 			[]endpointSockopts{
-				{1, 1},
-				{1, 1},
-				{1, 2},
-				{1, 2},
-				{1, 2},
-				{1, 0},
+				{reuse: true, bindToDevice: 1},
+				{reuse: true, bindToDevice: 1},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 0},
 			},
 			map[tcpip.NICID][]float64{
 				// Injected packets on dev0 get distributed among endpoints bound to
@@ -245,17 +272,17 @@ func TestDistribution(t *testing.T) {
 			},
 		},
 	} {
-		t.Run(test.name, func(t *testing.T) {
-			for device, wantedDistribution := range test.wantedDistributions {
-				t.Run(string(device), func(t *testing.T) {
+		for protoName, netProtoNum := range map[string]tcpip.NetworkProtocolNumber{
+			"IPv4": ipv4.ProtocolNumber,
+			"IPv6": ipv6.ProtocolNumber,
+		} {
+			for device, wantDistribution := range test.wantDistributions {
+				t.Run(test.name+protoName+string(device), func(t *testing.T) {
 					var devices []tcpip.NICID
-					for d := range test.wantedDistributions {
+					for d := range test.wantDistributions {
 						devices = append(devices, d)
 					}
 					c := newDualTestContextMultiNIC(t, defaultMTU, devices)
-					defer c.cleanup()
-
-					c.createV6Endpoint(false)
 
 					eps := make(map[tcpip.Endpoint]int)
 
@@ -269,9 +296,9 @@ func TestDistribution(t *testing.T) {
 						defer close(ch)
 
 						var err *tcpip.Error
-						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &wq)
+						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, netProtoNum, &wq)
 						if err != nil {
-							c.t.Fatalf("NewEndpoint failed: %v", err)
+							t.Fatalf("NewEndpoint failed: %s", err)
 						}
 						eps[ep] = i
 
@@ -282,22 +309,31 @@ func TestDistribution(t *testing.T) {
 						}(ep)
 
 						defer ep.Close()
-						reusePortOption := tcpip.ReusePortOption(endpoint.reuse)
-						if err := ep.SetSockOpt(reusePortOption); err != nil {
-							c.t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %v", reusePortOption, i, err)
+						if err := ep.SetSockOptBool(tcpip.ReusePortOption, endpoint.reuse); err != nil {
+							t.Fatalf("SetSockOptBool(ReusePortOption, %t) on endpoint %d failed: %s", endpoint.reuse, i, err)
 						}
 						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
 						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
-							c.t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %v", bindToDeviceOption, i, err)
+							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", bindToDeviceOption, i, err)
+						}
+
+						var dstAddr tcpip.Address
+						switch netProtoNum {
+						case ipv4.ProtocolNumber:
+							dstAddr = testDstAddrV4
+						case ipv6.ProtocolNumber:
+							dstAddr = testDstAddrV6
+						default:
+							t.Fatalf("unexpected protocol number: %d", netProtoNum)
 						}
-						if err := ep.Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}); err != nil {
-							t.Fatalf("ep.Bind(...) on endpoint %d failed: %v", i, err)
+						if err := ep.Bind(tcpip.FullAddress{Addr: dstAddr, Port: testDstPort}); err != nil {
+							t.Fatalf("ep.Bind(...) on endpoint %d failed: %s", i, err)
 						}
 					}
 
 					npackets := 100000
 					nports := 10000
-					if got, want := len(test.endpoints), len(wantedDistribution); got != want {
+					if got, want := len(test.endpoints), len(wantDistribution); got != want {
 						t.Fatalf("got len(test.endpoints) = %d, want %d", got, want)
 					}
 					ports := make(map[uint16]tcpip.Endpoint)
@@ -306,17 +342,22 @@ func TestDistribution(t *testing.T) {
 						// Send a packet.
 						port := uint16(i % nports)
 						payload := newPayload()
-						c.sendV6Packet(payload,
-							&headers{
-								srcPort: testPort + port,
-								dstPort: stackPort},
-							device)
+						hdrs := &headers{
+							srcPort: testSrcPort + port,
+							dstPort: testDstPort,
+						}
+						switch netProtoNum {
+						case ipv4.ProtocolNumber:
+							c.sendV4Packet(payload, hdrs, device)
+						case ipv6.ProtocolNumber:
+							c.sendV6Packet(payload, hdrs, device)
+						default:
+							t.Fatalf("unexpected protocol number: %d", netProtoNum)
+						}
 
-						var addr tcpip.FullAddress
 						ep := <-pollChannel
-						_, _, err := ep.Read(&addr)
-						if err != nil {
-							c.t.Fatalf("Read on endpoint %d failed: %v", eps[ep], err)
+						if _, _, err := ep.Read(nil); err != nil {
+							t.Fatalf("Read on endpoint %d failed: %s", eps[ep], err)
 						}
 						stats[ep]++
 						if i < nports {
@@ -332,17 +373,17 @@ func TestDistribution(t *testing.T) {
 
 					// Check that a packet distribution is as expected.
 					for ep, i := range eps {
-						wantedRatio := wantedDistribution[i]
-						wantedRecv := wantedRatio * float64(npackets)
+						wantRatio := wantDistribution[i]
+						wantRecv := wantRatio * float64(npackets)
 						actualRecv := stats[ep]
 						actualRatio := float64(stats[ep]) / float64(npackets)
 						// The deviation is less than 10%.
-						if math.Abs(actualRatio-wantedRatio) > 0.05 {
-							t.Errorf("wanted about %.0f%% (%.0f of %d) packets to arrive on endpoint %d, got %.0f%% (%d of %d)", wantedRatio*100, wantedRecv, npackets, i, actualRatio*100, actualRecv, npackets)
+						if math.Abs(actualRatio-wantRatio) > 0.05 {
+							t.Errorf("want about %.0f%% (%.0f of %d) packets to arrive on endpoint %d, got %.0f%% (%d of %d)", wantRatio*100, wantRecv, npackets, i, actualRatio*100, actualRecv, npackets)
 						}
 					}
 				})
 			}
-		})
+		}
 	}
 }
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 869c69a6d..ad61c09d6 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -19,7 +19,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -57,10 +56,16 @@ func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats {
 	return nil
 }
 
+func (f *fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {}
+
 func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
 	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
 }
 
+func (f *fakeTransportEndpoint) Abort() {
+	f.Close()
+}
+
 func (f *fakeTransportEndpoint) Close() {
 	f.route.Release()
 }
@@ -78,12 +83,13 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions
 		return 0, nil, tcpip.ErrNoRoute
 	}
 
-	hdr := buffer.NewPrependable(int(f.route.MaxHeaderLength()))
+	hdr := buffer.NewPrependable(int(f.route.MaxHeaderLength()) + fakeTransHeaderLen)
+	hdr.Prepend(fakeTransHeaderLen)
 	v, err := p.FullPayload()
 	if err != nil {
 		return 0, nil, err
 	}
-	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.View(v).ToVectorisedView(),
 	}); err != nil {
@@ -210,7 +216,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro
 	return tcpip.FullAddress{}, nil
 }
 
-func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ tcpip.PacketBuffer) {
+func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ *stack.PacketBuffer) {
 	// Increment the number of received packets.
 	f.proto.packetCount++
 	if f.acceptQueue != nil {
@@ -227,7 +233,7 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE
 	}
 }
 
-func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, tcpip.PacketBuffer) {
+func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, *stack.PacketBuffer) {
 	// Increment the number of received control packets.
 	f.proto.controlCount++
 }
@@ -238,8 +244,8 @@ func (f *fakeTransportEndpoint) State() uint32 {
 
 func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {}
 
-func (f *fakeTransportEndpoint) IPTables() (iptables.IPTables, error) {
-	return iptables.IPTables{}, nil
+func (f *fakeTransportEndpoint) IPTables() (stack.IPTables, error) {
+	return stack.IPTables{}, nil
 }
 
 func (f *fakeTransportEndpoint) Resume(*stack.Stack) {}
@@ -272,7 +278,7 @@ func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.N
 	return newFakeTransportEndpoint(stack, f, netProto, stack.UniqueID()), nil
 }
 
-func (f *fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+func (*fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	return nil, tcpip.ErrUnknownProtocol
 }
 
@@ -284,7 +290,7 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp
 	return 0, 0, nil
 }
 
-func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
+func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool {
 	return true
 }
 
@@ -310,6 +316,26 @@ func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Abort implements TransportProtocol.Abort.
+func (*fakeTransportProtocol) Abort() {}
+
+// Close implements tcpip.Endpoint.Close.
+func (*fakeTransportProtocol) Close() {}
+
+// Wait implements TransportProtocol.Wait.
+func (*fakeTransportProtocol) Wait() {}
+
+// Parse implements TransportProtocol.Parse.
+func (*fakeTransportProtocol) Parse(pkt *stack.PacketBuffer) bool {
+	hdr, ok := pkt.Data.PullUp(fakeTransHeaderLen)
+	if !ok {
+		return false
+	}
+	pkt.TransportHeader = hdr
+	pkt.Data.TrimFront(fakeTransHeaderLen)
+	return true
+}
+
 func fakeTransFactory() stack.TransportProtocol {
 	return &fakeTransportProtocol{}
 }
@@ -355,7 +381,7 @@ func TestTransportReceive(t *testing.T) {
 	// Make sure packet with wrong protocol is not delivered.
 	buf[0] = 1
 	buf[2] = 0
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 0 {
@@ -366,7 +392,7 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 3
 	buf[2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 0 {
@@ -377,7 +403,7 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 2
 	buf[2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 1 {
@@ -432,7 +458,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 0
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = 0
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 0 {
@@ -443,7 +469,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 3
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 0 {
@@ -454,7 +480,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 2
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 1 {
@@ -609,7 +635,7 @@ func TestTransportForwarding(t *testing.T) {
 	req[0] = 1
 	req[1] = 3
 	req[2] = byte(fakeTransNumber)
-	ep2.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+	ep2.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: req.ToVectorisedView(),
 	})
 
@@ -628,10 +654,10 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatal("Response packet not forwarded")
 	}
 
-	if dst := p.Pkt.Header.View()[0]; dst != 3 {
+	if dst := p.Pkt.NetworkHeader[0]; dst != 3 {
 		t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst)
 	}
-	if src := p.Pkt.Header.View()[1]; src != 1 {
+	if src := p.Pkt.NetworkHeader[1]; src != 1 {
 		t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src)
 	}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 9ca39ce40..b7b227328 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -110,6 +110,71 @@ var (
 	ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"}
 )
 
+var messageToError map[string]*Error
+
+var populate sync.Once
+
+// StringToError converts an error message to the error.
+func StringToError(s string) *Error {
+	populate.Do(func() {
+		var errors = []*Error{
+			ErrUnknownProtocol,
+			ErrUnknownNICID,
+			ErrUnknownDevice,
+			ErrUnknownProtocolOption,
+			ErrDuplicateNICID,
+			ErrDuplicateAddress,
+			ErrNoRoute,
+			ErrBadLinkEndpoint,
+			ErrAlreadyBound,
+			ErrInvalidEndpointState,
+			ErrAlreadyConnecting,
+			ErrAlreadyConnected,
+			ErrNoPortAvailable,
+			ErrPortInUse,
+			ErrBadLocalAddress,
+			ErrClosedForSend,
+			ErrClosedForReceive,
+			ErrWouldBlock,
+			ErrConnectionRefused,
+			ErrTimeout,
+			ErrAborted,
+			ErrConnectStarted,
+			ErrDestinationRequired,
+			ErrNotSupported,
+			ErrQueueSizeNotSupported,
+			ErrNotConnected,
+			ErrConnectionReset,
+			ErrConnectionAborted,
+			ErrNoSuchFile,
+			ErrInvalidOptionValue,
+			ErrNoLinkAddress,
+			ErrBadAddress,
+			ErrNetworkUnreachable,
+			ErrMessageTooLong,
+			ErrNoBufferSpace,
+			ErrBroadcastDisabled,
+			ErrNotPermitted,
+			ErrAddressFamilyNotSupported,
+		}
+
+		messageToError = make(map[string]*Error)
+		for _, e := range errors {
+			if messageToError[e.String()] != nil {
+				panic("tcpip errors with duplicated message: " + e.String())
+			}
+			messageToError[e.String()] = e
+		}
+	})
+
+	e, ok := messageToError[s]
+	if !ok {
+		panic("unknown error message: " + s)
+	}
+
+	return e
+}
+
 // Errors related to Subnet
 var (
 	errSubnetLengthMismatch = errors.New("subnet length of address and mask differ")
@@ -323,11 +388,11 @@ type ControlMessages struct {
 	// TOS is the IPv4 type of service of the associated packet.
 	TOS uint8
 
-	// HasTClass indicates whether Tclass is valid/set.
+	// HasTClass indicates whether TClass is valid/set.
 	HasTClass bool
 
-	// Tclass is the IPv6 traffic class of the associated packet.
-	TClass int32
+	// TClass is the IPv6 traffic class of the associated packet.
+	TClass uint32
 
 	// HasIPPacketInfo indicates whether PacketInfo is set.
 	HasIPPacketInfo bool
@@ -336,14 +401,29 @@ type ControlMessages struct {
 	PacketInfo IPPacketInfo
 }
 
+// PacketOwner is used to get UID and GID of the packet.
+type PacketOwner interface {
+	// UID returns UID of the packet.
+	UID() uint32
+
+	// GID returns GID of the packet.
+	GID() uint32
+}
+
 // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
 // that exposes functionality like read, write, connect, etc. to users of the
 // networking stack.
 type Endpoint interface {
 	// Close puts the endpoint in a closed state and frees all resources
-	// associated with it.
+	// associated with it. Close initiates the teardown process, the
+	// Endpoint may not be fully closed when Close returns.
 	Close()
 
+	// Abort initiates an expedited endpoint teardown. As compared to
+	// Close, Abort prioritizes closing the Endpoint quickly over cleanly.
+	// Abort is best effort; implementing Abort with Close is acceptable.
+	Abort()
+
 	// Read reads data from the endpoint and optionally returns the sender.
 	//
 	// This method does not block if there is no data pending. It will also
@@ -464,6 +544,9 @@ type Endpoint interface {
 
 	// Stats returns a reference to the endpoint stats.
 	Stats() EndpointStats
+
+	// SetOwner sets the task owner to the endpoint owner.
+	SetOwner(owner PacketOwner)
 }
 
 // EndpointInfo is the interface implemented by each endpoint info struct.
@@ -502,27 +585,90 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// BroadcastOption is used by SetSockOpt/GetSockOpt to specify whether
+	// datagram sockets are allowed to send packets to a broadcast address.
+	BroadcastOption SockOptBool = iota
+
+	// CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be
+	// held until segments are full by the TCP transport protocol.
+	CorkOption
+
+	// DelayOption is used by SetSockOpt/GetSockOpt to specify if data
+	// should be sent out immediately by the transport protocol. For TCP,
+	// it determines if the Nagle algorithm is on or off.
+	DelayOption
+
+	// KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether
+	// TCP keepalive is enabled for this socket.
+	KeepaliveEnabledOption
+
+	// MulticastLoopOption is used by SetSockOpt/GetSockOpt to specify whether
+	// multicast packets sent over a non-loopback interface will be looped back.
+	MulticastLoopOption
+
+	// PasscredOption is used by SetSockOpt/GetSockOpt to specify whether
+	// SCM_CREDENTIALS socket control messages are enabled.
+	//
+	// Only supported on Unix sockets.
+	PasscredOption
+
+	// QuickAckOption is stubbed out in SetSockOpt/GetSockOpt.
+	QuickAckOption
+
+	// ReceiveTClassOption is used by SetSockOpt/GetSockOpt to specify if the
+	// IPV6_TCLASS ancillary message is passed with incoming packets.
+	ReceiveTClassOption
+
 	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
 	// ancillary message is passed with incoming packets.
-	ReceiveTOSOption SockOptBool = iota
-
-	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
-	// socket is to be restricted to sending and receiving IPv6 packets only.
-	V6OnlyOption
+	ReceiveTOSOption
 
 	// ReceiveIPPacketInfoOption is used by {G,S}etSockOptBool to specify
 	// if more inforamtion is provided with incoming packets such
 	// as interface index and address.
 	ReceiveIPPacketInfoOption
+
+	// ReuseAddressOption is used by SetSockOpt/GetSockOpt to specify whether Bind()
+	// should allow reuse of local address.
+	ReuseAddressOption
+
+	// ReusePortOption is used by SetSockOpt/GetSockOpt to permit multiple sockets
+	// to be bound to an identical socket address.
+	ReusePortOption
+
+	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
+	// socket is to be restricted to sending and receiving IPv6 packets only.
+	V6OnlyOption
 )
 
 // SockOptInt represents socket options which values have the int type.
 type SockOptInt int
 
 const (
+	// KeepaliveCountOption is used by SetSockOpt/GetSockOpt to specify the number
+	// of un-ACKed TCP keepalives that will be sent before the connection is
+	// closed.
+	KeepaliveCountOption SockOptInt = iota
+
+	// IPv4TOSOption is used by SetSockOpt/GetSockOpt to specify TOS
+	// for all subsequent outgoing IPv4 packets from the endpoint.
+	IPv4TOSOption
+
+	// IPv6TrafficClassOption is used by SetSockOpt/GetSockOpt to specify TOS
+	// for all subsequent outgoing IPv6 packets from the endpoint.
+	IPv6TrafficClassOption
+
+	// MaxSegOption is used by SetSockOpt/GetSockOpt to set/get the current
+	// Maximum Segment Size(MSS) value as specified using the TCP_MAXSEG option.
+	MaxSegOption
+
+	// MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
+	// TTL value for multicast messages. The default is 1.
+	MulticastTTLOption
+
 	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
 	// number of unread bytes in the input buffer should be returned.
-	ReceiveQueueSizeOption SockOptInt = iota
+	ReceiveQueueSizeOption
 
 	// SendBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
 	// specify the send buffer size option.
@@ -536,44 +682,34 @@ const (
 	// number of unread bytes in the output buffer should be returned.
 	SendQueueSizeOption
 
-	// DelayOption is used by SetSockOpt/GetSockOpt to specify if data
-	// should be sent out immediately by the transport protocol. For TCP,
-	// it determines if the Nagle algorithm is on or off.
-	DelayOption
+	// TTLOption is used by SetSockOpt/GetSockOpt to control the default TTL/hop
+	// limit value for unicast messages. The default is protocol specific.
+	//
+	// A zero value indicates the default.
+	TTLOption
 
-	// TODO(b/137664753): convert all int socket options to be handled via
-	// GetSockOptInt.
+	// TCPSynCountOption is used by SetSockOpt/GetSockOpt to specify the number of
+	// SYN retransmits that TCP should send before aborting the attempt to
+	// connect. It cannot exceed 255.
+	//
+	// NOTE: This option is currently only stubbed out and is no-op.
+	TCPSynCountOption
+
+	// TCPWindowClampOption is used by SetSockOpt/GetSockOpt to bound the size
+	// of the advertised window to this value.
+	//
+	// NOTE: This option is currently only stubed out and is a no-op
+	TCPWindowClampOption
 )
 
 // ErrorOption is used in GetSockOpt to specify that the last error reported by
 // the endpoint should be cleared and returned.
 type ErrorOption struct{}
 
-// CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be
-// held until segments are full by the TCP transport protocol.
-type CorkOption int
-
-// ReuseAddressOption is used by SetSockOpt/GetSockOpt to specify whether Bind()
-// should allow reuse of local address.
-type ReuseAddressOption int
-
-// ReusePortOption is used by SetSockOpt/GetSockOpt to permit multiple sockets
-// to be bound to an identical socket address.
-type ReusePortOption int
-
 // BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
 // should bind only on a specific NIC.
 type BindToDeviceOption NICID
 
-// QuickAckOption is stubbed out in SetSockOpt/GetSockOpt.
-type QuickAckOption int
-
-// PasscredOption is used by SetSockOpt/GetSockOpt to specify whether
-// SCM_CREDENTIALS socket control messages are enabled.
-//
-// Only supported on Unix sockets.
-type PasscredOption int
-
 // TCPInfoOption is used by GetSockOpt to expose TCP statistics.
 //
 // TODO(b/64800844): Add and populate stat fields.
@@ -582,10 +718,6 @@ type TCPInfoOption struct {
 	RTTVar time.Duration
 }
 
-// KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether
-// TCP keepalive is enabled for this socket.
-type KeepaliveEnabledOption int
-
 // KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
 // connection must remain idle before the first TCP keepalive packet is sent.
 // Once this time is reached, KeepaliveIntervalOption is used instead.
@@ -595,11 +727,6 @@ type KeepaliveIdleOption time.Duration
 // interval between sending TCP keepalive packets.
 type KeepaliveIntervalOption time.Duration
 
-// KeepaliveCountOption is used by SetSockOpt/GetSockOpt to specify the number
-// of un-ACKed TCP keepalives that will be sent before the connection is
-// closed.
-type KeepaliveCountOption int
-
 // TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a user
 // specified timeout for a given TCP connection.
 // See: RFC5482 for details.
@@ -613,20 +740,9 @@ type CongestionControlOption string
 // control algorithms.
 type AvailableCongestionControlOption string
 
-// ModerateReceiveBufferOption allows the caller to enable/disable TCP receive
 // buffer moderation.
 type ModerateReceiveBufferOption bool
 
-// MaxSegOption is used by SetSockOpt/GetSockOpt to set/get the current
-// Maximum Segment Size(MSS) value as specified using the TCP_MAXSEG option.
-type MaxSegOption int
-
-// TTLOption is used by SetSockOpt/GetSockOpt to control the default TTL/hop
-// limit value for unicast messages. The default is protocol specific.
-//
-// A zero value indicates the default.
-type TTLOption uint8
-
 // TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
 // maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
 // before being marked closed.
@@ -643,9 +759,26 @@ type TCPTimeWaitTimeoutOption time.Duration
 // for a handshake till the specified timeout until a segment with data arrives.
 type TCPDeferAcceptOption time.Duration
 
-// MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
-// TTL value for multicast messages. The default is 1.
-type MulticastTTLOption uint8
+// TCPMinRTOOption is use by SetSockOpt/GetSockOpt to allow overriding
+// default MinRTO used by the Stack.
+type TCPMinRTOOption time.Duration
+
+// TCPMaxRTOOption is use by SetSockOpt/GetSockOpt to allow overriding
+// default MaxRTO used by the Stack.
+type TCPMaxRTOOption time.Duration
+
+// TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the
+// maximum number of retransmits after which we time out the connection.
+type TCPMaxRetriesOption uint64
+
+// TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
+// the number of endpoints that can be in SYN-RCVD state before the stack
+// switches to using SYN cookies.
+type TCPSynRcvdCountThresholdOption uint64
+
+// TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide
+// default for number of times SYN is retransmitted before aborting a connect.
+type TCPSynRetriesOption uint8
 
 // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
 // default interface for multicast.
@@ -654,10 +787,6 @@ type MulticastInterfaceOption struct {
 	InterfaceAddr Address
 }
 
-// MulticastLoopOption is used by SetSockOpt/GetSockOpt to specify whether
-// multicast packets sent over a non-loopback interface will be looped back.
-type MulticastLoopOption bool
-
 // MembershipOption is used by SetSockOpt/GetSockOpt as an argument to
 // AddMembershipOption and RemoveMembershipOption.
 type MembershipOption struct {
@@ -680,22 +809,10 @@ type RemoveMembershipOption MembershipOption
 // TCP out-of-band data is delivered along with the normal in-band data.
 type OutOfBandInlineOption int
 
-// BroadcastOption is used by SetSockOpt/GetSockOpt to specify whether
-// datagram sockets are allowed to send packets to a broadcast address.
-type BroadcastOption int
-
 // DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
 // a default TTL.
 type DefaultTTLOption uint8
 
-// IPv4TOSOption is used by SetSockOpt/GetSockOpt to specify TOS
-// for all subsequent outgoing IPv4 packets from the endpoint.
-type IPv4TOSOption uint8
-
-// IPv6TrafficClassOption is used by SetSockOpt/GetSockOpt to specify TOS
-// for all subsequent outgoing IPv6 packets from the endpoint.
-type IPv6TrafficClassOption uint8
-
 // IPPacketInfo is the message struture for IP_PKTINFO.
 //
 // +stateify savable
diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go
index 8c0aacffa..1c8e2bc34 100644
--- a/pkg/tcpip/tcpip_test.go
+++ b/pkg/tcpip/tcpip_test.go
@@ -218,7 +218,7 @@ func TestAddressWithPrefixSubnet(t *testing.T) {
 		gotSubnet := ap.Subnet()
 		wantSubnet, err := NewSubnet(tt.subnetAddr, tt.subnetMask)
 		if err != nil {
-			t.Error("NewSubnet(%q, %q) failed: %s", tt.subnetAddr, tt.subnetMask, err)
+			t.Errorf("NewSubnet(%q, %q) failed: %s", tt.subnetAddr, tt.subnetMask, err)
 			continue
 		}
 		if gotSubnet != wantSubnet {
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index 48764b978..7f172f978 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.9
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -25,6 +25,8 @@ import (
 )
 
 // StdClock implements Clock with the time package.
+//
+// +stateify savable
 type StdClock struct{}
 
 var _ Clock = (*StdClock)(nil)
diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go
index f5f01f32f..59f3b391f 100644
--- a/pkg/tcpip/timer.go
+++ b/pkg/tcpip/timer.go
@@ -88,6 +88,9 @@ func (t *cancellableTimerInstance) stop() {
 //
 // The term "related work" is defined as some work that needs to be done while
 // holding some lock that the timer must also hold while doing some work.
+//
+// Note, it is not safe to copy a CancellableTimer as its timer instance creates
+// a closure over the address of the CancellableTimer.
 type CancellableTimer struct {
 	// The active instance of a cancellable timer.
 	instance cancellableTimerInstance
@@ -131,10 +134,14 @@ func (t *CancellableTimer) StopLocked() {
 func (t *CancellableTimer) Reset(d time.Duration) {
 	// Create a new instance.
 	earlyReturn := false
+
+	// Capture the locker so that updating the timer does not cause a data race
+	// when a timer fires and tries to obtain the lock (read the timer's locker).
+	locker := t.locker
 	t.instance = cancellableTimerInstance{
 		timer: time.AfterFunc(d, func() {
-			t.locker.Lock()
-			defer t.locker.Unlock()
+			locker.Lock()
+			defer locker.Unlock()
 
 			if earlyReturn {
 				// If we reach this point, it means that the timer fired while another
@@ -150,12 +157,28 @@ func (t *CancellableTimer) Reset(d time.Duration) {
 	}
 }
 
-// MakeCancellableTimer returns an unscheduled CancellableTimer with the given
+// Lock is a no-op used by the copylocks checker from go vet.
+//
+// See CancellableTimer for details about why it shouldn't be copied.
+//
+// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more
+// details about the copylocks checker.
+func (*CancellableTimer) Lock() {}
+
+// Unlock is a no-op used by the copylocks checker from go vet.
+//
+// See CancellableTimer for details about why it shouldn't be copied.
+//
+// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more
+// details about the copylocks checker.
+func (*CancellableTimer) Unlock() {}
+
+// NewCancellableTimer returns an unscheduled CancellableTimer with the given
 // locker and fn.
 //
 // fn MUST NOT attempt to lock locker.
 //
 // Callers must call Reset to schedule the timer to fire.
-func MakeCancellableTimer(locker sync.Locker, fn func()) CancellableTimer {
-	return CancellableTimer{locker: locker, fn: fn}
+func NewCancellableTimer(locker sync.Locker, fn func()) *CancellableTimer {
+	return &CancellableTimer{locker: locker, fn: fn}
 }
diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go
index 2d20f7ef3..b4940e397 100644
--- a/pkg/tcpip/timer_test.go
+++ b/pkg/tcpip/timer_test.go
@@ -28,13 +28,38 @@ const (
 	longDuration   = 1 * time.Second
 )
 
+func TestCancellableTimerReassignment(t *testing.T) {
+	var timer tcpip.CancellableTimer
+	var wg sync.WaitGroup
+	var lock sync.Mutex
+
+	for i := 0; i < 2; i++ {
+		wg.Add(1)
+
+		go func() {
+			lock.Lock()
+			// Assigning a new timer value updates the timer's locker and function.
+			// This test makes sure there is no data race when reassigning a timer
+			// that has an active timer (even if it has been stopped as a stopped
+			// timer may be blocked on a lock before it can check if it has been
+			// stopped while another goroutine holds the same lock).
+			timer = *tcpip.NewCancellableTimer(&lock, func() {
+				wg.Done()
+			})
+			timer.Reset(shortDuration)
+			lock.Unlock()
+		}()
+	}
+	wg.Wait()
+}
+
 func TestCancellableTimerFire(t *testing.T) {
 	t.Parallel()
 
 	ch := make(chan struct{})
 	var lock sync.Mutex
 
-	timer := tcpip.MakeCancellableTimer(&lock, func() {
+	timer := tcpip.NewCancellableTimer(&lock, func() {
 		ch <- struct{}{}
 	})
 	timer.Reset(shortDuration)
@@ -60,7 +85,7 @@ func TestCancellableTimerResetFromLongDuration(t *testing.T) {
 	ch := make(chan struct{})
 	var lock sync.Mutex
 
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(middleDuration)
 
 	lock.Lock()
@@ -91,7 +116,7 @@ func TestCancellableTimerResetFromShortDuration(t *testing.T) {
 	var lock sync.Mutex
 
 	lock.Lock()
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(shortDuration)
 	timer.StopLocked()
 	lock.Unlock()
@@ -128,7 +153,7 @@ func TestCancellableTimerImmediatelyStop(t *testing.T) {
 
 	for i := 0; i < 1000; i++ {
 		lock.Lock()
-		timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+		timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 		timer.Reset(shortDuration)
 		timer.StopLocked()
 		lock.Unlock()
@@ -149,7 +174,7 @@ func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) {
 	var lock sync.Mutex
 
 	lock.Lock()
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(shortDuration)
 	timer.StopLocked()
 	lock.Unlock()
@@ -180,7 +205,7 @@ func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) {
 	var lock sync.Mutex
 
 	lock.Lock()
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(shortDuration)
 	for i := 0; i < 10; i++ {
 		// Sleep until the timer fires and gets blocked trying to take the lock.
@@ -212,7 +237,7 @@ func TestManyCancellableTimerResetUnderLock(t *testing.T) {
 	var lock sync.Mutex
 
 	lock.Lock()
-	timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
 	timer.Reset(shortDuration)
 	for i := 0; i < 10; i++ {
 		timer.StopLocked()
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index ac18ec5b1..9ce625c17 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -31,7 +31,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
         "//pkg/tcpip/transport/tcp",
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 42afb3f5b..57e0a069b 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -19,7 +19,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -74,6 +73,9 @@ type endpoint struct {
 	route         stack.Route `state:"manual"`
 	ttl           uint8
 	stats         tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
 }
 
 func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
@@ -96,6 +98,11 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
@@ -129,9 +136,8 @@ func (e *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
-	return e.stack.IPTables(), nil
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner
 }
 
 // Read reads data from the endpoint. This method does not block if
@@ -286,15 +292,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			nicID = e.BindNICID
 		}
 
-		toCopy := *to
-		to = &toCopy
-		netProto, err := e.checkV4Mapped(to)
+		dst, netProto, err := e.checkV4MappedLocked(*to)
 		if err != nil {
 			return 0, nil, err
 		}
 
-		// Find the enpoint.
-		r, err := e.stack.FindRoute(nicID, e.BindAddr, to.Addr, netProto, false /* multicastLoop */)
+		// Find the endpoint.
+		r, err := e.stack.FindRoute(nicID, e.BindAddr, dst.Addr, netProto, false /* multicastLoop */)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -319,7 +323,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		err = send4(route, e.ID.LocalPort, v, e.ttl)
+		err = send4(route, e.ID.LocalPort, v, e.ttl, e.owner)
 
 	case header.IPv6ProtocolNumber:
 		err = send6(route, e.ID.LocalPort, v, e.ttl)
@@ -339,13 +343,6 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 
 // SetSockOpt sets a socket option.
 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
-	case tcpip.TTLOption:
-		e.mu.Lock()
-		e.ttl = uint8(o)
-		e.mu.Unlock()
-	}
-
 	return nil
 }
 
@@ -356,12 +353,25 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 
 // SetSockOptInt sets a socket option. Currently not supported.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	switch opt {
+	case tcpip.TTLOption:
+		e.mu.Lock()
+		e.ttl = uint8(v)
+		e.mu.Unlock()
+
+	}
 	return nil
 }
 
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	return false, tcpip.ErrUnknownProtocolOption
+	switch opt {
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
@@ -388,32 +398,29 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		e.rcvMu.Unlock()
 		return v, nil
 
+	case tcpip.TTLOption:
+		e.rcvMu.Lock()
+		v := int(e.ttl)
+		e.rcvMu.Unlock()
+		return v, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
 	}
-	return -1, tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
+	switch opt.(type) {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
-	case *tcpip.TTLOption:
-		e.rcvMu.Lock()
-		*o = tcpip.TTLOption(e.ttl)
-		e.rcvMu.Unlock()
-		return nil
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Error {
+func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error {
 	if len(data) < header.ICMPv4MinimumSize {
 		return tcpip.ErrInvalidEndpointState
 	}
@@ -438,10 +445,11 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 		Header:          hdr,
 		Data:            data.ToVectorisedView(),
 		TransportHeader: buffer.View(icmpv4),
+		Owner:           owner,
 	})
 }
 
@@ -468,20 +476,21 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 		Header:          hdr,
 		Data:            dataVV,
 		TransportHeader: buffer.View(icmpv6),
 	})
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, false /* v6only */)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, false /* v6only */)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -497,6 +506,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	nicID := addr.NIC
 	localPort := uint16(0)
 	switch e.state {
+	case stateInitial:
 	case stateBound, stateConnected:
 		localPort = e.ID.LocalPort
 		if e.BindNICID == 0 {
@@ -512,7 +522,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -625,7 +635,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -729,19 +739,19 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		h := header.ICMPv4(pkt.Data.First())
-		if h.Type() != header.ICMPv4EchoReply {
+		h, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
+		if !ok || header.ICMPv4(h).Type() != header.ICMPv4EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
 		}
 	case header.IPv6ProtocolNumber:
-		h := header.ICMPv6(pkt.Data.First())
-		if h.Type() != header.ICMPv6EchoReply {
+		h, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize)
+		if !ok || header.ICMPv6(h).Type() != header.ICMPv6EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
@@ -791,7 +801,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index 9ce500e80..74ef6541e 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -104,20 +104,36 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool {
 	return true
 }
 
-// SetOption implements TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.TransportProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.TransportProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	// TODO(gvisor.dev/issue/170): Implement parsing of ICMP.
+	//
+	// Right now, the Parse() method is tied to enabled protocols passed into
+	// stack.New. This works for UDP and TCP, but we handle ICMP traffic even
+	// when netstack users don't pass ICMP as a supported protocol.
+	return false
+}
+
 // NewProtocol4 returns an ICMPv4 transport protocol.
 func NewProtocol4() stack.TransportProtocol {
 	return &protocol{ProtocolNumber4}
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
index d22de6b26..b989b1209 100644
--- a/pkg/tcpip/transport/packet/BUILD
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -31,7 +31,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/waiter",
     ],
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index fc5bc69fa..baf08eda6 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -29,7 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -76,6 +75,7 @@ type endpoint struct {
 	sndBufSize int
 	closed     bool
 	stats      tcpip.TransportEndpointStats `state:"nosave"`
+	bound      bool
 }
 
 // NewEndpoint returns a new packet endpoint.
@@ -98,6 +98,11 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb
 	return ep, nil
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (ep *endpoint) Abort() {
+	ep.Close()
+}
+
 // Close implements tcpip.Endpoint.Close.
 func (ep *endpoint) Close() {
 	ep.mu.Lock()
@@ -120,17 +125,13 @@ func (ep *endpoint) Close() {
 	}
 
 	ep.closed = true
+	ep.bound = false
 	ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 }
 
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (ep *endpoint) ModerateRecvBuf(copied int) {}
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (ep *endpoint) IPTables() (iptables.IPTables, error) {
-	return ep.stack.IPTables(), nil
-}
-
 // Read implements tcpip.Endpoint.Read.
 func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
 	ep.rcvMu.Lock()
@@ -211,7 +212,24 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 	// sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
 	// - packet(7).
 
-	return tcpip.ErrNotSupported
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.bound {
+		return tcpip.ErrAlreadyBound
+	}
+
+	// Unregister endpoint with all the nics.
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	// Bind endpoint to receive packets from specific interface.
+	if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil {
+		return err
+	}
+
+	ep.bound = true
+
+	return nil
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
@@ -275,7 +293,7 @@ func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
-func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	ep.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
@@ -369,3 +387,5 @@ func (ep *endpoint) Info() tcpip.EndpointInfo {
 func (ep *endpoint) Stats() tcpip.EndpointStats {
 	return &ep.stats
 }
+
+func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {}
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index c9baf4600..2eab09088 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -32,7 +32,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/packet",
         "//pkg/waiter",
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index ee9c4c58b..a406d815e 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -30,7 +30,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -81,6 +80,9 @@ type endpoint struct {
 	// Connect(), and is valid only when conneted is true.
 	route stack.Route                  `state:"manual"`
 	stats tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
 }
 
 // NewEndpoint returns a raw  endpoint for the given protocols.
@@ -121,6 +123,11 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 	return e, nil
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close implements tcpip.Endpoint.Close.
 func (e *endpoint) Close() {
 	e.mu.Lock()
@@ -155,9 +162,8 @@ func (e *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
-	return e.stack.IPTables(), nil
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner
 }
 
 // Read implements tcpip.Endpoint.Read.
@@ -337,17 +343,19 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
 		if !e.associated {
-			if err := route.WriteHeaderIncludedPacket(tcpip.PacketBuffer{
+			if err := route.WriteHeaderIncludedPacket(&stack.PacketBuffer{
 				Data: buffer.View(payloadBytes).ToVectorisedView(),
 			}); err != nil {
 				return 0, nil, err
 			}
 			break
 		}
+
 		hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
-		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
+		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buffer.View(payloadBytes).ToVectorisedView(),
+			Owner:  e.owner,
 		}); err != nil {
 			return 0, nil, err
 		}
@@ -520,14 +528,10 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
-	switch o := opt.(type) {
+	switch opt.(type) {
 	case tcpip.ErrorOption:
 		return nil
 
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -535,7 +539,13 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	return false, tcpip.ErrUnknownProtocolOption
+	switch opt {
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
@@ -563,13 +573,13 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		e.rcvMu.Unlock()
 		return v, nil
 
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
 	}
-
-	return -1, tcpip.ErrUnknownProtocolOption
 }
 
 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
-func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(route *stack.Route, pkt *stack.PacketBuffer) {
 	e.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
@@ -617,8 +627,9 @@ func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
 		},
 	}
 
-	networkHeader := append(buffer.View(nil), pkt.NetworkHeader...)
-	combinedVV := networkHeader.ToVectorisedView()
+	headers := append(buffer.View(nil), pkt.NetworkHeader...)
+	headers = append(headers, pkt.TransportHeader...)
+	combinedVV := headers.ToVectorisedView()
 	combinedVV.Append(pkt.Data)
 	packet.data = combinedVV
 	packet.timestampNS = e.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 272e8f570..e26f01fae 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -32,6 +32,7 @@ go_library(
     srcs = [
         "accept.go",
         "connect.go",
+        "connect_unsafe.go",
         "cubic.go",
         "cubic_state.go",
         "dispatcher.go",
@@ -65,12 +66,10 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
-        "//pkg/tmutex",
         "//pkg/waiter",
         "@com_github_google_btree//:go_default_library",
     ],
@@ -87,8 +86,6 @@ go_test(
         "tcp_test.go",
         "tcp_timestamp_test.go",
     ],
-    # FIXME(b/68809571)
-    tags = ["flaky"],
     deps = [
         ":tcp",
         "//pkg/sync",
@@ -104,6 +101,17 @@ go_test(
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/tcp/testing/context",
+        "//pkg/test/testutil",
         "//pkg/waiter",
     ],
 )
+
+go_test(
+    name = "rcv_test",
+    size = "small",
+    srcs = ["rcv_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+    ],
+)
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 08afb7c17..e6a23c978 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -17,6 +17,7 @@ package tcp
 import (
 	"crypto/sha1"
 	"encoding/binary"
+	"fmt"
 	"hash"
 	"io"
 	"time"
@@ -25,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -49,17 +49,14 @@ const (
 	// timestamp and the current timestamp. If the difference is greater
 	// than maxTSDiff, the cookie is expired.
 	maxTSDiff = 2
-)
 
-var (
-	// SynRcvdCountThreshold is the global maximum number of connections
-	// that are allowed to be in SYN-RCVD state before TCP starts using SYN
-	// cookies to accept connections.
-	//
-	// It is an exported variable only for testing, and should not otherwise
-	// be used by importers of this package.
+	// SynRcvdCountThreshold is the default global maximum number of
+	// connections that are allowed to be in SYN-RCVD state before TCP
+	// starts using SYN cookies to accept connections.
 	SynRcvdCountThreshold uint64 = 1000
+)
 
+var (
 	// mssTable is a slice containing the possible MSS values that we
 	// encode in the SYN cookie with two bits.
 	mssTable = []uint16{536, 1300, 1440, 1460}
@@ -74,29 +71,42 @@ func encodeMSS(mss uint16) uint32 {
 	return 0
 }
 
-// syncRcvdCount is the number of endpoints in the SYN-RCVD state. The value is
-// protected by a mutex so that we can increment only when it's guaranteed not
-// to go above a threshold.
-var synRcvdCount struct {
-	sync.Mutex
-	value   uint64
-	pending sync.WaitGroup
-}
-
 // listenContext is used by a listening endpoint to store state used while
 // listening for connections. This struct is allocated by the listen goroutine
 // and must not be accessed or have its methods called concurrently as they
 // may mutate the stored objects.
 type listenContext struct {
-	stack    *stack.Stack
-	rcvWnd   seqnum.Size
-	nonce    [2][sha1.BlockSize]byte
+	stack *stack.Stack
+
+	// synRcvdCount is a reference to the stack level synRcvdCount.
+	synRcvdCount *synRcvdCounter
+
+	// rcvWnd is the receive window that is sent by this listening context
+	// in the initial SYN-ACK.
+	rcvWnd seqnum.Size
+
+	// nonce are random bytes that are initialized once when the context
+	// is created and used to seed the hash function when generating
+	// the SYN cookie.
+	nonce [2][sha1.BlockSize]byte
+
+	// listenEP is a reference to the listening endpoint associated with
+	// this context. Can be nil if the context is created by the forwarder.
 	listenEP *endpoint
 
+	// hasherMu protects hasher.
 	hasherMu sync.Mutex
-	hasher   hash.Hash
-	v6only   bool
+	// hasher is the hash function used to generate a SYN cookie.
+	hasher hash.Hash
+
+	// v6Only is true if listenEP is a dual stack socket and has the
+	// IPV6_V6ONLY option set.
+	v6Only bool
+
+	// netProto indicates the network protocol(IPv4/v6) for the listening
+	// endpoint.
 	netProto tcpip.NetworkProtocolNumber
+
 	// pendingMu protects pendingEndpoints. This should only be accessed
 	// by the listening endpoint's worker goroutine.
 	//
@@ -115,55 +125,22 @@ func timeStamp() uint32 {
 	return uint32(time.Now().Unix()>>6) & tsMask
 }
 
-// incSynRcvdCount tries to increment the global number of endpoints in SYN-RCVD
-// state. It succeeds if the increment doesn't make the count go beyond the
-// threshold, and fails otherwise.
-func incSynRcvdCount() bool {
-	synRcvdCount.Lock()
-
-	if synRcvdCount.value >= SynRcvdCountThreshold {
-		synRcvdCount.Unlock()
-		return false
-	}
-
-	synRcvdCount.pending.Add(1)
-	synRcvdCount.value++
-
-	synRcvdCount.Unlock()
-	return true
-}
-
-// decSynRcvdCount atomically decrements the global number of endpoints in
-// SYN-RCVD state. It must only be called if a previous call to incSynRcvdCount
-// succeeded.
-func decSynRcvdCount() {
-	synRcvdCount.Lock()
-
-	synRcvdCount.value--
-	synRcvdCount.pending.Done()
-	synRcvdCount.Unlock()
-}
-
-// synCookiesInUse() returns true if the synRcvdCount is greater than
-// SynRcvdCountThreshold.
-func synCookiesInUse() bool {
-	synRcvdCount.Lock()
-	v := synRcvdCount.value
-	synRcvdCount.Unlock()
-	return v >= SynRcvdCountThreshold
-}
-
 // newListenContext creates a new listen context.
-func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
+func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
 	l := &listenContext{
 		stack:            stk,
 		rcvWnd:           rcvWnd,
 		hasher:           sha1.New(),
-		v6only:           v6only,
+		v6Only:           v6Only,
 		netProto:         netProto,
 		listenEP:         listenEP,
 		pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
 	}
+	p, ok := stk.TransportProtocolInstance(ProtocolNumber).(*protocol)
+	if !ok {
+		panic(fmt.Sprintf("unable to get TCP protocol instance from stack: %+v", stk))
+	}
+	l.synRcvdCount = p.SynRcvdCounter()
 
 	rand.Read(l.nonce[0][:])
 	rand.Read(l.nonce[1][:])
@@ -221,7 +198,8 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 }
 
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
-// the connection parameters given by the arguments.
+// the connection parameters given by the arguments. The endpoint is returned
+// with n.mu held.
 func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create a new endpoint.
 	netProto := l.netProto
@@ -229,34 +207,20 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 		netProto = s.route.NetProto
 	}
 	n := newEndpoint(l.stack, netProto, queue)
-	n.v6only = l.v6only
+	n.v6only = l.v6Only
 	n.ID = s.id
 	n.boundNICID = s.route.NICID()
 	n.route = s.route.Clone()
 	n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.route.NetProto}
 	n.rcvBufSize = int(l.rcvWnd)
 	n.amss = mssForRoute(&n.route)
+	n.setEndpointState(StateConnecting)
 
 	n.maybeEnableTimestamp(rcvdSynOpts)
 	n.maybeEnableSACKPermitted(rcvdSynOpts)
 
 	n.initGSO()
 
-	// Now inherit any socket options that should be inherited from the
-	// listening endpoint.
-	// In case of Forwarder listenEP will be nil and hence this check.
-	if l.listenEP != nil {
-		l.listenEP.propagateInheritableOptions(n)
-	}
-
-	// Register new endpoint so that packets are routed to it.
-	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
-		n.Close()
-		return nil, err
-	}
-
-	n.isRegistered = true
-
 	// Create sender and receiver.
 	//
 	// The receiver at least temporarily has a zero receive window scale,
@@ -268,12 +232,28 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	// window to grow to a really large value.
 	n.rcvAutoParams.prevCopied = n.initialReceiveWindow()
 
+	// Lock the endpoint before registering to ensure that no out of
+	// band changes are possible due to incoming packets etc till
+	// the endpoint is done initializing.
+	n.mu.Lock()
+
+	// Register new endpoint so that packets are routed to it.
+	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
+		n.mu.Unlock()
+		n.Close()
+		return nil, err
+	}
+
+	n.isRegistered = true
+
 	return n, nil
 }
 
 // createEndpointAndPerformHandshake creates a new endpoint in connected state
 // and then performs the TCP 3-way handshake.
-func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
+//
+// The new endpoint is returned with e.mu held.
+func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
 	isn := generateSecureISN(s.id, l.stack.Seed())
@@ -281,6 +261,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	if err != nil {
 		return nil, err
 	}
+	ep.owner = owner
 
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
 	deferAccept := time.Duration(0)
@@ -288,25 +269,50 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		l.listenEP.mu.Lock()
 		if l.listenEP.EndpointState() != StateListen {
 			l.listenEP.mu.Unlock()
+			// Ensure we release any registrations done by the newly
+			// created endpoint.
+			ep.mu.Unlock()
+			ep.Close()
+
+			// Wake up any waiters. This is strictly not required normally
+			// as a socket that was never accepted can't really have any
+			// registered waiters except when stack.Wait() is called which
+			// waits for all registered endpoints to stop and expects an
+			// EventHUp.
+			ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 			return nil, tcpip.ErrConnectionAborted
 		}
 		l.addPendingEndpoint(ep)
+
+		// Propagate any inheritable options from the listening endpoint
+		// to the newly created endpoint.
+		l.listenEP.propagateInheritableOptionsLocked(ep)
+
 		deferAccept = l.listenEP.deferAccept
 		l.listenEP.mu.Unlock()
 	}
 
 	// Perform the 3-way handshake.
-	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
+	h := newPassiveHandshake(ep, ep.rcv.rcvWnd, isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
+		ep.mu.Unlock()
 		ep.Close()
+		// Wake up any waiters. This is strictly not required normally
+		// as a socket that was never accepted can't really have any
+		// registered waiters except when stack.Wait() is called which
+		// waits for all registered endpoints to stop and expects an
+		// EventHUp.
+		ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+
 		if l.listenEP != nil {
 			l.removePendingEndpoint(ep)
 		}
+
+		ep.drainClosingSegmentQueue()
+
 		return nil, err
 	}
-	ep.mu.Lock()
 	ep.isConnectNotified = true
-	ep.mu.Unlock()
 
 	// Update the receive window scaling. We can't do it before the
 	// handshake because it's possible that the peer doesn't support window
@@ -340,30 +346,38 @@ func (l *listenContext) closeAllPendingEndpoints() {
 }
 
 // deliverAccepted delivers the newly-accepted endpoint to the listener. If the
-// endpoint has transitioned out of the listen state, the new endpoint is closed
-// instead.
+// endpoint has transitioned out of the listen state (acceptedChan is nil),
+// the new endpoint is closed instead.
 func (e *endpoint) deliverAccepted(n *endpoint) {
 	e.mu.Lock()
-	state := e.EndpointState()
 	e.pendingAccepted.Add(1)
-	defer e.pendingAccepted.Done()
-	acceptedChan := e.acceptedChan
 	e.mu.Unlock()
+	defer e.pendingAccepted.Done()
 
-	if state == StateListen {
-		acceptedChan <- n
-		e.waiterQueue.Notify(waiter.EventIn)
-	} else {
-		n.Close()
+	e.acceptMu.Lock()
+	for {
+		if e.acceptedChan == nil {
+			e.acceptMu.Unlock()
+			n.notifyProtocolGoroutine(notifyReset)
+			return
+		}
+		select {
+		case e.acceptedChan <- n:
+			e.acceptMu.Unlock()
+			e.waiterQueue.Notify(waiter.EventIn)
+			return
+		default:
+			e.acceptCond.Wait()
+		}
 	}
 }
 
-// propagateInheritableOptions propagates any options set on the listening
+// propagateInheritableOptionsLocked propagates any options set on the listening
 // endpoint to the newly created endpoint.
-func (e *endpoint) propagateInheritableOptions(n *endpoint) {
-	e.mu.Lock()
+//
+// Precondition: e.mu and n.mu must be held.
+func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) {
 	n.userTimeout = e.userTimeout
-	e.mu.Unlock()
 }
 
 // handleSynSegment is called in its own goroutine once the listening endpoint
@@ -373,11 +387,15 @@ func (e *endpoint) propagateInheritableOptions(n *endpoint) {
 // A limited number of these goroutines are allowed before TCP starts using SYN
 // cookies to accept connections.
 func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) {
-	defer decSynRcvdCount()
-	defer e.decSynRcvdCount()
+	defer ctx.synRcvdCount.dec()
+	defer func() {
+		e.mu.Lock()
+		e.decSynRcvdCount()
+		e.mu.Unlock()
+	}()
 	defer s.decRef()
 
-	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
+	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{}, e.owner)
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 		e.stats.FailedConnectionAttempts.Increment()
@@ -391,40 +409,39 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 }
 
 func (e *endpoint) incSynRcvdCount() bool {
-	e.mu.Lock()
-	if e.synRcvdCount >= cap(e.acceptedChan) {
-		e.mu.Unlock()
-		return false
+	e.acceptMu.Lock()
+	canInc := e.synRcvdCount < cap(e.acceptedChan)
+	e.acceptMu.Unlock()
+	if canInc {
+		e.synRcvdCount++
 	}
-	e.synRcvdCount++
-	e.mu.Unlock()
-	return true
+	return canInc
 }
 
 func (e *endpoint) decSynRcvdCount() {
-	e.mu.Lock()
 	e.synRcvdCount--
-	e.mu.Unlock()
 }
 
 func (e *endpoint) acceptQueueIsFull() bool {
-	e.mu.Lock()
-	if l, c := len(e.acceptedChan)+e.synRcvdCount, cap(e.acceptedChan); l >= c {
-		e.mu.Unlock()
-		return true
-	}
-	e.mu.Unlock()
-	return false
+	e.acceptMu.Lock()
+	full := len(e.acceptedChan)+e.synRcvdCount >= cap(e.acceptedChan)
+	e.acceptMu.Unlock()
+	return full
 }
 
 // handleListenSegment is called when a listening endpoint receives a segment
 // and needs to handle it.
 func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
-	if s.flagsAreSet(header.TCPFlagSyn | header.TCPFlagAck) {
+	e.rcvListMu.Lock()
+	rcvClosed := e.rcvClosed
+	e.rcvListMu.Unlock()
+	if rcvClosed || s.flagsAreSet(header.TCPFlagSyn|header.TCPFlagAck) {
+		// If the endpoint is shutdown, reply with reset.
+		//
 		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
 		// must be sent in response to a SYN-ACK while in the listen
 		// state to prevent completing a handshake from an old SYN.
-		e.sendTCP(&s.route, s.id, buffer.VectorisedView{}, e.ttl, e.sendTOS, header.TCPFlagRst, s.ackNumber, 0, 0, nil, nil)
+		replyWithReset(s, e.sendTOS, e.ttl)
 		return
 	}
 
@@ -434,7 +451,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 	switch {
 	case s.flags == header.TCPFlagSyn:
 		opts := parseSynSegmentOptions(s)
-		if incSynRcvdCount() {
+		if ctx.synRcvdCount.inc() {
 			// Only handle the syn if the following conditions hold
 			//   - accept queue is not full.
 			//   - number of connections in synRcvd state is less than the
@@ -444,7 +461,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 				go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier.
 				return
 			}
-			decSynRcvdCount()
+			ctx.synRcvdCount.dec()
 			e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
 			e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment()
 			e.stack.Stats().DroppedPackets.Increment()
@@ -472,7 +489,15 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 				TSEcr: opts.TSVal,
 				MSS:   mssForRoute(&s.route),
 			}
-			e.sendSynTCP(&s.route, s.id, e.ttl, e.sendTOS, header.TCPFlagSyn|header.TCPFlagAck, cookie, s.sequenceNumber+1, ctx.rcvWnd, synOpts)
+			e.sendSynTCP(&s.route, tcpFields{
+				id:     s.id,
+				ttl:    e.ttl,
+				tos:    e.sendTOS,
+				flags:  header.TCPFlagSyn | header.TCPFlagAck,
+				seq:    cookie,
+				ack:    s.sequenceNumber + 1,
+				rcvWnd: ctx.rcvWnd,
+			}, synOpts)
 			e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
 		}
 
@@ -489,7 +514,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			return
 		}
 
-		if !synCookiesInUse() {
+		if !ctx.synRcvdCount.synCookiesInUse() {
 			// When not using SYN cookies, as per RFC 793, section 3.9, page 64:
 			// Any acknowledgment is bad if it arrives on a connection still in
 			// the LISTEN state.  An acceptable reset segment should be formed
@@ -505,7 +530,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			// The only time we should reach here when a connection
 			// was opened and closed really quickly and a delayed
 			// ACK was received from the sender.
-			replyWithReset(s)
+			replyWithReset(s, e.sendTOS, e.ttl)
 			return
 		}
 
@@ -551,6 +576,10 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			return
 		}
 
+		// Propagate any inheritable options from the listening endpoint
+		// to the newly created endpoint.
+		e.propagateInheritableOptionsLocked(n)
+
 		// clear the tsOffset for the newly created
 		// endpoint as the Timestamp was already
 		// randomly offset when the original SYN-ACK was
@@ -584,15 +613,13 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 // its own goroutine and is responsible for handling connection requests.
 func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 	e.mu.Lock()
-	v6only := e.v6only
-	e.mu.Unlock()
-	ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.NetProto)
+	v6Only := e.v6only
+	ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto)
 
 	defer func() {
 		// Mark endpoint as closed. This will prevent goroutines running
 		// handleSynSegment() from attempting to queue new connections
 		// to the endpoint.
-		e.mu.Lock()
 		e.setEndpointState(StateClose)
 
 		// close any endpoints in SYN-RCVD state.
@@ -606,15 +633,20 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 		}
 		e.mu.Unlock()
 
+		e.drainClosingSegmentQueue()
+
 		// Notify waiters that the endpoint is shutdown.
-		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut)
+		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
 	}()
 
 	s := sleep.Sleeper{}
 	s.AddWaker(&e.notificationWaker, wakerForNotification)
 	s.AddWaker(&e.newSegmentWaker, wakerForNewSegment)
 	for {
-		switch index, _ := s.Fetch(true); index {
+		e.mu.Unlock()
+		index, _ := s.Fetch(true)
+		e.mu.Lock()
+		switch index {
 		case wakerForNotification:
 			n := e.fetchNotifications()
 			if n&notifyClose != 0 {
@@ -627,7 +659,9 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 					s.decRef()
 				}
 				close(e.drainDone)
+				e.mu.Unlock()
 				<-e.undrain
+				e.mu.Lock()
 			}
 
 		case wakerForNewSegment:
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 5c5397823..7da93dcc4 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -61,6 +61,9 @@ const (
 )
 
 // handshake holds the state used during a TCP 3-way handshake.
+//
+// NOTE: handshake.ep.mu is held during handshake processing. It is released if
+// we are going to block and reacquired when we start processing an event.
 type handshake struct {
 	ep     *endpoint
 	state  handshakeState
@@ -102,24 +105,11 @@ type handshake struct {
 }
 
 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
-	rcvWndScale := ep.rcvWndScaleForHandshake()
-
-	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
-	// window offered in SYN won't be reduced due to the loss of precision if
-	// window scaling is enabled after the handshake.
-	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
-
-	// Ensure we can always accept at least 1 byte if the scale specified
-	// was too high for the provided rcvWnd.
-	if rcvWnd == 0 {
-		rcvWnd = 1
-	}
-
 	h := handshake{
 		ep:          ep,
 		active:      true,
 		rcvWnd:      rcvWnd,
-		rcvWndScale: int(rcvWndScale),
+		rcvWndScale: ep.rcvWndScaleForHandshake(),
 	}
 	h.resetState()
 	return h
@@ -209,9 +199,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
 	h.deferAccept = deferAccept
-	h.ep.mu.Lock()
 	h.ep.setEndpointState(StateSynRecv)
-	h.ep.mu.Unlock()
 }
 
 // checkAck checks if the ACK number, if present, of a segment received during
@@ -241,9 +229,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
 			// was acceptable then signal the user "error: connection reset", drop
 			// the segment, enter CLOSED state, delete TCB, and return."
-			h.ep.mu.Lock()
 			h.ep.workerCleanup = true
-			h.ep.mu.Unlock()
 			// Although the RFC above calls out ECONNRESET, Linux actually returns
 			// ECONNREFUSED here so we do as well.
 			return tcpip.ErrConnectionRefused
@@ -281,9 +267,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	if s.flagIsSet(header.TCPFlagAck) {
 		h.state = handshakeCompleted
 
-		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
-		h.ep.mu.Unlock()
 
 		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
 		return nil
@@ -293,10 +277,9 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// but resend our own SYN and wait for it to be acknowledged in the
 	// SYN-RCVD state.
 	h.state = handshakeSynRcvd
-	h.ep.mu.Lock()
 	ttl := h.ep.ttl
+	amss := h.ep.amss
 	h.ep.setEndpointState(StateSynRecv)
-	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
 		WS:    int(h.effectiveRcvWndScale()),
 		TS:    rcvSynOpts.TS,
@@ -307,12 +290,20 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 		// permits SACK. This is not explicitly defined in the RFC but
 		// this is the behaviour implemented by Linux.
 		SACKPermitted: rcvSynOpts.SACKPermitted,
-		MSS:           h.ep.amss,
+		MSS:           amss,
 	}
 	if ttl == 0 {
 		ttl = s.route.DefaultTTL()
 	}
-	h.ep.sendSynTCP(&s.route, h.ep.ID, ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+	h.ep.sendSynTCP(&s.route, tcpFields{
+		id:     h.ep.ID,
+		ttl:    ttl,
+		tos:    h.ep.sendTOS,
+		flags:  h.flags,
+		seq:    h.iss,
+		ack:    h.ackNum,
+		rcvWnd: h.rcvWnd,
+	}, synOpts)
 	return nil
 }
 
@@ -365,7 +356,15 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			SACKPermitted: h.ep.sackPermitted,
 			MSS:           h.ep.amss,
 		}
-		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+		h.ep.sendSynTCP(&s.route, tcpFields{
+			id:     h.ep.ID,
+			ttl:    h.ep.ttl,
+			tos:    h.ep.sendTOS,
+			flags:  h.flags,
+			seq:    h.iss,
+			ack:    h.ackNum,
+			rcvWnd: h.rcvWnd,
+		}, synOpts)
 		return nil
 	}
 
@@ -394,15 +393,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 		}
 		h.state = handshakeCompleted
 
-		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
+
 		// If the segment has data then requeue it for the receiver
 		// to process it again once main loop is started.
 		if s.data.Size() > 0 {
 			s.incRef()
 			h.ep.enqueueSegment(s)
 		}
-		h.ep.mu.Unlock()
 		return nil
 	}
 
@@ -488,7 +486,9 @@ func (h *handshake) resolveRoute() *tcpip.Error {
 			}
 			if n&notifyDrain != 0 {
 				close(h.ep.drainDone)
+				h.ep.mu.Unlock()
 				<-h.ep.undrain
+				h.ep.mu.Lock()
 			}
 		}
 
@@ -553,10 +553,23 @@ func (h *handshake) execute() *tcpip.Error {
 			synOpts.WS = -1
 		}
 	}
-	h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+
+	h.ep.sendSynTCP(&h.ep.route, tcpFields{
+		id:     h.ep.ID,
+		ttl:    h.ep.ttl,
+		tos:    h.ep.sendTOS,
+		flags:  h.flags,
+		seq:    h.iss,
+		ack:    h.ackNum,
+		rcvWnd: h.rcvWnd,
+	}, synOpts)
 
 	for h.state != handshakeCompleted {
-		switch index, _ := s.Fetch(true); index {
+		h.ep.mu.Unlock()
+		index, _ := s.Fetch(true)
+		h.ep.mu.Lock()
+		switch index {
+
 		case wakerForResend:
 			timeOut *= 2
 			if timeOut > MaxRTO {
@@ -572,12 +585,20 @@ func (h *handshake) execute() *tcpip.Error {
 			// the connection with another ACK or data (as ACKs are never
 			// retransmitted on their own).
 			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
-				h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+				h.ep.sendSynTCP(&h.ep.route, tcpFields{
+					id:     h.ep.ID,
+					ttl:    h.ep.ttl,
+					tos:    h.ep.sendTOS,
+					flags:  h.flags,
+					seq:    h.iss,
+					ack:    h.ackNum,
+					rcvWnd: h.rcvWnd,
+				}, synOpts)
 			}
 
 		case wakerForNotification:
 			n := h.ep.fetchNotifications()
-			if n&notifyClose != 0 {
+			if (n&notifyClose)|(n&notifyAbort) != 0 {
 				return tcpip.ErrAborted
 			}
 			if n&notifyDrain != 0 {
@@ -593,7 +614,9 @@ func (h *handshake) execute() *tcpip.Error {
 					}
 				}
 				close(h.ep.drainDone)
+				h.ep.mu.Unlock()
 				<-h.ep.undrain
+				h.ep.mu.Lock()
 			}
 
 		case wakerForNewSegment:
@@ -617,17 +640,17 @@ func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
 
 var optionPool = sync.Pool{
 	New: func() interface{} {
-		return make([]byte, maxOptionSize)
+		return &[maxOptionSize]byte{}
 	},
 }
 
 func getOptions() []byte {
-	return optionPool.Get().([]byte)
+	return (*optionPool.Get().(*[maxOptionSize]byte))[:]
 }
 
 func putOptions(options []byte) {
 	// Reslice to full capacity.
-	optionPool.Put(options[0:cap(options)])
+	optionPool.Put(optionsToArray(options))
 }
 
 func makeSynOptions(opts header.TCPSynOptions) []byte {
@@ -683,18 +706,33 @@ func makeSynOptions(opts header.TCPSynOptions) []byte {
 	return options[:offset]
 }
 
-func (e *endpoint) sendSynTCP(r *stack.Route, id stack.TransportEndpointID, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts header.TCPSynOptions) *tcpip.Error {
-	options := makeSynOptions(opts)
+// tcpFields is a struct to carry different parameters required by the
+// send*TCP variant functions below.
+type tcpFields struct {
+	id     stack.TransportEndpointID
+	ttl    uint8
+	tos    uint8
+	flags  byte
+	seq    seqnum.Value
+	ack    seqnum.Value
+	rcvWnd seqnum.Size
+	opts   []byte
+	txHash uint32
+}
+
+func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) *tcpip.Error {
+	tf.opts = makeSynOptions(opts)
 	// We ignore SYN send errors and let the callers re-attempt send.
-	if err := e.sendTCP(r, id, buffer.VectorisedView{}, ttl, tos, flags, seq, ack, rcvWnd, options, nil); err != nil {
+	if err := e.sendTCP(r, tf, buffer.VectorisedView{}, nil); err != nil {
 		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
 	}
-	putOptions(options)
+	putOptions(tf.opts)
 	return nil
 }
 
-func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	if err := sendTCP(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso); err != nil {
+func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+	tf.txHash = e.txHash
+	if err := sendTCP(r, tf, data, gso, e.owner); err != nil {
 		e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
 		return err
 	}
@@ -702,24 +740,23 @@ func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data bu
 	return nil
 }
 
-func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *tcpip.PacketBuffer, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
-	optLen := len(opts)
+func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) {
+	optLen := len(tf.opts)
 	hdr := &pkt.Header
-	packetSize := pkt.DataSize
-	off := pkt.DataOffset
+	packetSize := pkt.Data.Size()
 	// Initialize the header.
 	tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
 	pkt.TransportHeader = buffer.View(tcp)
 	tcp.Encode(&header.TCPFields{
-		SrcPort:    id.LocalPort,
-		DstPort:    id.RemotePort,
-		SeqNum:     uint32(seq),
-		AckNum:     uint32(ack),
+		SrcPort:    tf.id.LocalPort,
+		DstPort:    tf.id.RemotePort,
+		SeqNum:     uint32(tf.seq),
+		AckNum:     uint32(tf.ack),
 		DataOffset: uint8(header.TCPMinimumSize + optLen),
-		Flags:      flags,
-		WindowSize: uint16(rcvWnd),
+		Flags:      tf.flags,
+		WindowSize: uint16(tf.rcvWnd),
 	})
-	copy(tcp[header.TCPMinimumSize:], opts)
+	copy(tcp[header.TCPMinimumSize:], tf.opts)
 
 	length := uint16(hdr.UsedLength() + packetSize)
 	xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
@@ -731,48 +768,52 @@ func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, pkt *tcpip.Packet
 		// header and data and get the right sum of the TCP packet.
 		tcp.SetChecksum(xsum)
 	} else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 {
-		xsum = header.ChecksumVVWithOffset(pkt.Data, xsum, off, packetSize)
+		xsum = header.ChecksumVV(pkt.Data, xsum)
 		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
 	}
-
 }
 
-func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	optLen := len(opts)
-	if rcvWnd > 0xffff {
-		rcvWnd = 0xffff
+func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
+	// We need to shallow clone the VectorisedView here as ReadToView will
+	// split the VectorisedView and Trim underlying views as it splits. Not
+	// doing the clone here will cause the underlying views of data itself
+	// to be altered.
+	data = data.Clone(nil)
+
+	optLen := len(tf.opts)
+	if tf.rcvWnd > 0xffff {
+		tf.rcvWnd = 0xffff
 	}
 
 	mss := int(gso.MSS)
 	n := (data.Size() + mss - 1) / mss
 
-	// Allocate one big slice for all the headers.
-	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
-	buf := make([]byte, n*hdrSize)
-	pkts := make([]tcpip.PacketBuffer, n)
-	for i := range pkts {
-		pkts[i].Header = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize])
-	}
-
 	size := data.Size()
-	off := 0
+	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen
+	var pkts stack.PacketBufferList
 	for i := 0; i < n; i++ {
 		packetSize := mss
 		if packetSize > size {
 			packetSize = size
 		}
 		size -= packetSize
-		pkts[i].DataOffset = off
-		pkts[i].DataSize = packetSize
-		pkts[i].Data = data
-		buildTCPHdr(r, id, &pkts[i], flags, seq, ack, rcvWnd, opts, gso)
-		off += packetSize
-		seq = seq.Add(seqnum.Size(packetSize))
-	}
-	if ttl == 0 {
-		ttl = r.DefaultTTL()
-	}
-	sent, err := r.WritePackets(gso, pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos})
+		var pkt stack.PacketBuffer
+		pkt.Header = buffer.NewPrependable(hdrSize)
+		pkt.Hash = tf.txHash
+		pkt.Owner = owner
+		pkt.EgressRoute = r
+		pkt.GSOOptions = gso
+		pkt.NetworkProtocolNumber = r.NetworkProtocolNumber()
+		data.ReadToVV(&pkt.Data, packetSize)
+		buildTCPHdr(r, tf, &pkt, gso)
+		tf.seq = tf.seq.Add(seqnum.Size(packetSize))
+		pkts.PushBack(&pkt)
+	}
+
+	if tf.ttl == 0 {
+		tf.ttl = r.DefaultTTL()
+	}
+	sent, err := r.WritePackets(gso, pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos})
 	if err != nil {
 		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
 	}
@@ -782,33 +823,33 @@ func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.Vect
 
 // sendTCP sends a TCP segment with the provided options via the provided
 // network endpoint and under the provided identity.
-func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
-	optLen := len(opts)
-	if rcvWnd > 0xffff {
-		rcvWnd = 0xffff
+func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
+	optLen := len(tf.opts)
+	if tf.rcvWnd > 0xffff {
+		tf.rcvWnd = 0xffff
 	}
 
 	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
-		return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso)
+		return sendTCPBatch(r, tf, data, gso, owner)
 	}
 
-	pkt := tcpip.PacketBuffer{
-		Header:     buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
-		DataOffset: 0,
-		DataSize:   data.Size(),
-		Data:       data,
+	pkt := &stack.PacketBuffer{
+		Header: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
+		Data:   data,
+		Hash:   tf.txHash,
+		Owner:  owner,
 	}
-	buildTCPHdr(r, id, &pkt, flags, seq, ack, rcvWnd, opts, gso)
+	buildTCPHdr(r, tf, pkt, gso)
 
-	if ttl == 0 {
-		ttl = r.DefaultTTL()
+	if tf.ttl == 0 {
+		tf.ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, pkt); err != nil {
+	if err := r.WritePacket(gso, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil {
 		r.Stats().TCP.SegmentSendErrors.Increment()
 		return err
 	}
 	r.Stats().TCP.SegmentsSent.Increment()
-	if (flags & header.TCPFlagRst) != 0 {
+	if (tf.flags & header.TCPFlagRst) != 0 {
 		r.Stats().TCP.ResetsSent.Increment()
 	}
 	return nil
@@ -860,7 +901,16 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
-	err := e.sendTCP(&e.route, e.ID, data, e.ttl, e.sendTOS, flags, seq, ack, rcvWnd, options, e.gso)
+	err := e.sendTCP(&e.route, tcpFields{
+		id:     e.ID,
+		ttl:    e.ttl,
+		tos:    e.sendTOS,
+		flags:  flags,
+		seq:    seq,
+		ack:    ack,
+		rcvWnd: rcvWnd,
+		opts:   options,
+	}, data, e.gso)
 	putOptions(options)
 	return err
 }
@@ -875,7 +925,6 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 	first := e.sndQueue.Front()
 	if first != nil {
 		e.snd.writeList.PushBackList(&e.sndQueue)
-		e.snd.sndNxtList.UpdateForward(e.sndBufInQueue)
 		e.sndBufInQueue = 0
 	}
 
@@ -994,22 +1043,40 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route)
 	}
 	if ep == nil {
-		replyWithReset(s)
+		replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
 		s.decRef()
 		return
 	}
+
+	if e == ep {
+		panic("current endpoint not removed from demuxer, enqueing segments to itself")
+	}
+
 	if ep.(*endpoint).enqueueSegment(s) {
 		ep.(*endpoint).newSegmentWaker.Assert()
 	}
 }
 
+// Drain segment queue from the endpoint and try to re-match the segment to a
+// different endpoint. This is used when the current endpoint is transitioned to
+// StateClose and has been unregistered from the transport demuxer.
+func (e *endpoint) drainClosingSegmentQueue() {
+	for {
+		s := e.segmentQueue.dequeue()
+		if s == nil {
+			break
+		}
+
+		e.tryDeliverSegmentFromClosedEndpoint(s)
+	}
+}
+
 func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 	if e.rcv.acceptable(s.sequenceNumber, 0) {
 		// RFC 793, page 37 states that "in all states
 		// except SYN-SENT, all reset (RST) segments are
 		// validated by checking their SEQ-fields." So
 		// we only process it if it's acceptable.
-		e.mu.Lock()
 		switch e.EndpointState() {
 		// In case of a RST in CLOSE-WAIT linux moves
 		// the socket to closed state with an error set
@@ -1033,11 +1100,9 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		case StateCloseWait:
 			e.transitionToStateCloseLocked()
 			e.HardError = tcpip.ErrAborted
-			e.mu.Unlock()
 			e.notifyProtocolGoroutine(notifyTickleWorker)
 			return false, nil
 		default:
-			e.mu.Unlock()
 			// RFC 793, page 37 states that "in all states
 			// except SYN-SENT, all reset (RST) segments are
 			// validated by checking their SEQ-fields." So
@@ -1150,9 +1215,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 		// Now check if the received segment has caused us to transition
 		// to a CLOSED state, if yes then terminate processing and do
 		// not invoke the sender.
-		e.mu.RLock()
 		state := e.state
-		e.mu.RUnlock()
 		if state == StateClose {
 			// When we get into StateClose while processing from the queue,
 			// return immediately and let the protocolMainloop handle it.
@@ -1175,9 +1238,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
 func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
-	e.mu.RLock()
 	userTimeout := e.userTimeout
-	e.mu.RUnlock()
 
 	e.keepalive.Lock()
 	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
@@ -1241,6 +1302,7 @@ func (e *endpoint) disableKeepaliveTimer() {
 // goroutine and is responsible for sending segments and handling received
 // segments.
 func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error {
+	e.mu.Lock()
 	var closeTimer *time.Timer
 	var closeWaker sleep.Waker
 
@@ -1262,7 +1324,9 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 		}
 
 		e.mu.Unlock()
-		e.workMu.Unlock()
+
+		e.drainClosingSegmentQueue()
+
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1273,19 +1337,17 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 		// completion.
 		initialRcvWnd := e.initialReceiveWindow()
 		h := newHandshake(e, seqnum.Size(initialRcvWnd))
-		e.mu.Lock()
 		h.ep.setEndpointState(StateSynSent)
-		e.mu.Unlock()
 
 		if err := h.execute(); err != nil {
 			e.lastErrorMu.Lock()
 			e.lastError = err
 			e.lastErrorMu.Unlock()
 
-			e.mu.Lock()
 			e.setEndpointState(StateError)
 			e.HardError = err
 
+			e.workerCleanup = true
 			// Lock released below.
 			epilogue()
 			return err
@@ -1295,9 +1357,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	e.keepalive.timer.init(&e.keepalive.waker)
 	defer e.keepalive.timer.cleanup()
 
-	e.mu.Lock()
 	drained := e.drainDone != nil
-	e.mu.Unlock()
 	if drained {
 		close(e.drainDone)
 		<-e.undrain
@@ -1323,10 +1383,8 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 				// This means the socket is being closed due
 				// to the TCP-FIN-WAIT2 timeout was hit. Just
 				// mark the socket as closed.
-				e.mu.Lock()
 				e.transitionToStateCloseLocked()
 				e.workerCleanup = true
-				e.mu.Unlock()
 				return nil
 			},
 		},
@@ -1372,7 +1430,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 					e.snd.updateMaxPayloadSize(mtu, count)
 				}
 
-				if n&notifyReset != 0 {
+				if n&notifyReset != 0 || n&notifyAbort != 0 {
 					return tcpip.ErrConnectionAborted
 				}
 
@@ -1381,7 +1439,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 				}
 
 				if n&notifyClose != 0 && closeTimer == nil {
-					e.mu.Lock()
 					if e.EndpointState() == StateFinWait2 && e.closed {
 						// The socket has been closed and we are in FIN_WAIT2
 						// so start the FIN_WAIT2 timer.
@@ -1390,7 +1447,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 						})
 						e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 					}
-					e.mu.Unlock()
 				}
 
 				if n&notifyKeepaliveChanged != 0 {
@@ -1410,7 +1466,9 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 						// Only block the worker if the endpoint
 						// is not in closed state or error state.
 						close(e.drainDone)
+						e.mu.Unlock()
 						<-e.undrain
+						e.mu.Lock()
 					}
 				}
 
@@ -1453,7 +1511,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	}
 	e.rcvListMu.Unlock()
 
-	e.mu.Lock()
 	if e.workerCleanup {
 		e.notifyProtocolGoroutine(notifyClose)
 	}
@@ -1461,7 +1518,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
 	cleanupOnError := func(err *tcpip.Error) {
-		e.mu.Lock()
 		e.workerCleanup = true
 		if err != nil {
 			e.resetConnectionLocked(err)
@@ -1473,16 +1529,11 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 loop:
 	for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError {
 		e.mu.Unlock()
-		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
-		e.workMu.Lock()
+		e.mu.Lock()
 
-		// We need to double check here because the notification maybe
+		// We need to double check here because the notification may be
 		// stale by the time we got around to processing it.
-		//
-		// NOTE: since we now hold the workMu the processors cannot
-		// change the state of the endpoint so it's safe to proceed
-		// after this check.
 		switch e.EndpointState() {
 		case StateError:
 			// If the endpoint has already transitioned to an ERROR
@@ -1495,21 +1546,17 @@ loop:
 		case StateTimeWait:
 			fallthrough
 		case StateClose:
-			e.mu.Lock()
 			break loop
 		default:
 			if err := funcs[v].f(); err != nil {
 				cleanupOnError(err)
 				return nil
 			}
-			e.mu.Lock()
 		}
 	}
 
-	state := e.EndpointState()
-	e.mu.Unlock()
 	var reuseTW func()
-	if state == StateTimeWait {
+	if e.EndpointState() == StateTimeWait {
 		// Disable close timer as we now entering real TIME_WAIT.
 		if closeTimer != nil {
 			closeTimer.Stop()
@@ -1519,14 +1566,11 @@ loop:
 		s.Done()
 		// Wake up any waiters before we enter TIME_WAIT.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
-		e.mu.Lock()
 		e.workerCleanup = true
-		e.mu.Unlock()
 		reuseTW = e.doTimeWait()
 	}
 
 	// Mark endpoint as closed.
-	e.mu.Lock()
 	if e.EndpointState() != StateError {
 		e.transitionToStateCloseLocked()
 	}
@@ -1534,19 +1578,6 @@ loop:
 	// Lock released below.
 	epilogue()
 
-	// epilogue removes the endpoint from the transport-demuxer and
-	// unlocks e.mu. Now that no new segments can get enqueued to this
-	// endpoint, try to re-match the segment to a different endpoint
-	// as the current endpoint is closed.
-	for {
-		s := e.segmentQueue.dequeue()
-		if s == nil {
-			break
-		}
-
-		e.tryDeliverSegmentFromClosedEndpoint(s)
-	}
-
 	// A new SYN was received during TIME_WAIT and we need to abort
 	// the timewait and redirect the segment to the listener queue
 	if reuseTW != nil {
@@ -1632,6 +1663,7 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 	const timeWaitDone = 3
 
 	s := sleep.Sleeper{}
+	defer s.Done()
 	s.AddWaker(&e.newSegmentWaker, newSegment)
 	s.AddWaker(&e.notificationWaker, notification)
 
@@ -1641,9 +1673,9 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 	defer timeWaitTimer.Stop()
 
 	for {
-		e.workMu.Unlock()
+		e.mu.Unlock()
 		v, _ := s.Fetch(true)
-		e.workMu.Lock()
+		e.mu.Lock()
 		switch v {
 		case newSegment:
 			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
@@ -1655,7 +1687,7 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 			}
 		case notification:
 			n := e.fetchNotifications()
-			if n&notifyClose != 0 {
+			if n&notifyClose != 0 || n&notifyAbort != 0 {
 				return nil
 			}
 			if n&notifyDrain != 0 {
@@ -1666,7 +1698,9 @@ func (e *endpoint) doTimeWait() (twReuse func()) {
 					e.handleTimeWaitSegments()
 				}
 				close(e.drainDone)
+				e.mu.Unlock()
 				<-e.undrain
+				e.mu.Lock()
 				return nil
 			}
 		case timeWaitDone:
diff --git a/test/root/testdata/busybox.go b/pkg/tcpip/transport/tcp/connect_unsafe.go
index e4dbd2843..cfc304616 100644
--- a/test/root/testdata/busybox.go
+++ b/pkg/tcpip/transport/tcp/connect_unsafe.go
@@ -12,21 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package testdata
+package tcp
 
-// MountOverSymlink is a JSON config for a container that /etc/resolv.conf is a
-// symlink to /tmp/resolv.conf.
-var MountOverSymlink = `
-{
-        "metadata": {
-                "name": "busybox"
-        },
-        "image": {
-                "image": "k8s.gcr.io/busybox"
-        },
-        "command": [
-                "sleep",
-                "1000"
-        ]
+import (
+	"reflect"
+	"unsafe"
+)
+
+// optionsToArray converts a slice of capacity >-= maxOptionSize to an array.
+//
+// optionsToArray panics if the capacity of options is smaller than
+// maxOptionSize.
+func optionsToArray(options []byte) *[maxOptionSize]byte {
+	// Reslice to full capacity.
+	options = options[0:maxOptionSize]
+	return (*[maxOptionSize]byte)(unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&options)).Data))
 }
-`
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index e18012ac0..047704c80 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -18,7 +18,6 @@ import (
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -68,17 +67,28 @@ func (q *epQueue) empty() bool {
 type processor struct {
 	epQ              epQueue
 	newEndpointWaker sleep.Waker
+	closeWaker       sleep.Waker
 	id               int
+	wg               sync.WaitGroup
 }
 
 func newProcessor(id int) *processor {
 	p := &processor{
 		id: id,
 	}
+	p.wg.Add(1)
 	go p.handleSegments()
 	return p
 }
 
+func (p *processor) close() {
+	p.closeWaker.Assert()
+}
+
+func (p *processor) wait() {
+	p.wg.Wait()
+}
+
 func (p *processor) queueEndpoint(ep *endpoint) {
 	// Queue an endpoint for processing by the processor goroutine.
 	p.epQ.enqueue(ep)
@@ -87,11 +97,17 @@ func (p *processor) queueEndpoint(ep *endpoint) {
 
 func (p *processor) handleSegments() {
 	const newEndpointWaker = 1
+	const closeWaker = 2
 	s := sleep.Sleeper{}
 	s.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+	s.AddWaker(&p.closeWaker, closeWaker)
 	defer s.Done()
 	for {
-		s.Fetch(true)
+		id, ok := s.Fetch(true)
+		if ok && id == closeWaker {
+			p.wg.Done()
+			return
+		}
 		for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() {
 			if ep.segmentQueue.empty() {
 				continue
@@ -111,7 +127,7 @@ func (p *processor) handleSegments() {
 				continue
 			}
 
-			if !ep.workMu.TryLock() {
+			if !ep.mu.TryLock() {
 				ep.newSegmentWaker.Assert()
 				continue
 			}
@@ -121,12 +137,10 @@ func (p *processor) handleSegments() {
 			if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
 				// Send any active resets if required.
 				if err != nil {
-					ep.mu.Lock()
 					ep.resetConnectionLocked(err)
-					ep.mu.Unlock()
 				}
 				ep.notifyProtocolGoroutine(notifyTickleWorker)
-				ep.workMu.Unlock()
+				ep.mu.Unlock()
 				continue
 			}
 
@@ -134,7 +148,7 @@ func (p *processor) handleSegments() {
 				p.epQ.enqueue(ep)
 			}
 
-			ep.workMu.Unlock()
+			ep.mu.Unlock()
 		}
 	}
 }
@@ -160,7 +174,19 @@ func newDispatcher(nProcessors int) *dispatcher {
 	}
 }
 
-func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (d *dispatcher) close() {
+	for _, p := range d.processors {
+		p.close()
+	}
+}
+
+func (d *dispatcher) wait() {
+	for _, p := range d.processors {
+		p.wait()
+	}
+}
+
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	ep := stackEP.(*endpoint)
 	s := newSegment(r, id, pkt)
 	if !s.parse() {
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index 4f361b226..804e95aea 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -568,11 +568,10 @@ func TestV4AcceptOnV4(t *testing.T) {
 func testV4ListenClose(t *testing.T, c *context.Context) {
 	// Set the SynRcvd threshold to zero to force a syn cookie based accept
 	// to happen.
-	saved := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = saved
-	}()
-	tcp.SynRcvdCountThreshold = 0
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption failed: %s", err)
+	}
+
 	const n = uint16(32)
 
 	// Start listening.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index f2be0e651..19f7bf449 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -18,6 +18,7 @@ import (
 	"encoding/binary"
 	"fmt"
 	"math"
+	"runtime"
 	"strings"
 	"sync/atomic"
 	"time"
@@ -29,11 +30,9 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
-	"gvisor.dev/gvisor/pkg/tmutex"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -64,7 +63,8 @@ const (
 	StateClosing
 )
 
-// connected is the set of states where an endpoint is connected to a peer.
+// connected returns true when s is one of the states representing an
+// endpoint connected to a peer.
 func (s EndpointState) connected() bool {
 	switch s {
 	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
@@ -74,6 +74,40 @@ func (s EndpointState) connected() bool {
 	}
 }
 
+// connecting returns true when s is one of the states representing a
+// connection in progress, but not yet fully established.
+func (s EndpointState) connecting() bool {
+	switch s {
+	case StateConnecting, StateSynSent, StateSynRecv:
+		return true
+	default:
+		return false
+	}
+}
+
+// handshake returns true when s is one of the states representing an endpoint
+// in the middle of a TCP handshake.
+func (s EndpointState) handshake() bool {
+	switch s {
+	case StateSynSent, StateSynRecv:
+		return true
+	default:
+		return false
+	}
+}
+
+// closed returns true when s is one of the states an endpoint transitions to
+// when closed or when it encounters an error. This is distinct from a newly
+// initialized endpoint that was never connected.
+func (s EndpointState) closed() bool {
+	switch s {
+	case StateClose, StateError:
+		return true
+	default:
+		return false
+	}
+}
+
 // String implements fmt.Stringer.String.
 func (s EndpointState) String() string {
 	switch s {
@@ -121,6 +155,8 @@ const (
 	notifyDrain
 	notifyReset
 	notifyResetByPeer
+	// notifyAbort is a request for an expedited teardown.
+	notifyAbort
 	notifyKeepaliveChanged
 	notifyMSSChanged
 	// notifyTickleWorker is used to tickle the protocol main loop during a
@@ -281,6 +317,38 @@ func (*EndpointInfo) IsEndpointInfo() {}
 // synchronized. The protocol implementation, however, runs in a single
 // goroutine.
 //
+// Each endpoint has a few mutexes:
+//
+// e.mu -> Primary mutex for an endpoint must be held for all operations except
+// in e.Readiness where acquiring it will result in a deadlock in epoll
+// implementation.
+//
+// The following three mutexes can be acquired independent of e.mu but if
+// acquired with e.mu then e.mu must be acquired first.
+//
+// e.acceptMu -> protects acceptedChan.
+// e.rcvListMu -> Protects the rcvList and associated fields.
+// e.sndBufMu -> Protects the sndQueue and associated fields.
+// e.lastErrorMu -> Protects the lastError field.
+//
+// LOCKING/UNLOCKING of the endpoint.  The locking of an endpoint is different
+// based on the context in which the lock is acquired. In the syscall context
+// e.LockUser/e.UnlockUser should be used and when doing background processing
+// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
+// in brief.
+//
+// The reason for this locking behaviour is to avoid wakeups to handle packets.
+// In cases where the endpoint is already locked the background processor can
+// queue the packet up and go its merry way and the lock owner will eventually
+// process the backlog when releasing the lock. Similarly when acquiring the
+// lock from say a syscall goroutine we can implement a bit of spinning if we
+// know that the lock is not held by another syscall goroutine. Background
+// processors should never hold the lock for long and we can avoid an expensive
+// sleep/wakeup by spinning for a shortwhile.
+//
+// For more details please see the detailed documentation on
+// e.LockUser/e.UnlockUser methods.
+//
 // +stateify savable
 type endpoint struct {
 	EndpointInfo
@@ -297,12 +365,6 @@ type endpoint struct {
 	// Precondition: epQueue.mu must be held to read/write this field..
 	pendingProcessing bool `state:"nosave"`
 
-	// workMu is used to arbitrate which goroutine may perform protocol
-	// work. Only the main protocol goroutine is expected to call Lock() on
-	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
-	// perform work without having to wait for the main one to wake up.
-	workMu tmutex.Mutex `state:"nosave"`
-
 	// The following fields are initialized at creation time and do not
 	// change throughout the lifetime of the endpoint.
 	stack       *stack.Stack  `state:"manual"`
@@ -328,15 +390,11 @@ type endpoint struct {
 	rcvBufSize    int
 	rcvBufUsed    int
 	rcvAutoParams rcvBufAutoTuneParams
-	// zeroWindow indicates that the window was closed due to receive buffer
-	// space being filled up. This is set by the worker goroutine before
-	// moving a segment to the rcvList. This setting is cleared by the
-	// endpoint when a Read() call reads enough data for the new window to
-	// be non-zero.
-	zeroWindow bool
 
-	// The following fields are protected by the mutex.
-	mu sync.RWMutex `state:"nosave"`
+	// mu protects all endpoint fields unless documented otherwise. mu must
+	// be acquired before interacting with the endpoint fields.
+	mu          sync.Mutex `state:"nosave"`
+	ownedByUser uint32
 
 	// state must be read/set using the EndpointState()/setEndpointState() methods.
 	state EndpointState `state:".(EndpointState)"`
@@ -447,6 +505,17 @@ type endpoint struct {
 	// for this endpoint using the TCP_MAXSEG setsockopt.
 	userMSS uint16
 
+	// maxSynRetries is the maximum number of SYN retransmits that TCP should
+	// send before aborting the attempt to connect. It cannot exceed 255.
+	//
+	// NOTE: This is currently a no-op and does not change the SYN
+	// retransmissions.
+	maxSynRetries uint8
+
+	// windowClamp is used to bound the size of the advertised window to
+	// this value.
+	windowClamp uint32
+
 	// The following fields are used to manage the send buffer. When
 	// segments are ready to be sent, they are added to sndQueue and the
 	// protocol goroutine is signaled via sndWaker.
@@ -511,6 +580,23 @@ type endpoint struct {
 	// to the acceptedChan below terminate before we close acceptedChan.
 	pendingAccepted sync.WaitGroup `state:"nosave"`
 
+	// acceptMu protects acceptedChan.
+	acceptMu sync.Mutex `state:"nosave"`
+
+	// acceptCond is a condition variable that can be used to block on when
+	// acceptedChan is full and an endpoint is ready to be delivered.
+	//
+	// This condition variable is required because just blocking on sending
+	// to acceptedChan does not work in cases where endpoint.Listen is
+	// called twice with different backlog values. In such cases the channel
+	// is closed and a new one created. Any pending goroutines blocking on
+	// the write to the channel will panic.
+	//
+	// We use this condition variable to block/unblock goroutines which
+	// tried to deliver an endpoint but couldn't because accept backlog was
+	// full ( See: endpoint.deliverAccepted ).
+	acceptCond *sync.Cond `state:"nosave"`
+
 	// acceptedChan is used by a listening endpoint protocol goroutine to
 	// send newly accepted connections to the endpoint so that they can be
 	// read by Accept() calls.
@@ -559,6 +645,13 @@ type endpoint struct {
 	// endpoint and at this point the endpoint is only around
 	// to complete the TCP shutdown.
 	closed bool
+
+	// txHash is the transport layer hash to be set on outbound packets
+	// emitted by this endpoint.
+	txHash uint32
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -581,14 +674,93 @@ func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
 	return maxMSS
 }
 
+// LockUser tries to lock e.mu and if it fails it will check if the lock is held
+// by another syscall goroutine. If yes, then it will goto sleep waiting for the
+// lock to be released, if not then it will spin till it acquires the lock or
+// another syscall goroutine acquires it in which case it will goto sleep as
+// described above.
+//
+// The assumption behind spinning here being that background packet processing
+// should not be holding the lock for long and spinning reduces latency as we
+// avoid an expensive sleep/wakeup of of the syscall goroutine).
+func (e *endpoint) LockUser() {
+	for {
+		// Try first if the sock is locked then check if it's owned
+		// by another user goroutine if not then we spin, otherwise
+		// we just goto sleep on the Lock() and wait.
+		if !e.mu.TryLock() {
+			// If socket is owned by the user then just goto sleep
+			// as the lock could be held for a reasonably long time.
+			if atomic.LoadUint32(&e.ownedByUser) == 1 {
+				e.mu.Lock()
+				atomic.StoreUint32(&e.ownedByUser, 1)
+				return
+			}
+			// Spin but yield the processor since the lower half
+			// should yield the lock soon.
+			runtime.Gosched()
+			continue
+		}
+		atomic.StoreUint32(&e.ownedByUser, 1)
+		return
+	}
+}
+
+// UnlockUser will check if there are any segments already queued for processing
+// and process any such segments before unlocking e.mu. This is required because
+// we when packets arrive and endpoint lock is already held then such packets
+// are queued up to be processed. If the lock is held by the endpoint goroutine
+// then it will process these packets but if the lock is instead held by the
+// syscall goroutine then we can have the syscall goroutine process the backlog
+// before unlocking.
+//
+// This avoids an unnecessary wakeup of the endpoint protocol goroutine for the
+// endpoint. It's also required eventually when we get rid of the endpoint
+// protocol goroutine altogether.
+//
+// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
+func (e *endpoint) UnlockUser() {
+	// Lock segment queue before checking so that we avoid a race where
+	// segments can be queued between the time we check if queue is empty
+	// and actually unlock the endpoint mutex.
+	for {
+		e.segmentQueue.mu.Lock()
+		if e.segmentQueue.emptyLocked() {
+			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
+				panic("e.UnlockUser() called without calling e.LockUser()")
+			}
+			e.mu.Unlock()
+			e.segmentQueue.mu.Unlock()
+			return
+		}
+		e.segmentQueue.mu.Unlock()
+
+		switch e.EndpointState() {
+		case StateEstablished:
+			if err := e.handleSegments(true /* fastPath */); err != nil {
+				e.notifyProtocolGoroutine(notifyTickleWorker)
+			}
+		default:
+			// Since we are waking the endpoint goroutine here just unlock
+			// and let it process the queued segments.
+			e.newSegmentWaker.Assert()
+			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
+				panic("e.UnlockUser() called without calling e.LockUser()")
+			}
+			e.mu.Unlock()
+			return
+		}
+	}
+}
+
 // StopWork halts packet processing. Only to be used in tests.
 func (e *endpoint) StopWork() {
-	e.workMu.Lock()
+	e.mu.Lock()
 }
 
 // ResumeWork resumes packet processing. Only to be used in tests.
 func (e *endpoint) ResumeWork() {
-	e.workMu.Unlock()
+	e.mu.Unlock()
 }
 
 // setEndpointState updates the state of the endpoint to state atomically. This
@@ -669,7 +841,10 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 			interval: 75 * time.Second,
 			count:    9,
 		},
-		uniqueID: s.UniqueID(),
+		uniqueID:      s.UniqueID(),
+		txHash:        s.Rand().Uint32(),
+		windowClamp:   DefaultReceiveBufferSize,
+		maxSynRetries: DefaultSynRetries,
 	}
 
 	var ss SendBufferSizeOption
@@ -694,7 +869,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 
 	var de DelayEnabled
 	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
-		e.SetSockOptInt(tcpip.DelayOption, 1)
+		e.SetSockOptBool(tcpip.DelayOption, true)
 	}
 
 	var tcpLT tcpip.TCPLingerTimeoutOption
@@ -702,14 +877,18 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		e.tcpLingerTimeout = time.Duration(tcpLT)
 	}
 
+	var synRetries tcpip.TCPSynRetriesOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
+		e.maxSynRetries = uint8(synRetries)
+	}
+
 	if p := s.GetTCPProbe(); p != nil {
 		e.probe = p
 	}
 
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
-	e.workMu.Init()
-	e.workMu.Lock()
 	e.tsOffset = timeStampOffset()
+	e.acceptCond = sync.NewCond(&e.acceptMu)
 
 	return e
 }
@@ -719,9 +898,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	result := waiter.EventMask(0)
 
-	e.mu.RLock()
-	defer e.mu.RUnlock()
-
 	switch e.EndpointState() {
 	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
 		// Ready for nothing.
@@ -733,9 +909,11 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	case StateListen:
 		// Check if there's anything in the accepted channel.
 		if (mask & waiter.EventIn) != 0 {
+			e.acceptMu.Lock()
 			if len(e.acceptedChan) > 0 {
 				result |= waiter.EventIn
 			}
+			e.acceptMu.Unlock()
 		}
 	}
 	if e.EndpointState().connected() {
@@ -785,29 +963,58 @@ func (e *endpoint) notifyProtocolGoroutine(n uint32) {
 	}
 }
 
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	// The abort notification is not processed synchronously, so no
+	// synchronization is needed.
+	//
+	// If the endpoint becomes connected after this check, we still close
+	// the endpoint. This worst case results in a slower abort.
+	//
+	// If the endpoint disconnected after the check, nothing needs to be
+	// done, so sending a notification which will potentially be ignored is
+	// fine.
+	//
+	// If the endpoint connecting finishes after the check, the endpoint
+	// is either in a connected state (where we would notifyAbort anyway),
+	// SYN-RECV (where we would also notifyAbort anyway), or in an error
+	// state where nothing is required and the notification can be safely
+	// ignored.
+	//
+	// Endpoints where a Close during connecting or SYN-RECV state would be
+	// problematic are set to state connecting before being registered (and
+	// thus possible to be Aborted). They are never available in initial
+	// state.
+	//
+	// Endpoints transitioning from initial to connecting state may be
+	// safely either closed or sent notifyAbort.
+	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
+		e.notifyProtocolGoroutine(notifyAbort)
+		return
+	}
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources associated
 // with it. It must be called only once and with no other concurrent calls to
 // the endpoint.
 func (e *endpoint) Close() {
-	e.mu.Lock()
-	closed := e.closed
-	e.mu.Unlock()
-	if closed {
+	e.LockUser()
+	defer e.UnlockUser()
+	if e.closed {
 		return
 	}
 
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
-	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
-	e.closeNoShutdown()
+	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+	e.closeNoShutdownLocked()
 }
 
 // closeNoShutdown closes the endpoint without doing a full shutdown. This is
 // used when a connection needs to be aborted with a RST and we want to skip
 // a full 4 way TCP shutdown.
-func (e *endpoint) closeNoShutdown() {
-	e.mu.Lock()
-
+func (e *endpoint) closeNoShutdownLocked() {
 	// For listening sockets, we always release ports inline so that they
 	// are immediately available for reuse after Close() is called. If also
 	// registered, we unregister as well otherwise the next user would fail
@@ -826,57 +1033,55 @@ func (e *endpoint) closeNoShutdown() {
 
 	// Mark endpoint as closed.
 	e.closed = true
+
+	switch e.EndpointState() {
+	case StateClose, StateError:
+		return
+	}
+
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
-	tcpip.AddDanglingEndpoint(e)
-	if !e.workerRunning {
-		e.cleanupLocked()
-	} else {
+	if e.workerRunning {
 		e.workerCleanup = true
+		tcpip.AddDanglingEndpoint(e)
+		// Worker will remove the dangling endpoint when the endpoint
+		// goroutine terminates.
 		e.notifyProtocolGoroutine(notifyClose)
+	} else {
+		e.transitionToStateCloseLocked()
 	}
-
-	e.mu.Unlock()
 }
 
 // closePendingAcceptableConnections closes all connections that have completed
 // handshake but not yet been delivered to the application.
 func (e *endpoint) closePendingAcceptableConnectionsLocked() {
-	done := make(chan struct{})
-	// Spin a goroutine up as ranging on e.acceptedChan will just block when
-	// there are no more connections in the channel. Using a non-blocking
-	// select does not work as it can potentially select the default case
-	// even when there are pending writes but that are not yet written to
-	// the channel.
-	go func() {
-		defer close(done)
-		for n := range e.acceptedChan {
-			n.notifyProtocolGoroutine(notifyReset)
-			// close all connections that have completed but
-			// not accepted by the application.
-			n.Close()
-		}
-	}()
-	// pendingAccepted(see endpoint.deliverAccepted) tracks the number of
-	// endpoints which have completed handshake but are not yet written to
-	// the e.acceptedChan. We wait here till the goroutine above can drain
-	// all such connections from e.acceptedChan.
-	e.pendingAccepted.Wait()
+	e.acceptMu.Lock()
+	if e.acceptedChan == nil {
+		e.acceptMu.Unlock()
+		return
+	}
 	close(e.acceptedChan)
-	<-done
+	ch := e.acceptedChan
 	e.acceptedChan = nil
+	e.acceptCond.Broadcast()
+	e.acceptMu.Unlock()
+
+	// Reset all connections that are waiting to be accepted.
+	for n := range ch {
+		n.notifyProtocolGoroutine(notifyReset)
+	}
+	// Wait for reset of all endpoints that are still waiting to be delivered to
+	// the now closed acceptedChan.
+	e.pendingAccepted.Wait()
 }
 
 // cleanupLocked frees all resources associated with the endpoint. It is called
 // after Close() is called and the worker goroutine (if any) is done with its
 // work.
 func (e *endpoint) cleanupLocked() {
-
 	// Close all endpoints that might have been accepted by TCP but not by
 	// the client.
-	if e.acceptedChan != nil {
-		e.closePendingAcceptableConnectionsLocked()
-	}
+	e.closePendingAcceptableConnectionsLocked()
 
 	e.workerCleanup = false
 
@@ -910,12 +1115,28 @@ func (e *endpoint) initialReceiveWindow() int {
 	if rcvWnd > routeWnd {
 		rcvWnd = routeWnd
 	}
+	rcvWndScale := e.rcvWndScaleForHandshake()
+
+	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
+	// window offered in SYN won't be reduced due to the loss of precision if
+	// window scaling is enabled after the handshake.
+	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
+
+	// Ensure we can always accept at least 1 byte if the scale specified
+	// was too high for the provided rcvWnd.
+	if rcvWnd == 0 {
+		rcvWnd = 1
+	}
+
 	return rcvWnd
 }
 
 // ModerateRecvBuf adjusts the receive buffer and the advertised window
-// based on the number of bytes copied to user space.
+// based on the number of bytes copied to userspace.
 func (e *endpoint) ModerateRecvBuf(copied int) {
+	e.LockUser()
+	defer e.UnlockUser()
+
 	e.rcvListMu.Lock()
 	if e.rcvAutoParams.disabled {
 		e.rcvListMu.Unlock()
@@ -965,7 +1186,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 			e.rcvBufSize = rcvWnd
 			availAfter := e.receiveBufferAvailableLocked()
 			mask := uint32(notifyReceiveWindowChanged)
-			if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 				mask |= notifyNonZeroReceiveWindow
 			}
 			e.notifyProtocolGoroutine(mask)
@@ -982,14 +1203,13 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvListMu.Unlock()
 }
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
-	return e.stack.IPTables(), nil
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner
 }
 
 // Read reads data from the endpoint.
 func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
-	e.mu.RLock()
+	e.LockUser()
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data. Also note that a RST being received
 	// would cause the state to become StateError so we should allow the
@@ -999,7 +1219,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
 		e.rcvListMu.Unlock()
 		he := e.HardError
-		e.mu.RUnlock()
+		e.UnlockUser()
 		if s == StateError {
 			return buffer.View{}, tcpip.ControlMessages{}, he
 		}
@@ -1009,8 +1229,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 	v, err := e.readLocked()
 	e.rcvListMu.Unlock()
-
-	e.mu.RUnlock()
+	e.UnlockUser()
 
 	if err == tcpip.ErrClosedForReceive {
 		e.stats.ReadErrors.ReadClosed.Increment()
@@ -1042,7 +1261,7 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	// enough buffer space, to either fit an aMSS or half a receive buffer
 	// (whichever smaller), then notify the protocol goroutine to send a
 	// window update.
-	if crossed, above := e.windowCrossedACKThreshold(len(v)); crossed && above {
+	if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above {
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
 
@@ -1083,13 +1302,13 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
 	// and opts.EndOfRecord are also ignored.
 
-	e.mu.RLock()
+	e.LockUser()
 	e.sndBufMu.Lock()
 
 	avail, err := e.isEndpointWritableLocked()
 	if err != nil {
 		e.sndBufMu.Unlock()
-		e.mu.RUnlock()
+		e.UnlockUser()
 		e.stats.WriteErrors.WriteClosed.Increment()
 		return 0, nil, err
 	}
@@ -1101,113 +1320,68 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	// are copying data in.
 	if !opts.Atomic {
 		e.sndBufMu.Unlock()
-		e.mu.RUnlock()
+		e.UnlockUser()
 	}
 
 	// Fetch data.
 	v, perr := p.Payload(avail)
 	if perr != nil || len(v) == 0 {
-		if opts.Atomic { // See above.
+		// Note that perr may be nil if len(v) == 0.
+		if opts.Atomic {
 			e.sndBufMu.Unlock()
-			e.mu.RUnlock()
+			e.UnlockUser()
 		}
-		// Note that perr may be nil if len(v) == 0.
 		return 0, nil, perr
 	}
 
-	if opts.Atomic {
+	queueAndSend := func() (int64, <-chan struct{}, *tcpip.Error) {
 		// Add data to the send queue.
 		s := newSegmentFromView(&e.route, e.ID, v)
 		e.sndBufUsed += len(v)
 		e.sndBufInQueue += seqnum.Size(len(v))
 		e.sndQueue.PushBack(s)
 		e.sndBufMu.Unlock()
-		// Release the endpoint lock to prevent deadlocks due to lock
-		// order inversion when acquiring workMu.
-		e.mu.RUnlock()
-	}
-
-	if e.workMu.TryLock() {
-		// Since we released locks in between it's possible that the
-		// endpoint transitioned to a CLOSED/ERROR states so make
-		// sure endpoint is still writable before trying to write.
-		if !opts.Atomic { // See above.
-			e.mu.RLock()
-			e.sndBufMu.Lock()
 
-			// Because we released the lock before copying, check state again
-			// to make sure the endpoint is still in a valid state for a write.
-			avail, err = e.isEndpointWritableLocked()
-			if err != nil {
-				e.sndBufMu.Unlock()
-				e.mu.RUnlock()
-				e.stats.WriteErrors.WriteClosed.Increment()
-				return 0, nil, err
-			}
-
-			// Discard any excess data copied in due to avail being reduced due
-			// to a simultaneous write call to the socket.
-			if avail < len(v) {
-				v = v[:avail]
-			}
-			// Add data to the send queue.
-			s := newSegmentFromView(&e.route, e.ID, v)
-			e.sndBufUsed += len(v)
-			e.sndBufInQueue += seqnum.Size(len(v))
-			e.sndQueue.PushBack(s)
-			e.sndBufMu.Unlock()
-			// Release the endpoint lock to prevent deadlocks due to lock
-			// order inversion when acquiring workMu.
-			e.mu.RUnlock()
-
-		}
 		// Do the work inline.
 		e.handleWrite()
-		e.workMu.Unlock()
-	} else {
-		if !opts.Atomic { // See above.
-			e.mu.RLock()
-			e.sndBufMu.Lock()
+		e.UnlockUser()
+		return int64(len(v)), nil, nil
+	}
 
-			// Because we released the lock before copying, check state again
-			// to make sure the endpoint is still in a valid state for a write.
-			avail, err = e.isEndpointWritableLocked()
-			if err != nil {
-				e.sndBufMu.Unlock()
-				e.mu.RUnlock()
-				e.stats.WriteErrors.WriteClosed.Increment()
-				return 0, nil, err
-			}
+	if opts.Atomic {
+		// Locks released in queueAndSend()
+		return queueAndSend()
+	}
 
-			// Discard any excess data copied in due to avail being reduced due
-			// to a simultaneous write call to the socket.
-			if avail < len(v) {
-				v = v[:avail]
-			}
-			// Add data to the send queue.
-			s := newSegmentFromView(&e.route, e.ID, v)
-			e.sndBufUsed += len(v)
-			e.sndBufInQueue += seqnum.Size(len(v))
-			e.sndQueue.PushBack(s)
-			e.sndBufMu.Unlock()
-			// Release the endpoint lock to prevent deadlocks due to lock
-			// order inversion when acquiring workMu.
-			e.mu.RUnlock()
+	// Since we released locks in between it's possible that the
+	// endpoint transitioned to a CLOSED/ERROR states so make
+	// sure endpoint is still writable before trying to write.
+	e.LockUser()
+	e.sndBufMu.Lock()
+	avail, err = e.isEndpointWritableLocked()
+	if err != nil {
+		e.sndBufMu.Unlock()
+		e.UnlockUser()
+		e.stats.WriteErrors.WriteClosed.Increment()
+		return 0, nil, err
+	}
 
-		}
-		// Let the protocol goroutine do the work.
-		e.sndWaker.Assert()
+	// Discard any excess data copied in due to avail being reduced due
+	// to a simultaneous write call to the socket.
+	if avail < len(v) {
+		v = v[:avail]
 	}
 
-	return int64(len(v)), nil, nil
+	// Locks released in queueAndSend()
+	return queueAndSend()
 }
 
 // Peek reads data without consuming it from the endpoint.
 //
 // This method does not block if there is no data pending.
 func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data.
@@ -1260,9 +1434,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	return num, tcpip.ControlMessages{}, nil
 }
 
-// windowCrossedACKThreshold checks if the receive window to be announced now
-// would be under aMSS or under half receive buffer, whichever smaller. This is
-// useful as a receive side silly window syndrome prevention mechanism. If
+// windowCrossedACKThresholdLocked checks if the receive window to be announced
+// now would be under aMSS or under half receive buffer, whichever smaller. This
+// is useful as a receive side silly window syndrome prevention mechanism. If
 // window grows to reasonable value, we should send ACK to the sender to inform
 // the rx space is now large. We also want ensure a series of small read()'s
 // won't trigger a flood of spurious tiny ACK's.
@@ -1273,7 +1447,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 // crossed will be true if the window size crossed the ACK threshold.
 // above will be true if the new window is >= ACK threshold and false
 // otherwise.
-func (e *endpoint) windowCrossedACKThreshold(deltaBefore int) (crossed bool, above bool) {
+//
+// Precondition: e.mu and e.rcvListMu must be held.
+func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
 	newAvail := e.receiveBufferAvailableLocked()
 	oldAvail := newAvail - deltaBefore
 	if oldAvail < 0 {
@@ -1297,21 +1473,71 @@ func (e *endpoint) windowCrossedACKThreshold(deltaBefore int) (crossed bool, abo
 // SetSockOptBool sets a socket option.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	switch opt {
+
+	case tcpip.BroadcastOption:
+		e.LockUser()
+		e.broadcast = v
+		e.UnlockUser()
+
+	case tcpip.CorkOption:
+		e.LockUser()
+		if !v {
+			atomic.StoreUint32(&e.cork, 0)
+
+			// Handle the corked data.
+			e.sndWaker.Assert()
+		} else {
+			atomic.StoreUint32(&e.cork, 1)
+		}
+		e.UnlockUser()
+
+	case tcpip.DelayOption:
+		if v {
+			atomic.StoreUint32(&e.delay, 1)
+		} else {
+			atomic.StoreUint32(&e.delay, 0)
+
+			// Handle delayed data.
+			e.sndWaker.Assert()
+		}
+
+	case tcpip.KeepaliveEnabledOption:
+		e.keepalive.Lock()
+		e.keepalive.enabled = v
+		e.keepalive.Unlock()
+		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+
+	case tcpip.QuickAckOption:
+		o := uint32(1)
+		if v {
+			o = 0
+		}
+		atomic.StoreUint32(&e.slowAck, o)
+
+	case tcpip.ReuseAddressOption:
+		e.LockUser()
+		e.reuseAddr = v
+		e.UnlockUser()
+
+	case tcpip.ReusePortOption:
+		e.LockUser()
+		e.reusePort = v
+		e.UnlockUser()
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
 			return tcpip.ErrInvalidEndpointState
 		}
 
-		e.mu.Lock()
-		defer e.mu.Unlock()
-
 		// We only allow this to be set when we're in the initial state.
 		if e.EndpointState() != StateInitial {
 			return tcpip.ErrInvalidEndpointState
 		}
 
+		e.LockUser()
 		e.v6only = v
+		e.UnlockUser()
 	}
 
 	return nil
@@ -1319,23 +1545,56 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 
 // SetSockOptInt sets a socket option.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
+	const inetECNMask = 3
+
 	switch opt {
+	case tcpip.KeepaliveCountOption:
+		e.keepalive.Lock()
+		e.keepalive.count = v
+		e.keepalive.Unlock()
+		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+
+	case tcpip.IPv4TOSOption:
+		e.LockUser()
+		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
+		// ignore the bits for now.
+		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
+		e.UnlockUser()
+
+	case tcpip.IPv6TrafficClassOption:
+		e.LockUser()
+		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
+		// ignore the bits for now.
+		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
+		e.UnlockUser()
+
+	case tcpip.MaxSegOption:
+		userMSS := v
+		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
+			return tcpip.ErrInvalidOptionValue
+		}
+		e.LockUser()
+		e.userMSS = uint16(userMSS)
+		e.UnlockUser()
+		e.notifyProtocolGoroutine(notifyMSSChanged)
+
 	case tcpip.ReceiveBufferSizeOption:
 		// Make sure the receive buffer size is within the min and max
 		// allowed.
 		var rs ReceiveBufferSizeOption
-		size := int(v)
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
-			if size < rs.Min {
-				size = rs.Min
+			if v < rs.Min {
+				v = rs.Min
 			}
-			if size > rs.Max {
-				size = rs.Max
+			if v > rs.Max {
+				v = rs.Max
 			}
 		}
 
 		mask := uint32(notifyReceiveWindowChanged)
 
+		e.LockUser()
 		e.rcvListMu.Lock()
 
 		// Make sure the receive buffer size allows us to send a
@@ -1344,17 +1603,17 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		if e.rcv != nil {
 			scale = e.rcv.rcvWndScale
 		}
-		if size>>scale == 0 {
-			size = 1 << scale
+		if v>>scale == 0 {
+			v = 1 << scale
 		}
 
 		// Make sure 2*size doesn't overflow.
-		if size > math.MaxInt32/2 {
-			size = math.MaxInt32 / 2
+		if v > math.MaxInt32/2 {
+			v = math.MaxInt32 / 2
 		}
 
 		availBefore := e.receiveBufferAvailableLocked()
-		e.rcvBufSize = size
+		e.rcvBufSize = v
 		availAfter := e.receiveBufferAvailableLocked()
 
 		e.rcvAutoParams.disabled = true
@@ -1362,151 +1621,101 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		// Immediately send an ACK to uncork the sender silly window
 		// syndrome prevetion, when our available space grows above aMSS
 		// or half receive buffer, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 			mask |= notifyNonZeroReceiveWindow
 		}
-		e.rcvListMu.Unlock()
 
+		e.rcvListMu.Unlock()
+		e.UnlockUser()
 		e.notifyProtocolGoroutine(mask)
-		return nil
 
 	case tcpip.SendBufferSizeOption:
 		// Make sure the send buffer size is within the min and max
 		// allowed.
-		size := int(v)
 		var ss SendBufferSizeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
-			if size < ss.Min {
-				size = ss.Min
+			if v < ss.Min {
+				v = ss.Min
 			}
-			if size > ss.Max {
-				size = ss.Max
+			if v > ss.Max {
+				v = ss.Max
 			}
 		}
 
 		e.sndBufMu.Lock()
-		e.sndBufSize = size
+		e.sndBufSize = v
 		e.sndBufMu.Unlock()
-		return nil
 
-	case tcpip.DelayOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.delay, 0)
+	case tcpip.TTLOption:
+		e.LockUser()
+		e.ttl = uint8(v)
+		e.UnlockUser()
 
-			// Handle delayed data.
-			e.sndWaker.Assert()
-		} else {
-			atomic.StoreUint32(&e.delay, 1)
+	case tcpip.TCPSynCountOption:
+		if v < 1 || v > 255 {
+			return tcpip.ErrInvalidOptionValue
 		}
-		return nil
+		e.LockUser()
+		e.maxSynRetries = uint8(v)
+		e.UnlockUser()
 
-	default:
-		return nil
+	case tcpip.TCPWindowClampOption:
+		if v == 0 {
+			e.LockUser()
+			switch e.EndpointState() {
+			case StateClose, StateInitial:
+				e.windowClamp = 0
+				e.UnlockUser()
+				return nil
+			default:
+				e.UnlockUser()
+				return tcpip.ErrInvalidOptionValue
+			}
+		}
+		var rs ReceiveBufferSizeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
+			if v < rs.Min/2 {
+				v = rs.Min / 2
+			}
+		}
+		e.LockUser()
+		e.windowClamp = uint32(v)
+		e.UnlockUser()
 	}
+	return nil
 }
 
 // SetSockOpt sets a socket option.
 func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
-	const inetECNMask = 3
 	switch v := opt.(type) {
-	case tcpip.CorkOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.cork, 0)
-
-			// Handle the corked data.
-			e.sndWaker.Assert()
-		} else {
-			atomic.StoreUint32(&e.cork, 1)
-		}
-		return nil
-
-	case tcpip.ReuseAddressOption:
-		e.mu.Lock()
-		e.reuseAddr = v != 0
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.ReusePortOption:
-		e.mu.Lock()
-		e.reusePort = v != 0
-		e.mu.Unlock()
-		return nil
-
 	case tcpip.BindToDeviceOption:
 		id := tcpip.NICID(v)
 		if id != 0 && !e.stack.HasNIC(id) {
 			return tcpip.ErrUnknownDevice
 		}
-		e.mu.Lock()
+		e.LockUser()
 		e.bindToDevice = id
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.QuickAckOption:
-		if v == 0 {
-			atomic.StoreUint32(&e.slowAck, 1)
-		} else {
-			atomic.StoreUint32(&e.slowAck, 0)
-		}
-		return nil
-
-	case tcpip.MaxSegOption:
-		userMSS := v
-		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
-			return tcpip.ErrInvalidOptionValue
-		}
-		e.mu.Lock()
-		e.userMSS = uint16(userMSS)
-		e.mu.Unlock()
-		e.notifyProtocolGoroutine(notifyMSSChanged)
-		return nil
-
-	case tcpip.TTLOption:
-		e.mu.Lock()
-		e.ttl = uint8(v)
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.KeepaliveEnabledOption:
-		e.keepalive.Lock()
-		e.keepalive.enabled = v != 0
-		e.keepalive.Unlock()
-		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-		return nil
+		e.UnlockUser()
 
 	case tcpip.KeepaliveIdleOption:
 		e.keepalive.Lock()
 		e.keepalive.idle = time.Duration(v)
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-		return nil
 
 	case tcpip.KeepaliveIntervalOption:
 		e.keepalive.Lock()
 		e.keepalive.interval = time.Duration(v)
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-		return nil
 
-	case tcpip.KeepaliveCountOption:
-		e.keepalive.Lock()
-		e.keepalive.count = int(v)
-		e.keepalive.Unlock()
-		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-		return nil
+	case tcpip.OutOfBandInlineOption:
+		// We don't currently support disabling this option.
 
 	case tcpip.TCPUserTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		e.userTimeout = time.Duration(v)
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.BroadcastOption:
-		e.mu.Lock()
-		e.broadcast = v != 0
-		e.mu.Unlock()
-		return nil
+		e.UnlockUser()
 
 	case tcpip.CongestionControlOption:
 		// Query the available cc algorithms in the stack and
@@ -1519,22 +1728,16 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		availCC := strings.Split(string(avail), " ")
 		for _, cc := range availCC {
 			if v == tcpip.CongestionControlOption(cc) {
-				// Acquire the work mutex as we may need to
-				// reinitialize the congestion control state.
-				e.mu.Lock()
+				e.LockUser()
 				state := e.EndpointState()
 				e.cc = v
-				e.mu.Unlock()
 				switch state {
 				case StateEstablished:
-					e.workMu.Lock()
-					e.mu.Lock()
 					if e.EndpointState() == state {
 						e.snd.cc = e.snd.initCongestionControl(e.cc)
 					}
-					e.mu.Unlock()
-					e.workMu.Unlock()
 				}
+				e.UnlockUser()
 				return nil
 			}
 		}
@@ -1543,24 +1746,8 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		// control algorithm is specified.
 		return tcpip.ErrNoSuchFile
 
-	case tcpip.IPv4TOSOption:
-		e.mu.Lock()
-		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
-		// ignore the bits for now.
-		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.IPv6TrafficClassOption:
-		e.mu.Lock()
-		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
-		// ignore the bits for now.
-		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
-		e.mu.Unlock()
-		return nil
-
 	case tcpip.TCPLingerTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		if v < 0 {
 			// Same as effectively disabling TCPLinger timeout.
 			v = 0
@@ -1578,27 +1765,26 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 			v = stkTCPLingerTimeout
 		}
 		e.tcpLingerTimeout = time.Duration(v)
-		e.mu.Unlock()
-		return nil
+		e.UnlockUser()
 
 	case tcpip.TCPDeferAcceptOption:
-		e.mu.Lock()
+		e.LockUser()
 		if time.Duration(v) > MaxRTO {
 			v = tcpip.TCPDeferAcceptOption(MaxRTO)
 		}
 		e.deferAccept = time.Duration(v)
-		e.mu.Unlock()
-		return nil
+		e.UnlockUser()
 
 	default:
 		return nil
 	}
+	return nil
 }
 
 // readyReceiveSize returns the number of bytes ready to be received.
 func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	// The endpoint cannot be in listen state.
 	if e.EndpointState() == StateListen {
@@ -1614,25 +1800,89 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
+	case tcpip.BroadcastOption:
+		e.LockUser()
+		v := e.broadcast
+		e.UnlockUser()
+		return v, nil
+
+	case tcpip.CorkOption:
+		return atomic.LoadUint32(&e.cork) != 0, nil
+
+	case tcpip.DelayOption:
+		return atomic.LoadUint32(&e.delay) != 0, nil
+
+	case tcpip.KeepaliveEnabledOption:
+		e.keepalive.Lock()
+		v := e.keepalive.enabled
+		e.keepalive.Unlock()
+
+		return v, nil
+
+	case tcpip.QuickAckOption:
+		v := atomic.LoadUint32(&e.slowAck) == 0
+		return v, nil
+
+	case tcpip.ReuseAddressOption:
+		e.LockUser()
+		v := e.reuseAddr
+		e.UnlockUser()
+
+		return v, nil
+
+	case tcpip.ReusePortOption:
+		e.LockUser()
+		v := e.reusePort
+		e.UnlockUser()
+
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
 			return false, tcpip.ErrUnknownProtocolOption
 		}
 
-		e.mu.Lock()
+		e.LockUser()
 		v := e.v6only
-		e.mu.Unlock()
+		e.UnlockUser()
 
 		return v, nil
-	}
 
-	return false, tcpip.ErrUnknownProtocolOption
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
+	case tcpip.KeepaliveCountOption:
+		e.keepalive.Lock()
+		v := e.keepalive.count
+		e.keepalive.Unlock()
+		return v, nil
+
+	case tcpip.IPv4TOSOption:
+		e.LockUser()
+		v := int(e.sendTOS)
+		e.UnlockUser()
+		return v, nil
+
+	case tcpip.IPv6TrafficClassOption:
+		e.LockUser()
+		v := int(e.sendTOS)
+		e.UnlockUser()
+		return v, nil
+
+	case tcpip.MaxSegOption:
+		// This is just stubbed out. Linux never returns the user_mss
+		// value as it either returns the defaultMSS or returns the
+		// actual current MSS. Netstack just returns the defaultMSS
+		// always for now.
+		v := header.TCPDefaultMSS
+		return v, nil
+
 	case tcpip.ReceiveQueueSizeOption:
 		return e.readyReceiveSize()
 
@@ -1648,12 +1898,23 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		e.rcvListMu.Unlock()
 		return v, nil
 
-	case tcpip.DelayOption:
-		var o int
-		if v := atomic.LoadUint32(&e.delay); v != 0 {
-			o = 1
-		}
-		return o, nil
+	case tcpip.TTLOption:
+		e.LockUser()
+		v := int(e.ttl)
+		e.UnlockUser()
+		return v, nil
+
+	case tcpip.TCPSynCountOption:
+		e.LockUser()
+		v := int(e.maxSynRetries)
+		e.UnlockUser()
+		return v, nil
+
+	case tcpip.TCPWindowClampOption:
+		e.LockUser()
+		v := int(e.windowClamp)
+		e.UnlockUser()
+		return v, nil
 
 	default:
 		return -1, tcpip.ErrUnknownProtocolOption
@@ -1670,168 +1931,71 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.lastErrorMu.Unlock()
 		return err
 
-	case *tcpip.MaxSegOption:
-		// This is just stubbed out. Linux never returns the user_mss
-		// value as it either returns the defaultMSS or returns the
-		// actual current MSS. Netstack just returns the defaultMSS
-		// always for now.
-		*o = header.TCPDefaultMSS
-		return nil
-
-	case *tcpip.CorkOption:
-		*o = 0
-		if v := atomic.LoadUint32(&e.cork); v != 0 {
-			*o = 1
-		}
-		return nil
-
-	case *tcpip.ReuseAddressOption:
-		e.mu.RLock()
-		v := e.reuseAddr
-		e.mu.RUnlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
-
-	case *tcpip.ReusePortOption:
-		e.mu.RLock()
-		v := e.reusePort
-		e.mu.RUnlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
-
 	case *tcpip.BindToDeviceOption:
-		e.mu.RLock()
+		e.LockUser()
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
-		e.mu.RUnlock()
-		return nil
-
-	case *tcpip.QuickAckOption:
-		*o = 1
-		if v := atomic.LoadUint32(&e.slowAck); v != 0 {
-			*o = 0
-		}
-		return nil
-
-	case *tcpip.TTLOption:
-		e.mu.Lock()
-		*o = tcpip.TTLOption(e.ttl)
-		e.mu.Unlock()
-		return nil
+		e.UnlockUser()
 
 	case *tcpip.TCPInfoOption:
 		*o = tcpip.TCPInfoOption{}
-		e.mu.RLock()
+		e.LockUser()
 		snd := e.snd
-		e.mu.RUnlock()
+		e.UnlockUser()
 		if snd != nil {
 			snd.rtt.Lock()
 			o.RTT = snd.rtt.srtt
 			o.RTTVar = snd.rtt.rttvar
 			snd.rtt.Unlock()
 		}
-		return nil
-
-	case *tcpip.KeepaliveEnabledOption:
-		e.keepalive.Lock()
-		v := e.keepalive.enabled
-		e.keepalive.Unlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
 
 	case *tcpip.KeepaliveIdleOption:
 		e.keepalive.Lock()
 		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
 		e.keepalive.Unlock()
-		return nil
 
 	case *tcpip.KeepaliveIntervalOption:
 		e.keepalive.Lock()
 		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
 		e.keepalive.Unlock()
-		return nil
-
-	case *tcpip.KeepaliveCountOption:
-		e.keepalive.Lock()
-		*o = tcpip.KeepaliveCountOption(e.keepalive.count)
-		e.keepalive.Unlock()
-		return nil
 
 	case *tcpip.TCPUserTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
-		e.mu.Unlock()
-		return nil
+		e.UnlockUser()
 
 	case *tcpip.OutOfBandInlineOption:
 		// We don't currently support disabling this option.
 		*o = 1
-		return nil
-
-	case *tcpip.BroadcastOption:
-		e.mu.Lock()
-		v := e.broadcast
-		e.mu.Unlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
 
 	case *tcpip.CongestionControlOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = e.cc
-		e.mu.Unlock()
-		return nil
-
-	case *tcpip.IPv4TOSOption:
-		e.mu.RLock()
-		*o = tcpip.IPv4TOSOption(e.sendTOS)
-		e.mu.RUnlock()
-		return nil
-
-	case *tcpip.IPv6TrafficClassOption:
-		e.mu.RLock()
-		*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
-		e.mu.RUnlock()
-		return nil
+		e.UnlockUser()
 
 	case *tcpip.TCPLingerTimeoutOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
-		e.mu.Unlock()
-		return nil
+		e.UnlockUser()
 
 	case *tcpip.TCPDeferAcceptOption:
-		e.mu.Lock()
+		e.LockUser()
 		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
-		e.mu.Unlock()
-		return nil
+		e.UnlockUser()
 
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
+	return nil
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -1856,12 +2020,12 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 // yet accepted by the app, they are restored without running the main goroutine
 // here.
 func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error {
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	connectingAddr := addr.Addr
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -2026,13 +2190,17 @@ func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
 // Shutdown closes the read and/or write end of the endpoint connection to its
 // peer.
 func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
-	e.mu.Lock()
+	e.LockUser()
+	defer e.UnlockUser()
+	return e.shutdownLocked(flags)
+}
+
+func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 	e.shutdownFlags |= flags
-	finQueued := false
 	switch {
 	case e.EndpointState().connected():
 		// Close for read.
-		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
+		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
 			// Mark read side as closed.
 			e.rcvListMu.Lock()
 			e.rcvClosed = true
@@ -2041,69 +2209,56 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
-			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
-				e.mu.Unlock()
-				// Try to send an active reset immediately if the
-				// work mutex is available.
-				if e.workMu.TryLock() {
-					e.mu.Lock()
-					// We need to double check here to make
-					// sure worker has not transitioned the
-					// endpoint out of a connected state
-					// before trying to send a reset.
-					if e.EndpointState().connected() {
-						e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-						e.notifyProtocolGoroutine(notifyTickleWorker)
-					}
-					e.mu.Unlock()
-					e.workMu.Unlock()
-				} else {
-					e.notifyProtocolGoroutine(notifyReset)
-				}
+			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
+				e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+				// Wake up worker to terminate loop.
+				e.notifyProtocolGoroutine(notifyTickleWorker)
 				return nil
 			}
 		}
 
 		// Close for write.
-		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
+		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
 			e.sndBufMu.Lock()
-
 			if e.sndClosed {
 				// Already closed.
 				e.sndBufMu.Unlock()
-				break
+				if e.EndpointState() == StateTimeWait {
+					return tcpip.ErrNotConnected
+				}
+				return nil
 			}
 
 			// Queue fin segment.
 			s := newSegmentFromView(&e.route, e.ID, nil)
 			e.sndQueue.PushBack(s)
 			e.sndBufInQueue++
-			finQueued = true
 			// Mark endpoint as closed.
 			e.sndClosed = true
 			e.sndBufMu.Unlock()
+			e.handleClose()
 		}
 
+		return nil
 	case e.EndpointState() == StateListen:
-		// Tell protocolListenLoop to stop.
-		if flags&tcpip.ShutdownRead != 0 {
-			e.notifyProtocolGoroutine(notifyClose)
+		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
+			// Reset all connections from the accept queue and keep the
+			// worker running so that it can continue handling incoming
+			// segments by replying with RST.
+			//
+			// By not removing this endpoint from the demuxer mapping, we
+			// ensure that any other bind to the same port fails, as on Linux.
+			e.rcvListMu.Lock()
+			e.rcvClosed = true
+			e.rcvListMu.Unlock()
+			e.closePendingAcceptableConnectionsLocked()
+			// Notify waiters that the endpoint is shutdown.
+			e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
 		}
+		return nil
 	default:
-		e.mu.Unlock()
 		return tcpip.ErrNotConnected
 	}
-	e.mu.Unlock()
-	if finQueued {
-		if e.workMu.TryLock() {
-			e.handleClose()
-			e.workMu.Unlock()
-		} else {
-			// Tell protocol goroutine to close.
-			e.sndCloseWaker.Assert()
-		}
-	}
-	return nil
 }
 
 // Listen puts the endpoint in "listen" mode, which allows it to accept
@@ -2118,28 +2273,40 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error {
 }
 
 func (e *endpoint) listen(backlog int) *tcpip.Error {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-
-	// Allow the backlog to be adjusted if the endpoint is not shutting down.
-	// When the endpoint shuts down, it sets workerCleanup to true, and from
-	// that point onward, acceptedChan is the responsibility of the cleanup()
-	// method (and should not be touched anywhere else, including here).
-	if e.EndpointState() == StateListen && !e.workerCleanup {
-		// Adjust the size of the channel iff we can fix existing
-		// pending connections into the new one.
-		if len(e.acceptedChan) > backlog {
-			return tcpip.ErrInvalidEndpointState
-		}
-		if cap(e.acceptedChan) == backlog {
-			return nil
-		}
-		origChan := e.acceptedChan
-		e.acceptedChan = make(chan *endpoint, backlog)
-		close(origChan)
-		for ep := range origChan {
-			e.acceptedChan <- ep
+	e.LockUser()
+	defer e.UnlockUser()
+
+	if e.EndpointState() == StateListen && !e.closed {
+		e.acceptMu.Lock()
+		defer e.acceptMu.Unlock()
+		if e.acceptedChan == nil {
+			// listen is called after shutdown.
+			e.acceptedChan = make(chan *endpoint, backlog)
+			e.shutdownFlags = 0
+			e.rcvListMu.Lock()
+			e.rcvClosed = false
+			e.rcvListMu.Unlock()
+		} else {
+			// Adjust the size of the channel iff we can fix
+			// existing pending connections into the new one.
+			if len(e.acceptedChan) > backlog {
+				return tcpip.ErrInvalidEndpointState
+			}
+			if cap(e.acceptedChan) == backlog {
+				return nil
+			}
+			origChan := e.acceptedChan
+			e.acceptedChan = make(chan *endpoint, backlog)
+			close(origChan)
+			for ep := range origChan {
+				e.acceptedChan <- ep
+			}
 		}
+
+		// Notify any blocked goroutines that they can attempt to
+		// deliver endpoints again.
+		e.acceptCond.Broadcast()
+
 		return nil
 	}
 
@@ -2169,9 +2336,12 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	// The channel may be non-nil when we're restoring the endpoint, and it
 	// may be pre-populated with some previously accepted (but not Accepted)
 	// endpoints.
+	e.acceptMu.Lock()
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
+	e.acceptMu.Unlock()
+
 	e.workerRunning = true
 	go e.protocolListenLoop( // S/R-SAFE: drained on save.
 		seqnum.Size(e.receiveBufferAvailable()))
@@ -2181,7 +2351,6 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
 func (e *endpoint) startAcceptedLoop() {
-	e.mu.Lock()
 	e.workerRunning = true
 	e.mu.Unlock()
 	wakerInitDone := make(chan struct{})
@@ -2192,18 +2361,24 @@ func (e *endpoint) startAcceptedLoop() {
 // Accept returns a new endpoint if a peer has established a connection
 // to an endpoint previously set to listen mode.
 func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
+	e.rcvListMu.Lock()
+	rcvClosed := e.rcvClosed
+	e.rcvListMu.Unlock()
 	// Endpoint must be in listen state before it can accept connections.
-	if e.EndpointState() != StateListen {
+	if rcvClosed || e.EndpointState() != StateListen {
 		return nil, nil, tcpip.ErrInvalidEndpointState
 	}
 
 	// Get the new accepted endpoint.
+	e.acceptMu.Lock()
+	defer e.acceptMu.Unlock()
 	var n *endpoint
 	select {
 	case n = <-e.acceptedChan:
+		e.acceptCond.Signal()
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
@@ -2212,8 +2387,8 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 // Bind binds the endpoint to a specific local port and optionally address.
 func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
-	e.mu.Lock()
-	defer e.mu.Unlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	return e.bindLocked(addr)
 }
@@ -2227,7 +2402,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	}
 
 	e.BindAddr = addr.Addr
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -2291,8 +2466,8 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 
 // GetLocalAddress returns the address to which the endpoint is bound.
 func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	return tcpip.FullAddress{
 		Addr: e.ID.LocalAddress,
@@ -2303,8 +2478,8 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 
 // GetRemoteAddress returns the address to which the endpoint is connected.
 func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
+	e.LockUser()
+	defer e.UnlockUser()
 
 	if !e.EndpointState().connected() {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
@@ -2317,7 +2492,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	}, nil
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	// TCP HandlePacket is not required anymore as inbound packets first
 	// land at the Dispatcher which then can either delivery using the
 	// worker go routine or directly do the invoke the tcp processing inline
@@ -2336,7 +2511,7 @@ func (e *endpoint) enqueueSegment(s *segment) bool {
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 	switch typ {
 	case stack.ControlPacketTooBig:
 		e.sndBufMu.Lock()
@@ -2377,7 +2552,7 @@ func (e *endpoint) readyToRead(s *segment) {
 		e.rcvBufUsed += s.data.Size()
 		// Increase counter if the receive window falls down below MSS
 		// or half receive buffer size, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(-s.data.Size()); crossed && !above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above {
 			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 		}
 		e.rcvList.PushBack(s)
@@ -2385,7 +2560,6 @@ func (e *endpoint) readyToRead(s *segment) {
 		e.rcvClosed = true
 	}
 	e.rcvListMu.Unlock()
-
 	e.waiterQueue.Notify(waiter.EventIn)
 }
 
@@ -2529,9 +2703,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 	s.SegTime = time.Now()
 
 	// Copy EndpointID.
-	e.mu.Lock()
 	s.ID = stack.TCPEndpointID(e.ID)
-	e.mu.Unlock()
 
 	// Copy endpoint rcv state.
 	e.rcvListMu.Lock()
@@ -2661,10 +2833,10 @@ func (e *endpoint) State() uint32 {
 
 // Info returns a copy of the endpoint info.
 func (e *endpoint) Info() tcpip.EndpointInfo {
-	e.mu.RLock()
+	e.LockUser()
 	// Make a copy of the endpoint info.
 	ret := e.EndpointInfo
-	e.mu.RUnlock()
+	e.UnlockUser()
 	return &ret
 }
 
@@ -2679,9 +2851,9 @@ func (e *endpoint) Wait() {
 	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
 	defer e.waiterQueue.EventUnregister(&waitEntry)
 	for {
-		e.mu.Lock()
+		e.LockUser()
 		running := e.workerRunning
-		e.mu.Unlock()
+		e.UnlockUser()
 		if !running {
 			break
 		}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 4a46f0ec5..cbb779666 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -49,11 +49,10 @@ func (e *endpoint) beforeSave() {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	switch e.EndpointState() {
-	case StateInitial, StateBound:
-		// TODO(b/138137272): this enumeration duplicates
-		// EndpointState.connected. remove it.
-	case StateEstablished, StateSynSent, StateSynRecv, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
+	epState := e.EndpointState()
+	switch {
+	case epState == StateInitial || epState == StateBound:
+	case epState.connected() || epState.handshake():
 		if e.route.Capabilities()&stack.CapabilitySaveRestore == 0 {
 			if e.route.Capabilities()&stack.CapabilityDisconnectOk == 0 {
 				panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%d, remote %v:%d", e.ID.LocalAddress, e.ID.LocalPort, e.ID.RemoteAddress, e.ID.RemotePort)})
@@ -69,15 +68,16 @@ func (e *endpoint) beforeSave() {
 			break
 		}
 		fallthrough
-	case StateListen, StateConnecting:
+	case epState == StateListen || epState == StateConnecting:
 		e.drainSegmentLocked()
-		if e.EndpointState() != StateClose && e.EndpointState() != StateError {
+		// Refresh epState, since drainSegmentLocked may have changed it.
+		epState = e.EndpointState()
+		if !epState.closed() {
 			if !e.workerRunning {
 				panic("endpoint has no worker running in listen, connecting, or connected state")
 			}
-			break
 		}
-	case StateError, StateClose:
+	case epState.closed():
 		for e.workerRunning {
 			e.mu.Unlock()
 			time.Sleep(100 * time.Millisecond)
@@ -148,23 +148,23 @@ var connectingLoading sync.WaitGroup
 // Bound endpoint loading happens last.
 
 // loadState is invoked by stateify.
-func (e *endpoint) loadState(state EndpointState) {
+func (e *endpoint) loadState(epState EndpointState) {
 	// This is to ensure that the loading wait groups include all applicable
 	// endpoints before any asynchronous calls to the Wait() methods.
 	// For restore purposes we treat TimeWait like a connected endpoint.
-	if state.connected() || state == StateTimeWait {
+	if epState.connected() || epState == StateTimeWait {
 		connectedLoading.Add(1)
 	}
-	switch state {
-	case StateListen:
+	switch {
+	case epState == StateListen:
 		listenLoading.Add(1)
-	case StateConnecting, StateSynSent, StateSynRecv:
+	case epState.connecting():
 		connectingLoading.Add(1)
 	}
 	// Directly update the state here rather than using e.setEndpointState
-	// as the endpoint is still being loaded and the stack reference to increment
-	// metrics is not yet initialized.
-	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+	// as the endpoint is still being loaded and the stack reference is not
+	// yet initialized.
+	atomic.StoreUint32((*uint32)(&e.state), uint32(epState))
 }
 
 // afterLoad is invoked by stateify.
@@ -173,6 +173,9 @@ func (e *endpoint) afterLoad() {
 	// Restore the endpoint to InitialState as it will be moved to
 	// its origEndpointState during Resume.
 	e.state = StateInitial
+	// Condition variables and mutexs are not S/R'ed so reinitialize
+	// acceptCond with e.acceptMu.
+	e.acceptCond = sync.NewCond(&e.acceptMu)
 	stack.StackFromEnv.RegisterRestoredEndpoint(e)
 }
 
@@ -180,9 +183,8 @@ func (e *endpoint) afterLoad() {
 func (e *endpoint) Resume(s *stack.Stack) {
 	e.stack = s
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
-	e.workMu.Init()
-	state := e.origEndpointState
-	switch state {
+	epState := e.origEndpointState
+	switch epState {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
 		var ss SendBufferSizeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
@@ -206,8 +208,8 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		}
 	}
 
-	switch state {
-	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
+	switch {
+	case epState.connected():
 		bind()
 		if len(e.connectingAddress) == 0 {
 			e.connectingAddress = e.ID.RemoteAddress
@@ -230,13 +232,13 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		closed := e.closed
 		e.mu.Unlock()
 		e.notifyProtocolGoroutine(notifyTickleWorker)
-		if state == StateFinWait2 && closed {
+		if epState == StateFinWait2 && closed {
 			// If the endpoint has been closed then make sure we notify so
 			// that the FIN_WAIT2 timer is started after a restore.
 			e.notifyProtocolGoroutine(notifyClose)
 		}
 		connectedLoading.Done()
-	case StateListen:
+	case epState == StateListen:
 		tcpip.AsyncLoading.Add(1)
 		go func() {
 			connectedLoading.Wait()
@@ -245,10 +247,15 @@ func (e *endpoint) Resume(s *stack.Stack) {
 			if err := e.Listen(backlog); err != nil {
 				panic("endpoint listening failed: " + err.String())
 			}
+			e.LockUser()
+			if e.shutdownFlags != 0 {
+				e.shutdownLocked(e.shutdownFlags)
+			}
+			e.UnlockUser()
 			listenLoading.Done()
 			tcpip.AsyncLoading.Done()
 		}()
-	case StateConnecting, StateSynSent, StateSynRecv:
+	case epState.connecting():
 		tcpip.AsyncLoading.Add(1)
 		go func() {
 			connectedLoading.Wait()
@@ -260,7 +267,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 			connectingLoading.Done()
 			tcpip.AsyncLoading.Done()
 		}()
-	case StateBound:
+	case epState == StateBound:
 		tcpip.AsyncLoading.Add(1)
 		go func() {
 			connectedLoading.Wait()
@@ -269,7 +276,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 			bind()
 			tcpip.AsyncLoading.Done()
 		}()
-	case StateClose:
+	case epState == StateClose:
 		if e.isPortReserved {
 			tcpip.AsyncLoading.Add(1)
 			go func() {
@@ -284,12 +291,11 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		e.state = StateClose
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
-	case StateError:
+	case epState == StateError:
 		e.state = StateError
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
 	}
-
 }
 
 // saveLastError is invoked by stateify.
@@ -307,7 +313,7 @@ func (e *endpoint) loadLastError(s string) {
 		return
 	}
 
-	e.lastError = loadError(s)
+	e.lastError = tcpip.StringToError(s)
 }
 
 // saveHardError is invoked by stateify.
@@ -325,71 +331,7 @@ func (e *EndpointInfo) loadHardError(s string) {
 		return
 	}
 
-	e.HardError = loadError(s)
-}
-
-var messageToError map[string]*tcpip.Error
-
-var populate sync.Once
-
-func loadError(s string) *tcpip.Error {
-	populate.Do(func() {
-		var errors = []*tcpip.Error{
-			tcpip.ErrUnknownProtocol,
-			tcpip.ErrUnknownNICID,
-			tcpip.ErrUnknownDevice,
-			tcpip.ErrUnknownProtocolOption,
-			tcpip.ErrDuplicateNICID,
-			tcpip.ErrDuplicateAddress,
-			tcpip.ErrNoRoute,
-			tcpip.ErrBadLinkEndpoint,
-			tcpip.ErrAlreadyBound,
-			tcpip.ErrInvalidEndpointState,
-			tcpip.ErrAlreadyConnecting,
-			tcpip.ErrAlreadyConnected,
-			tcpip.ErrNoPortAvailable,
-			tcpip.ErrPortInUse,
-			tcpip.ErrBadLocalAddress,
-			tcpip.ErrClosedForSend,
-			tcpip.ErrClosedForReceive,
-			tcpip.ErrWouldBlock,
-			tcpip.ErrConnectionRefused,
-			tcpip.ErrTimeout,
-			tcpip.ErrAborted,
-			tcpip.ErrConnectStarted,
-			tcpip.ErrDestinationRequired,
-			tcpip.ErrNotSupported,
-			tcpip.ErrQueueSizeNotSupported,
-			tcpip.ErrNotConnected,
-			tcpip.ErrConnectionReset,
-			tcpip.ErrConnectionAborted,
-			tcpip.ErrNoSuchFile,
-			tcpip.ErrInvalidOptionValue,
-			tcpip.ErrNoLinkAddress,
-			tcpip.ErrBadAddress,
-			tcpip.ErrNetworkUnreachable,
-			tcpip.ErrMessageTooLong,
-			tcpip.ErrNoBufferSpace,
-			tcpip.ErrBroadcastDisabled,
-			tcpip.ErrNotPermitted,
-			tcpip.ErrAddressFamilyNotSupported,
-		}
-
-		messageToError = make(map[string]*tcpip.Error)
-		for _, e := range errors {
-			if messageToError[e.String()] != nil {
-				panic("tcpip errors with duplicated message: " + e.String())
-			}
-			messageToError[e.String()] = e
-		}
-	})
-
-	e, ok := messageToError[s]
-	if !ok {
-		panic("unknown error message: " + s)
-	}
-
-	return e
+	e.HardError = tcpip.StringToError(s)
 }
 
 // saveMeasureTime is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index c9ee5bf06..070b634b4 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -61,7 +61,7 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
@@ -130,7 +130,7 @@ func (r *ForwarderRequest) Complete(sendReset bool) {
 
 	// If the caller requested, send a reset.
 	if sendReset {
-		replyWithReset(r.segment)
+		replyWithReset(r.segment, stack.DefaultTOS, r.segment.route.DefaultTTL())
 	}
 
 	// Release all resources.
@@ -157,7 +157,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 		TSVal:         r.synOptions.TSVal,
 		TSEcr:         r.synOptions.TSEcr,
 		SACKPermitted: r.synOptions.SACKPermitted,
-	}, queue)
+	}, queue, nil)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 958c06fa7..73b8a6782 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -21,6 +21,7 @@
 package tcp
 
 import (
+	"fmt"
 	"runtime"
 	"strings"
 	"time"
@@ -64,6 +65,10 @@ const (
 	// DefaultTCPTimeWaitTimeout is the amount of time that sockets linger
 	// in TIME_WAIT state before being marked closed.
 	DefaultTCPTimeWaitTimeout = 60 * time.Second
+
+	// DefaultSynRetries is the default value for the number of SYN retransmits
+	// before a connect is aborted.
+	DefaultSynRetries = 6
 )
 
 // SACKEnabled option can be used to enable SACK support in the TCP
@@ -94,8 +99,65 @@ const (
 	ccCubic = "cubic"
 )
 
+// syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The
+// value is protected by a mutex so that we can increment only when it's
+// guaranteed not to go above a threshold.
+type synRcvdCounter struct {
+	sync.Mutex
+	value     uint64
+	pending   sync.WaitGroup
+	threshold uint64
+}
+
+// inc tries to increment the global number of endpoints in SYN-RCVD state. It
+// succeeds if the increment doesn't make the count go beyond the threshold, and
+// fails otherwise.
+func (s *synRcvdCounter) inc() bool {
+	s.Lock()
+	defer s.Unlock()
+	if s.value >= s.threshold {
+		return false
+	}
+
+	s.pending.Add(1)
+	s.value++
+
+	return true
+}
+
+// dec atomically decrements the global number of endpoints in SYN-RCVD
+// state. It must only be called if a previous call to inc succeeded.
+func (s *synRcvdCounter) dec() {
+	s.Lock()
+	defer s.Unlock()
+	s.value--
+	s.pending.Done()
+}
+
+// synCookiesInUse returns true if the synRcvdCount is greater than
+// SynRcvdCountThreshold.
+func (s *synRcvdCounter) synCookiesInUse() bool {
+	s.Lock()
+	defer s.Unlock()
+	return s.value >= s.threshold
+}
+
+// SetThreshold sets synRcvdCounter.Threshold to ths new threshold.
+func (s *synRcvdCounter) SetThreshold(threshold uint64) {
+	s.Lock()
+	defer s.Unlock()
+	s.threshold = threshold
+}
+
+// Threshold returns the current value of synRcvdCounter.Threhsold.
+func (s *synRcvdCounter) Threshold() uint64 {
+	s.Lock()
+	defer s.Unlock()
+	return s.threshold
+}
+
 type protocol struct {
-	mu                         sync.Mutex
+	mu                         sync.RWMutex
 	sackEnabled                bool
 	delayEnabled               bool
 	sendBufferSize             SendBufferSizeOption
@@ -105,6 +167,11 @@ type protocol struct {
 	moderateReceiveBuffer      bool
 	tcpLingerTimeout           time.Duration
 	tcpTimeWaitTimeout         time.Duration
+	minRTO                     time.Duration
+	maxRTO                     time.Duration
+	maxRetries                 uint32
+	synRcvdCount               synRcvdCounter
+	synRetries                 uint8
 	dispatcher                 *dispatcher
 }
 
@@ -140,7 +207,7 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // to a specific processing queue. Each queue is serviced by its own processor
 // goroutine which is responsible for dequeuing and doing full TCP dispatch of
 // the packet.
-func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	p.dispatcher.queuePacket(r, ep, id, pkt)
 }
 
@@ -151,7 +218,7 @@ func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id st
 // a reset is sent in response to any incoming segment except another reset. In
 // particular, SYNs addressed to a non-existent connection are rejected by this
 // means."
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
@@ -164,12 +231,12 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo
 		return true
 	}
 
-	replyWithReset(s)
+	replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
 	return true
 }
 
 // replyWithReset replies to the given segment with a reset segment.
-func replyWithReset(s *segment) {
+func replyWithReset(s *segment, tos, ttl uint8) {
 	// Get the seqnum from the packet if the ack flag is set.
 	seq := seqnum.Value(0)
 	ack := seqnum.Value(0)
@@ -191,10 +258,18 @@ func replyWithReset(s *segment) {
 		flags |= header.TCPFlagAck
 		ack = s.sequenceNumber.Add(s.logicalLen())
 	}
-	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
+	sendTCP(&s.route, tcpFields{
+		id:     s.id,
+		ttl:    ttl,
+		tos:    tos,
+		flags:  flags,
+		seq:    seq,
+		ack:    ack,
+		rcvWnd: 0,
+	}, buffer.VectorisedView{}, nil /* gso */, nil /* PacketOwner */)
 }
 
-// SetOption implements TransportProtocol.SetOption.
+// SetOption implements stack.TransportProtocol.SetOption.
 func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 	switch v := option.(type) {
 	case SACKEnabled:
@@ -264,66 +339,135 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case tcpip.TCPMinRTOOption:
+		if v < 0 {
+			v = tcpip.TCPMinRTOOption(MinRTO)
+		}
+		p.mu.Lock()
+		p.minRTO = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPMaxRTOOption:
+		if v < 0 {
+			v = tcpip.TCPMaxRTOOption(MaxRTO)
+		}
+		p.mu.Lock()
+		p.maxRTO = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPMaxRetriesOption:
+		p.mu.Lock()
+		p.maxRetries = uint32(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPSynRcvdCountThresholdOption:
+		p.mu.Lock()
+		p.synRcvdCount.SetThreshold(uint64(v))
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPSynRetriesOption:
+		if v < 1 || v > 255 {
+			return tcpip.ErrInvalidOptionValue
+		}
+		p.mu.Lock()
+		p.synRetries = uint8(v)
+		p.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-// Option implements TransportProtocol.Option.
+// Option implements stack.TransportProtocol.Option.
 func (p *protocol) Option(option interface{}) *tcpip.Error {
 	switch v := option.(type) {
 	case *SACKEnabled:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = SACKEnabled(p.sackEnabled)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *DelayEnabled:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = DelayEnabled(p.delayEnabled)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *SendBufferSizeOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = p.sendBufferSize
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *ReceiveBufferSizeOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = p.recvBufferSize
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.CongestionControlOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.CongestionControlOption(p.congestionControl)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.AvailableCongestionControlOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.ModerateReceiveBufferOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.TCPLingerTimeoutOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout)
-		p.mu.Unlock()
+		p.mu.RUnlock()
 		return nil
 
 	case *tcpip.TCPTimeWaitTimeoutOption:
-		p.mu.Lock()
+		p.mu.RLock()
 		*v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout)
-		p.mu.Unlock()
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPMinRTOOption:
+		p.mu.RLock()
+		*v = tcpip.TCPMinRTOOption(p.minRTO)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPMaxRTOOption:
+		p.mu.RLock()
+		*v = tcpip.TCPMaxRTOOption(p.maxRTO)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPMaxRetriesOption:
+		p.mu.RLock()
+		*v = tcpip.TCPMaxRetriesOption(p.maxRetries)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPSynRcvdCountThresholdOption:
+		p.mu.RLock()
+		*v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold())
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPSynRetriesOption:
+		p.mu.RLock()
+		*v = tcpip.TCPSynRetriesOption(p.synRetries)
+		p.mu.RUnlock()
 		return nil
 
 	default:
@@ -331,6 +475,42 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 	}
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (p *protocol) Close() {
+	p.dispatcher.close()
+}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (p *protocol) Wait() {
+	p.dispatcher.wait()
+}
+
+// SynRcvdCounter returns a reference to the synRcvdCount for this protocol
+// instance.
+func (p *protocol) SynRcvdCounter() *synRcvdCounter {
+	return &p.synRcvdCount
+}
+
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	hdr, ok := pkt.Data.PullUp(header.TCPMinimumSize)
+	if !ok {
+		return false
+	}
+
+	// If the header has options, pull those up as well.
+	if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() {
+		hdr, ok = pkt.Data.PullUp(offset)
+		if !ok {
+			panic(fmt.Sprintf("There should be at least %d bytes in pkt.Data.", offset))
+		}
+	}
+
+	pkt.TransportHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	return true
+}
+
 // NewProtocol returns a TCP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{
@@ -340,6 +520,11 @@ func NewProtocol() stack.TransportProtocol {
 		availableCongestionControl: []string{ccReno, ccCubic},
 		tcpLingerTimeout:           DefaultTCPLingerTimeout,
 		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
+		synRcvdCount:               synRcvdCounter{threshold: SynRcvdCountThreshold},
 		dispatcher:                 newDispatcher(runtime.GOMAXPROCS(0)),
+		synRetries:                 DefaultSynRetries,
+		minRTO:                     MinRTO,
+		maxRTO:                     MaxRTO,
+		maxRetries:                 MaxRetries,
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 958f03ac1..dd89a292a 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -70,13 +70,16 @@ func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale
 // acceptable checks if the segment sequence number range is acceptable
 // according to the table on page 26 of RFC 793.
 func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
-	rcvWnd := r.rcvNxt.Size(r.rcvAcc)
-	if rcvWnd == 0 {
-		return segLen == 0 && segSeq == r.rcvNxt
+	// r.rcvWnd could be much larger than the window size we advertised in our
+	// outgoing packets, we should use what we have advertised for acceptability
+	// test.
+	scaledWindowSize := r.rcvWnd >> r.rcvWndScale
+	if scaledWindowSize > 0xffff {
+		// This is what we actually put in the Window field.
+		scaledWindowSize = 0xffff
 	}
-
-	return segSeq.InWindow(r.rcvNxt, rcvWnd) ||
-		seqnum.Overlap(r.rcvNxt, rcvWnd, segSeq, segLen)
+	advertisedWindowSize := scaledWindowSize << r.rcvWndScale
+	return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize))
 }
 
 // getSendParams returns the parameters needed by the sender when building
@@ -168,7 +171,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 
 		// We just received a FIN, our next state depends on whether we sent a
 		// FIN already or not.
-		r.ep.mu.Lock()
 		switch r.ep.EndpointState() {
 		case StateEstablished:
 			r.ep.setEndpointState(StateCloseWait)
@@ -183,7 +185,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		case StateFinWait2:
 			r.ep.setEndpointState(StateTimeWait)
 		}
-		r.ep.mu.Unlock()
 
 		// Flush out any pending segments, except the very first one if
 		// it happens to be the one we're handling now because the
@@ -195,6 +196,10 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 
 		for i := first; i < len(r.pendingRcvdSegments); i++ {
 			r.pendingRcvdSegments[i].decRef()
+			// Note that slice truncation does not allow garbage collection of
+			// truncated items, thus truncated items must be set to nil to avoid
+			// memory leaks.
+			r.pendingRcvdSegments[i] = nil
 		}
 		r.pendingRcvdSegments = r.pendingRcvdSegments[:first]
 
@@ -204,7 +209,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 	// Handle ACK (not FIN-ACK, which we handled above) during one of the
 	// shutdown states.
 	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
-		r.ep.mu.Lock()
 		switch r.ep.EndpointState() {
 		case StateFinWait1:
 			r.ep.setEndpointState(StateFinWait2)
@@ -218,7 +222,6 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		case StateLastAck:
 			r.ep.transitionToStateCloseLocked()
 		}
-		r.ep.mu.Unlock()
 	}
 
 	return true
@@ -265,7 +268,14 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 	// If we are in one of the shutdown states then we need to do
 	// additional checks before we try and process the segment.
 	switch state {
-	case StateCloseWait, StateClosing, StateLastAck:
+	case StateCloseWait:
+		// If the ACK acks something not yet sent then we send an ACK.
+		if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
+			r.ep.snd.sendAck()
+			return true, nil
+		}
+		fallthrough
+	case StateClosing, StateLastAck:
 		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
 			// Just drop the segment as we have
 			// already received a FIN and this
@@ -282,7 +292,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 		// SHUT_RD) then any data past the rcvNxt should
 		// trigger a RST.
 		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
-		if rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
+		if state != StateCloseWait && rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
 			return true, tcpip.ErrConnectionAborted
 		}
 		if state == StateFinWait1 {
@@ -332,17 +342,8 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 // handleRcvdSegment handles TCP segments directed at the connection managed by
 // r as they arrive. It is called by the protocol main loop.
 func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
-	r.ep.mu.RLock()
 	state := r.ep.EndpointState()
 	closed := r.ep.closed
-	r.ep.mu.RUnlock()
-
-	if state != StateEstablished {
-		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
-		if drop || err != nil {
-			return drop, err
-		}
-	}
 
 	segLen := seqnum.Size(s.data.Size())
 	segSeq := s.sequenceNumber
@@ -355,6 +356,13 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 		return true, nil
 	}
 
+	if state != StateEstablished {
+		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
+		if drop || err != nil {
+			return drop, err
+		}
+	}
+
 	// Store the time of the last ack.
 	r.lastRcvdAckTime = time.Now()
 
diff --git a/pkg/tcpip/transport/tcp/rcv_test.go b/pkg/tcpip/transport/tcp/rcv_test.go
new file mode 100644
index 000000000..8a026ec46
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv_test.go
@@ -0,0 +1,74 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package rcv_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+func TestAcceptable(t *testing.T) {
+	for _, tt := range []struct {
+		segSeq         seqnum.Value
+		segLen         seqnum.Size
+		rcvNxt, rcvAcc seqnum.Value
+		want           bool
+	}{
+		// The segment is smaller than the window.
+		{105, 2, 100, 104, false},
+		{105, 2, 101, 105, true},
+		{105, 2, 102, 106, true},
+		{105, 2, 103, 107, true},
+		{105, 2, 104, 108, true},
+		{105, 2, 105, 109, true},
+		{105, 2, 106, 110, true},
+		{105, 2, 107, 111, false},
+
+		// The segment is larger than the window.
+		{105, 4, 103, 105, true},
+		{105, 4, 104, 106, true},
+		{105, 4, 105, 107, true},
+		{105, 4, 106, 108, true},
+		{105, 4, 107, 109, true},
+		{105, 4, 108, 110, true},
+		{105, 4, 109, 111, false},
+		{105, 4, 110, 112, false},
+
+		// The segment has no width.
+		{105, 0, 100, 102, false},
+		{105, 0, 101, 103, false},
+		{105, 0, 102, 104, false},
+		{105, 0, 103, 105, true},
+		{105, 0, 104, 106, true},
+		{105, 0, 105, 107, true},
+		{105, 0, 106, 108, false},
+		{105, 0, 107, 109, false},
+
+		// The receive window has no width.
+		{105, 2, 103, 103, false},
+		{105, 2, 104, 104, false},
+		{105, 2, 105, 105, false},
+		{105, 2, 106, 106, false},
+		{105, 2, 107, 107, false},
+		{105, 2, 108, 108, false},
+		{105, 2, 109, 109, false},
+	} {
+		if got := header.Acceptable(tt.segSeq, tt.segLen, tt.rcvNxt, tt.rcvAcc); got != tt.want {
+			t.Errorf("header.Acceptable(%d, %d, %d, %d) = %t, want %t", tt.segSeq, tt.segLen, tt.rcvNxt, tt.rcvAcc, got, tt.want)
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 1c10da5ca..0280892a8 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -18,7 +18,6 @@ import (
 	"sync/atomic"
 	"time"
 
-	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
@@ -36,6 +35,7 @@ type segment struct {
 	id     stack.TransportEndpointID `state:"manual"`
 	route  stack.Route               `state:"manual"`
 	data   buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
+	hdr    header.TCP
 	// views is used as buffer for data when its length is large
 	// enough to store a VectorisedView.
 	views [8]buffer.View `state:"nosave"`
@@ -56,18 +56,19 @@ type segment struct {
 	options        []byte `state:".([]byte)"`
 	hasNewSACKInfo bool
 	rcvdTime       time.Time `state:".(unixTime)"`
-	// xmitTime is the last transmit time of this segment. A zero value
-	// indicates that the segment has yet to be transmitted.
-	xmitTime time.Time `state:".(unixTime)"`
+	// xmitTime is the last transmit time of this segment.
+	xmitTime  time.Time `state:".(unixTime)"`
+	xmitCount uint32
 }
 
-func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) *segment {
+func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) *segment {
 	s := &segment{
 		refCnt: 1,
 		id:     id,
 		route:  r.Clone(),
 	}
 	s.data = pkt.Data.Clone(s.views[:])
+	s.hdr = header.TCP(pkt.TransportHeader)
 	s.rcvdTime = time.Now()
 	return s
 }
@@ -78,9 +79,11 @@ func newSegmentFromView(r *stack.Route, id stack.TransportEndpointID, v buffer.V
 		id:     id,
 		route:  r.Clone(),
 	}
-	s.views[0] = v
-	s.data = buffer.NewVectorisedView(len(v), s.views[:1])
 	s.rcvdTime = time.Now()
+	if len(v) != 0 {
+		s.views[0] = v
+		s.data = buffer.NewVectorisedView(len(v), s.views[:1])
+	}
 	return s
 }
 
@@ -95,6 +98,8 @@ func (s *segment) clone() *segment {
 		route:          s.route.Clone(),
 		viewToDeliver:  s.viewToDeliver,
 		rcvdTime:       s.rcvdTime,
+		xmitTime:       s.xmitTime,
+		xmitCount:      s.xmitCount,
 	}
 	t.data = s.data.Clone(t.views[:])
 	return t
@@ -143,8 +148,6 @@ func (s *segment) logicalLen() seqnum.Size {
 // TCP checksum and stores the checksum and result of checksum verification in
 // the csum and csumValid fields of the segment.
 func (s *segment) parse() bool {
-	h := header.TCP(s.data.First())
-
 	// h is the header followed by the payload. We check that the offset to
 	// the data respects the following constraints:
 	// 1. That it's at least the minimum header size; if we don't do this
@@ -155,12 +158,12 @@ func (s *segment) parse() bool {
 	// N.B. The segment has already been validated as having at least the
 	//      minimum TCP size before reaching here, so it's safe to read the
 	//      fields.
-	offset := int(h.DataOffset())
-	if offset < header.TCPMinimumSize || offset > len(h) {
+	offset := int(s.hdr.DataOffset())
+	if offset < header.TCPMinimumSize || offset > len(s.hdr) {
 		return false
 	}
 
-	s.options = []byte(h[header.TCPMinimumSize:offset])
+	s.options = []byte(s.hdr[header.TCPMinimumSize:])
 	s.parsedOptions = header.ParseTCPOptions(s.options)
 
 	// Query the link capabilities to decide if checksum validation is
@@ -169,21 +172,19 @@ func (s *segment) parse() bool {
 	if s.route.Capabilities()&stack.CapabilityRXChecksumOffload != 0 {
 		s.csumValid = true
 		verifyChecksum = false
-		s.data.TrimFront(offset)
 	}
 	if verifyChecksum {
-		s.csum = h.Checksum()
-		xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()))
-		xsum = h.CalculateChecksum(xsum)
-		s.data.TrimFront(offset)
+		s.csum = s.hdr.Checksum()
+		xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()+len(s.hdr)))
+		xsum = s.hdr.CalculateChecksum(xsum)
 		xsum = header.ChecksumVV(s.data, xsum)
 		s.csumValid = xsum == 0xffff
 	}
 
-	s.sequenceNumber = seqnum.Value(h.SequenceNumber())
-	s.ackNumber = seqnum.Value(h.AckNumber())
-	s.flags = h.Flags()
-	s.window = seqnum.Size(h.WindowSize())
+	s.sequenceNumber = seqnum.Value(s.hdr.SequenceNumber())
+	s.ackNumber = seqnum.Value(s.hdr.AckNumber())
+	s.flags = s.hdr.Flags()
+	s.window = seqnum.Size(s.hdr.WindowSize())
 	return true
 }
 
diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go
index 9fd061d7d..8d3ddce4b 100644
--- a/pkg/tcpip/transport/tcp/segment_heap.go
+++ b/pkg/tcpip/transport/tcp/segment_heap.go
@@ -14,21 +14,25 @@
 
 package tcp
 
+import "container/heap"
+
 type segmentHeap []*segment
 
+var _ heap.Interface = (*segmentHeap)(nil)
+
 // Len returns the length of h.
-func (h segmentHeap) Len() int {
-	return len(h)
+func (h *segmentHeap) Len() int {
+	return len(*h)
 }
 
 // Less determines whether the i-th element of h is less than the j-th element.
-func (h segmentHeap) Less(i, j int) bool {
-	return h[i].sequenceNumber.LessThan(h[j].sequenceNumber)
+func (h *segmentHeap) Less(i, j int) bool {
+	return (*h)[i].sequenceNumber.LessThan((*h)[j].sequenceNumber)
 }
 
 // Swap swaps the i-th and j-th elements of h.
-func (h segmentHeap) Swap(i, j int) {
-	h[i], h[j] = h[j], h[i]
+func (h *segmentHeap) Swap(i, j int) {
+	(*h)[i], (*h)[j] = (*h)[j], (*h)[i]
 }
 
 // Push adds x as the last element of h.
@@ -41,6 +45,7 @@ func (h *segmentHeap) Pop() interface{} {
 	old := *h
 	n := len(old)
 	x := old[n-1]
+	old[n-1] = nil
 	*h = old[:n-1]
 	return x
 }
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index bd20a7ee9..48a257137 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -28,10 +28,16 @@ type segmentQueue struct {
 	used  int
 }
 
+// emptyLocked determines if the queue is empty.
+// Preconditions: q.mu must be held.
+func (q *segmentQueue) emptyLocked() bool {
+	return q.used == 0
+}
+
 // empty determines if the queue is empty.
 func (q *segmentQueue) empty() bool {
 	q.mu.Lock()
-	r := q.used == 0
+	r := q.emptyLocked()
 	q.mu.Unlock()
 
 	return r
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index b74b61e7d..acacb42e4 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -15,6 +15,7 @@
 package tcp
 
 import (
+	"fmt"
 	"math"
 	"sync/atomic"
 	"time"
@@ -40,6 +41,11 @@ const (
 	// nDupAckThreshold is the number of duplicate ACK's required
 	// before fast-retransmit is entered.
 	nDupAckThreshold = 3
+
+	// MaxRetries is the maximum number of probe retries sender does
+	// before timing out the connection.
+	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
+	MaxRetries = 15
 )
 
 // ccState indicates the current congestion control state for this sender.
@@ -126,10 +132,6 @@ type sender struct {
 	// sndNxt is the sequence number of the next segment to be sent.
 	sndNxt seqnum.Value
 
-	// sndNxtList is the sequence number of the next segment to be added to
-	// the send list.
-	sndNxtList seqnum.Value
-
 	// rttMeasureSeqNum is the sequence number being used for the latest RTT
 	// measurement.
 	rttMeasureSeqNum seqnum.Value
@@ -141,6 +143,14 @@ type sender struct {
 	// the first segment that was retransmitted due to RTO expiration.
 	firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"`
 
+	// zeroWindowProbing is set if the sender is currently probing
+	// for zero receive window.
+	zeroWindowProbing bool `state:"nosave"`
+
+	// unackZeroWindowProbes is the number of unacknowledged zero
+	// window probes.
+	unackZeroWindowProbes uint32 `state:"nosave"`
+
 	closed      bool
 	writeNext   *segment
 	writeList   segmentList
@@ -153,6 +163,15 @@ type sender struct {
 	rtt rtt
 	rto time.Duration
 
+	// minRTO is the minimum permitted value for sender.rto.
+	minRTO time.Duration
+
+	// maxRTO is the maximum permitted value for sender.rto.
+	maxRTO time.Duration
+
+	// maxRetries is the maximum permitted retransmissions.
+	maxRetries uint32
+
 	// maxPayloadSize is the maximum size of the payload of a given segment.
 	// It is initialized on demand.
 	maxPayloadSize int
@@ -229,7 +248,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 		sndWnd:           sndWnd,
 		sndUna:           iss + 1,
 		sndNxt:           iss + 1,
-		sndNxtList:       iss + 1,
 		rto:              1 * time.Second,
 		rttMeasureSeqNum: iss + 1,
 		lastSendTime:     time.Now(),
@@ -265,6 +283,25 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 	// etc.
 	s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss)
 
+	// Get Stack wide config.
+	var minRTO tcpip.TCPMinRTOOption
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
+		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
+	}
+	s.minRTO = time.Duration(minRTO)
+
+	var maxRTO tcpip.TCPMaxRTOOption
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
+		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
+	}
+	s.maxRTO = time.Duration(maxRTO)
+
+	var maxRetries tcpip.TCPMaxRetriesOption
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
+		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
+	}
+	s.maxRetries = uint32(maxRetries)
+
 	return s
 }
 
@@ -399,8 +436,8 @@ func (s *sender) updateRTO(rtt time.Duration) {
 
 	s.rto = s.rtt.srtt + 4*s.rtt.rttvar
 	s.rtt.Unlock()
-	if s.rto < MinRTO {
-		s.rto = MinRTO
+	if s.rto < s.minRTO {
+		s.rto = s.minRTO
 	}
 }
 
@@ -455,9 +492,7 @@ func (s *sender) retransmitTimerExpired() bool {
 	// Give up if we've waited more than a minute since the last resend or
 	// if a user time out is set and we have exceeded the user specified
 	// timeout since the first retransmission.
-	s.ep.mu.RLock()
 	uto := s.ep.userTimeout
-	s.ep.mu.RUnlock()
 
 	if s.firstRetransmittedSegXmitTime.IsZero() {
 		// We store the original xmitTime of the segment that we are
@@ -469,19 +504,26 @@ func (s *sender) retransmitTimerExpired() bool {
 	}
 
 	elapsed := time.Since(s.firstRetransmittedSegXmitTime)
-	remaining := MaxRTO
+	remaining := s.maxRTO
 	if uto != 0 {
 		// Cap to the user specified timeout if one is specified.
 		remaining = uto - elapsed
 	}
 
-	if remaining <= 0 || s.rto >= MaxRTO {
+	// Always honor the user-timeout irrespective of whether the zero
+	// window probes were acknowledged.
+	// net/ipv4/tcp_timer.c::tcp_probe_timer()
+	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
 		return false
 	}
 
 	// Set new timeout. The timer will be restarted by the call to sendData
 	// below.
 	s.rto *= 2
+	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
+	if s.rto > s.maxRTO {
+		s.rto = s.maxRTO
+	}
 
 	// Cap RTO to remaining time.
 	if s.rto > remaining {
@@ -529,6 +571,26 @@ func (s *sender) retransmitTimerExpired() bool {
 	// information is usable after an RTO.
 	s.ep.scoreboard.Reset()
 	s.writeNext = s.writeList.Front()
+
+	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
+	// zero receive window after retransmission interval and we have data to
+	// send.
+	if s.zeroWindowProbing {
+		s.sendZeroWindowProbe()
+		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
+		// indefinitely.  As long as the receiving TCP continues to send
+		// acknowledgments in response to the probe segments, the sending TCP
+		// MUST allow the connection to stay open.
+		return true
+	}
+
+	seg := s.writeNext
+	// RFC 1122 4.2.3.5: Close the connection when the number of
+	// retransmissions for this segment is beyond a limit.
+	if seg != nil && seg.xmitCount > s.maxRetries {
+		return false
+	}
+
 	s.sendData()
 
 	return true
@@ -556,25 +618,51 @@ func (s *sender) splitSeg(seg *segment, size int) {
 	nSeg.data.TrimFront(size)
 	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
 	s.writeList.InsertAfter(seg, nSeg)
+
+	// The segment being split does not carry PUSH flag because it is
+	// followed by the newly split segment.
+	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
+	// segment (i.e., when there is no more queued data to be sent).
+	// Linux removes PSH flag only when the segment is being split over MSS
+	// and retains it when we are splitting the segment over lack of sender
+	// window space.
+	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
+	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
+	if seg.data.Size() > s.maxPayloadSize {
+		seg.flags ^= header.TCPFlagPsh
+	}
+
 	seg.data.CapLength(size)
 }
 
-// NextSeg implements the RFC6675 NextSeg() operation. It returns segments that
-// match rule 1, 3 and 4 of the NextSeg() operation defined in RFC6675. Rule 2
-// is handled by the normal send logic.
-func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
+// NextSeg implements the RFC6675 NextSeg() operation.
+//
+// NextSeg starts scanning the writeList starting from nextSegHint and returns
+// the hint to be passed on the next call to NextSeg. This is required to avoid
+// iterating the write list repeatedly when NextSeg is invoked in a loop during
+// recovery. The returned hint will be nil if there are no more segments that
+// can match rules defined by NextSeg operation in RFC6675.
+//
+// rescueRtx will be true only if nextSeg is a rescue retransmission as
+// described by Step 4) of the NextSeg algorithm.
+func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
 	var s3 *segment
 	var s4 *segment
-	smss := s.ep.scoreboard.SMSS()
 	// Step 1.
-	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
-		if !s.isAssignedSequenceNumber(seg) {
+	for seg := nextSegHint; seg != nil; seg = seg.Next() {
+		// Stop iteration if we hit a segment that has never been
+		// transmitted (i.e. either it has no assigned sequence number
+		// or if it does have one, it's >= the next sequence number
+		// to be sent [i.e. >= s.sndNxt]).
+		if !s.isAssignedSequenceNumber(seg) || s.sndNxt.LessThanEq(seg.sequenceNumber) {
+			hint = nil
 			break
 		}
 		segSeq := seg.sequenceNumber
-		if seg.data.Size() > int(smss) {
+		if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) {
 			s.splitSeg(seg, int(smss))
 		}
+
 		// See RFC 6675 Section 4
 		//
 		//     1. If there exists a smallest unSACKED sequence number
@@ -591,8 +679,9 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
 				// NextSeg():
 				//     (1.c) IsLost(S2) returns true.
 				if s.ep.scoreboard.IsLost(segSeq) {
-					return seg, s3, s4
+					return seg, seg.Next(), false
 				}
+
 				// NextSeg():
 				//
 				// (3): If the conditions for rules (1) and (2)
@@ -604,6 +693,7 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
 				// SHOULD be returned.
 				if s3 == nil {
 					s3 = seg
+					hint = seg.Next()
 				}
 			}
 			// NextSeg():
@@ -612,10 +702,12 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
 			//     but there exists outstanding unSACKED data, we
 			//     provide the opportunity for a single "rescue"
 			//     retransmission per entry into loss recovery. If
-			//     HighACK is greater than RescueRxt, the one
-			//     segment of upto SMSS octects that MUST include
-			//     the highest outstanding unSACKed sequence number
-			//     SHOULD be returned.
+			//     HighACK is greater than RescueRxt (or RescueRxt
+			//     is undefined), then one segment of upto SMSS
+			//     octects that MUST include the highest outstanding
+			//     unSACKed sequence number SHOULD be returned, and
+			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
+			//     be updated.
 			if s.fr.rescueRxt.LessThan(s.sndUna - 1) {
 				if s4 != nil {
 					if s4.sequenceNumber.LessThan(segSeq) {
@@ -624,12 +716,31 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
 				} else {
 					s4 = seg
 				}
-				s.fr.rescueRxt = s.fr.last
 			}
 		}
 	}
 
-	return nil, s3, s4
+	// If we got here then no segment matched step (1).
+	// Step (2): "If no sequence number 'S2' per rule (1)
+	// exists but there exists available unsent data and the
+	// receiver's advertised window allows, the sequence
+	// range of one segment of up to SMSS octets of
+	// previously unsent data starting with sequence number
+	// HighData+1 MUST be returned."
+	for seg := s.writeNext; seg != nil; seg = seg.Next() {
+		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) {
+			continue
+		}
+		// We do not split the segment here to <= smss as it has
+		// potentially not been assigned a sequence number yet.
+		return seg, nil, false
+	}
+
+	if s3 != nil {
+		return s3, hint, false
+	}
+
+	return s4, nil, true
 }
 
 // maybeSendSegment tries to send the specified segment and either coalesces
@@ -642,7 +753,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 	if !s.isAssignedSequenceNumber(seg) {
 		// Merge segments if allowed.
 		if seg.data.Size() != 0 {
-			available := int(seg.sequenceNumber.Size(end))
+			available := int(s.sndNxt.Size(end))
 			if available > limit {
 				available = limit
 			}
@@ -685,8 +796,11 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 					//   sent all at once.
 					return false
 				}
-				if atomic.LoadUint32(&s.ep.cork) != 0 {
-					// Hold back the segment until full.
+				// With TCP_CORK, hold back until minimum of the available
+				// send space and MSS.
+				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
+				// timeout.
+				if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 {
 					return false
 				}
 			}
@@ -713,13 +827,31 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		default:
 			s.ep.setEndpointState(StateFinWait1)
 		}
-
 	} else {
 		// We're sending a non-FIN segment.
 		if seg.flags&header.TCPFlagFin != 0 {
 			panic("Netstack queues FIN segments without data.")
 		}
 
+		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
+		// If the entire segment cannot be accomodated in the receiver
+		// advertized window, skip splitting and sending of the segment.
+		// ref: net/ipv4/tcp_output.c::tcp_snd_wnd_test()
+		//
+		// Linux checks this for all segment transmits not triggered
+		// by a probe timer. On this condition, it defers the segment
+		// split and transmit to a short probe timer.
+		// ref: include/net/tcp.h::tcp_check_probe_timer()
+		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
+		//
+		// Instead of defining a new transmit timer, we attempt to split the
+		// segment right here if there are no pending segments.
+		// If there are pending segments, segment transmits are deferred
+		// to the retransmit timer handler.
+		if s.sndUna != s.sndNxt && !segEnd.LessThan(end) {
+			return false
+		}
+
 		if !seg.sequenceNumber.LessThan(end) {
 			return false
 		}
@@ -728,9 +860,17 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		if available == 0 {
 			return false
 		}
+
+		// The segment size limit is computed as a function of sender congestion
+		// window and MSS. When sender congestion window is > 1, this limit can
+		// be larger than MSS. Ensure that the currently available send space
+		// is not greater than minimum of this limit and MSS.
 		if available > limit {
 			available = limit
 		}
+		if available > s.maxPayloadSize {
+			available = s.maxPayloadSize
+		}
 
 		if seg.data.Size() > available {
 			s.splitSeg(seg, available)
@@ -754,64 +894,47 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 // section 5, step C.
 func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) {
 	s.SetPipe()
+
+	if smss := int(s.ep.scoreboard.SMSS()); limit > smss {
+		// Cap segment size limit to s.smss as SACK recovery requires
+		// that all retransmissions or new segments send during recovery
+		// be of <= SMSS.
+		limit = smss
+	}
+
+	nextSegHint := s.writeList.Front()
 	for s.outstanding < s.sndCwnd {
-		nextSeg, s3, s4 := s.NextSeg()
+		var nextSeg *segment
+		var rescueRtx bool
+		nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint)
 		if nextSeg == nil {
-			// NextSeg():
-			//
-			// Step (2): "If no sequence number 'S2' per rule (1)
-			// exists but there exists available unsent data and the
-			// receiver's advertised window allows, the sequence
-			// range of one segment of up to SMSS octets of
-			// previously unsent data starting with sequence number
-			// HighData+1 MUST be returned."
-			for seg := s.writeNext; seg != nil; seg = seg.Next() {
-				if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) {
-					continue
-				}
-				// Step C.3 described below is handled by
-				// maybeSendSegment which increments sndNxt when
-				// a segment is transmitted.
-				//
-				// Step C.3 "If any of the data octets sent in
-				// (C.1) are above HighData, HighData must be
-				// updated to reflect the transmission of
-				// previously unsent data."
-				if sent := s.maybeSendSegment(seg, limit, end); !sent {
-					break
-				}
-				dataSent = true
-				s.outstanding++
-				s.writeNext = seg.Next()
-				nextSeg = seg
-				break
-			}
-			if nextSeg != nil {
-				continue
-			}
-		}
-		rescueRtx := false
-		if nextSeg == nil && s3 != nil {
-			nextSeg = s3
-		}
-		if nextSeg == nil && s4 != nil {
-			nextSeg = s4
-			rescueRtx = true
-		}
-		if nextSeg == nil {
-			break
+			return dataSent
 		}
-		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
-		if !rescueRtx && nextSeg.sequenceNumber.LessThan(s.sndNxt) {
-			// RFC 6675, Step C.2
+		if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) {
+			// New data being sent.
+
+			// Step C.3 described below is handled by
+			// maybeSendSegment which increments sndNxt when
+			// a segment is transmitted.
 			//
-			// "If any of the data octets sent in (C.1) are below
-			// HighData, HighRxt MUST be set to the highest sequence
-			// number of the retransmitted segment unless NextSeg ()
-			// rule (4) was invoked for this retransmission."
-			s.fr.highRxt = segEnd - 1
+			// Step C.3 "If any of the data octets sent in
+			// (C.1) are above HighData, HighData must be
+			// updated to reflect the transmission of
+			// previously unsent data."
+			//
+			// We pass s.smss as the limit as the Step 2) requires that
+			// new data sent should be of size s.smss or less.
+			if sent := s.maybeSendSegment(nextSeg, limit, end); !sent {
+				return dataSent
+			}
+			dataSent = true
+			s.outstanding++
+			s.writeNext = nextSeg.Next()
+			continue
 		}
 
+		// Now handle the retransmission case where we matched either step 1,3 or 4
+		// of the NextSeg algorithm.
 		// RFC 6675, Step C.4.
 		//
 		// "The estimate of the amount of data outstanding in the network
@@ -820,10 +943,54 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool)
 		s.outstanding++
 		dataSent = true
 		s.sendSegment(nextSeg)
+
+		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
+		if rescueRtx {
+			// We do the last part of rule (4) of NextSeg here to update
+			// RescueRxt as until this point we don't know if we are going
+			// to use the rescue transmission.
+			s.fr.rescueRxt = s.fr.last
+		} else {
+			// RFC 6675, Step C.2
+			//
+			// "If any of the data octets sent in (C.1) are below
+			// HighData, HighRxt MUST be set to the highest sequence
+			// number of the retransmitted segment unless NextSeg ()
+			// rule (4) was invoked for this retransmission."
+			s.fr.highRxt = segEnd - 1
+		}
 	}
 	return dataSent
 }
 
+func (s *sender) sendZeroWindowProbe() {
+	ack, win := s.ep.rcv.getSendParams()
+	s.unackZeroWindowProbes++
+	// Send a zero window probe with sequence number pointing to
+	// the last acknowledged byte.
+	s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.sndUna-1, ack, win)
+	// Rearm the timer to continue probing.
+	s.resendTimer.enable(s.rto)
+}
+
+func (s *sender) enableZeroWindowProbing() {
+	s.zeroWindowProbing = true
+	// We piggyback the probing on the retransmit timer with the
+	// current retranmission interval, as we may start probing while
+	// segment retransmissions.
+	if s.firstRetransmittedSegXmitTime.IsZero() {
+		s.firstRetransmittedSegXmitTime = time.Now()
+	}
+	s.resendTimer.enable(s.rto)
+}
+
+func (s *sender) disableZeroWindowProbing() {
+	s.zeroWindowProbing = false
+	s.unackZeroWindowProbes = 0
+	s.firstRetransmittedSegXmitTime = time.Time{}
+	s.resendTimer.disable()
+}
+
 // sendData sends new data segments. It is called when data becomes available or
 // when the send window opens up.
 func (s *sender) sendData() {
@@ -837,7 +1004,7 @@ func (s *sender) sendData() {
 	// "A TCP SHOULD set cwnd to no more than RW before beginning
 	// transmission if the TCP has not sent data in the interval exceeding
 	// the retrasmission timeout."
-	if !s.fr.active && time.Now().Sub(s.lastSendTime) > s.rto {
+	if !s.fr.active && s.state != RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto {
 		if s.sndCwnd > InitialCwnd {
 			s.sndCwnd = InitialCwnd
 		}
@@ -855,6 +1022,9 @@ func (s *sender) sendData() {
 				limit = cwndLimit
 			}
 			if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
+				// Move writeNext along so that we don't try and scan data that
+				// has already been SACKED.
+				s.writeNext = seg.Next()
 				continue
 			}
 			if sent := s.maybeSendSegment(seg, limit, end); !sent {
@@ -872,6 +1042,13 @@ func (s *sender) sendData() {
 		s.ep.disableKeepaliveTimer()
 	}
 
+	// If the sender has advertized zero receive window and we have
+	// data to be sent out, start zero window probing to query the
+	// the remote for it's receive window size.
+	if s.writeNext != nil && s.sndWnd == 0 {
+		s.enableZeroWindowProbing()
+	}
+
 	// Enable the timer if we have pending data and it's not enabled yet.
 	if !s.resendTimer.enabled() && s.sndUna != s.sndNxt {
 		s.resendTimer.enable(s.rto)
@@ -893,6 +1070,8 @@ func (s *sender) enterFastRecovery() {
 	s.fr.first = s.sndUna
 	s.fr.last = s.sndNxt - 1
 	s.fr.maxCwnd = s.sndCwnd + s.outstanding
+	s.fr.highRxt = s.sndUna
+	s.fr.rescueRxt = s.sndUna
 	if s.ep.sackPermitted {
 		s.state = SACKRecovery
 		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
@@ -1119,8 +1298,26 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 	// Stash away the current window size.
 	s.sndWnd = seg.window
 
-	// Ignore ack if it doesn't acknowledge any new data.
 	ack := seg.ackNumber
+
+	// Disable zero window probing if remote advertizes a non-zero receive
+	// window. This can be with an ACK to the zero window probe (where the
+	// acknumber refers to the already acknowledged byte) OR to any previously
+	// unacknowledged segment.
+	if s.zeroWindowProbing && seg.window > 0 &&
+		(ack == s.sndUna || (ack-1).InRange(s.sndUna, s.sndNxt)) {
+		s.disableZeroWindowProbing()
+	}
+
+	// On receiving the ACK for the zero window probe, account for it and
+	// skip trying to send any segment as we are still probing for
+	// receive window to become non-zero.
+	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.sndUna {
+		s.unackZeroWindowProbes--
+		return
+	}
+
+	// Ignore ack if it doesn't acknowledge any new data.
 	if (ack - 1).InRange(s.sndUna, s.sndNxt) {
 		s.dupAckCount = 0
 
@@ -1140,7 +1337,7 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 		}
 
 		// When an ack is received we must rearm the timer.
-		// RFC 6298 5.2
+		// RFC 6298 5.3
 		s.resendTimer.enable(s.rto)
 
 		// Remove all acknowledged data from the write list.
@@ -1167,6 +1364,7 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 			if s.writeNext == seg {
 				s.writeNext = seg.Next()
 			}
+
 			s.writeList.Remove(seg)
 
 			// if SACK is enabled then Only reduce outstanding if
@@ -1229,7 +1427,7 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 
 // sendSegment sends the specified segment.
 func (s *sender) sendSegment(seg *segment) *tcpip.Error {
-	if !seg.xmitTime.IsZero() {
+	if seg.xmitCount > 0 {
 		s.ep.stack.Stats().TCP.Retransmits.Increment()
 		s.ep.stats.SendErrors.Retransmits.Increment()
 		if s.sndCwnd < s.sndSsthresh {
@@ -1237,7 +1435,24 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error {
 		}
 	}
 	seg.xmitTime = time.Now()
-	return s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
+	seg.xmitCount++
+	err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
+
+	// Every time a packet containing data is sent (including a
+	// retransmission), if SACK is enabled and we are retransmitting data
+	// then use the conservative timer described in RFC6675 Section 6.0,
+	// otherwise follow the standard time described in RFC6298 Section 5.1.
+	if err != nil && seg.data.Size() != 0 {
+		if s.fr.active && seg.xmitCount > 1 && s.ep.sackPermitted {
+			s.resendTimer.enable(s.rto)
+		} else {
+			if !s.resendTimer.enabled() {
+				s.resendTimer.enable(s.rto)
+			}
+		}
+	}
+
+	return err
 }
 
 // sendSegmentFromView sends a new segment containing the given payload, flags
@@ -1253,19 +1468,5 @@ func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq
 	// Remember the max sent ack.
 	s.maxSentAck = rcvNxt
 
-	// Every time a packet containing data is sent (including a
-	// retransmission), if SACK is enabled then use the conservative timer
-	// described in RFC6675 Section 4.0, otherwise follow the standard time
-	// described in RFC6298 Section 5.2.
-	if data.Size() != 0 {
-		if s.ep.sackPermitted {
-			s.resendTimer.enable(s.rto)
-		} else {
-			if !s.resendTimer.enabled() {
-				s.resendTimer.enable(s.rto)
-			}
-		}
-	}
-
 	return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd)
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
index 782d7b42c..5fe23113b 100644
--- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 func TestFastRecovery(t *testing.T) {
@@ -40,7 +41,7 @@ func TestFastRecovery(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -86,16 +87,23 @@ func TestFastRecovery(t *testing.T) {
 	// Receive the retransmitted packet.
 	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
 
-	if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
-	}
+	// Wait before checking metrics.
+	metricPollFn := func() error {
+		if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+		}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+		if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want)
+		}
+		return nil
 	}
 
-	if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want)
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 
 	// Now send 7 mode duplicate acks. Each of these should cause a window
@@ -117,12 +125,18 @@ func TestFastRecovery(t *testing.T) {
 	// Receive the retransmit due to partial ack.
 	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
 
-	if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
-		t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+	// Wait before checking metrics.
+	metricPollFn = func() error {
+		if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
+			return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+		}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+		}
+		return nil
 	}
-
-	if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
-		t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 
 	// Receive the 10 extra packets that should have been released due to
@@ -192,7 +206,7 @@ func TestExponentialIncreaseDuringSlowStart(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -234,7 +248,7 @@ func TestCongestionAvoidance(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -338,7 +352,7 @@ func TestCubicCongestionAvoidance(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 
 	for i := range data {
@@ -447,7 +461,7 @@ func TestRetransmit(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -492,24 +506,33 @@ func TestRetransmit(t *testing.T) {
 	rtxOffset := bytesRead - maxPayload*expected
 	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
 
-	if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want)
-	}
+	metricPollFn := func() error {
+		if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
-	}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Timeouts.Value(), uint64(1); got != want {
-		t.Errorf("got EP SendErrors.Timeouts.Value = %v, want = %v", got, want)
-	}
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Timeouts.Value(), uint64(1); got != want {
+			return fmt.Errorf("got EP SendErrors.Timeouts.Value = %v, want = %v", got, want)
+		}
+
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got EP stats SendErrors.Retransmits.Value = %v, want = %v", got, want)
+		}
+
+		if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(1); got != want {
-		t.Errorf("got EP stats SendErrors.Retransmits.Value = %v, want = %v", got, want)
+		return nil
 	}
 
-	if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want)
+	// Poll when checking metrics.
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 
 	// Acknowledge half of the pending data.
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
index afea124ec..ace79b7b2 100644
--- a/pkg/tcpip/transport/tcp/tcp_sack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 // createConnectedWithSACKPermittedOption creates and connects c.ep with the
@@ -149,21 +150,22 @@ func TestSackPermittedAccept(t *testing.T) {
 		{true, false, -1, 0xffff}, // When cookie is used window scaling is disabled.
 		{false, true, 5, 0x8000},  // 0x8000 * 2^5 = 1<<20 = 1MB window (the default).
 	}
-	savedSynCountThreshold := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = savedSynCountThreshold
-	}()
+
 	for _, tc := range testCases {
 		t.Run(fmt.Sprintf("test: %#v", tc), func(t *testing.T) {
-			if tc.cookieEnabled {
-				tcp.SynRcvdCountThreshold = 0
-			} else {
-				tcp.SynRcvdCountThreshold = savedSynCountThreshold
-			}
 			for _, sackEnabled := range []bool{false, true} {
 				t.Run(fmt.Sprintf("test stack.sackEnabled: %v", sackEnabled), func(t *testing.T) {
 					c := context.New(t, defaultMTU)
 					defer c.Cleanup()
+
+					if tc.cookieEnabled {
+						// Set the SynRcvd threshold to
+						// zero to force a syn cookie
+						// based accept to happen.
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						}
+					}
 					setStackSACKPermitted(t, c, sackEnabled)
 
 					rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, SACKPermitted: tc.sackPermitted})
@@ -222,21 +224,23 @@ func TestSackDisabledAccept(t *testing.T) {
 		{true, -1, 0xffff}, // When cookie is used window scaling is disabled.
 		{false, 5, 0x8000}, // 0x8000 * 2^5 = 1<<20 = 1MB window (the default).
 	}
-	savedSynCountThreshold := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = savedSynCountThreshold
-	}()
+
 	for _, tc := range testCases {
 		t.Run(fmt.Sprintf("test: %#v", tc), func(t *testing.T) {
-			if tc.cookieEnabled {
-				tcp.SynRcvdCountThreshold = 0
-			} else {
-				tcp.SynRcvdCountThreshold = savedSynCountThreshold
-			}
 			for _, sackEnabled := range []bool{false, true} {
 				t.Run(fmt.Sprintf("test: sackEnabled: %v", sackEnabled), func(t *testing.T) {
 					c := context.New(t, defaultMTU)
 					defer c.Cleanup()
+
+					if tc.cookieEnabled {
+						// Set the SynRcvd threshold to
+						// zero to force a syn cookie
+						// based accept to happen.
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						}
+					}
+
 					setStackSACKPermitted(t, c, sackEnabled)
 
 					rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS})
@@ -387,7 +391,7 @@ func TestSACKRecovery(t *testing.T) {
 	setStackSACKPermitted(t, c, true)
 	createConnectedWithSACKAndTS(c)
 
-	const iterations = 7
+	const iterations = 3
 	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
 	for i := range data {
 		data[i] = byte(i)
@@ -436,21 +440,28 @@ func TestSACKRecovery(t *testing.T) {
 	// Receive the retransmitted packet.
 	c.ReceiveAndCheckPacketWithOptions(data, rtxOffset, maxPayload, tsOptionSize)
 
-	tcpStats := c.Stack().Stats().TCP
-	stats := []struct {
-		stat *tcpip.StatCounter
-		name string
-		want uint64
-	}{
-		{tcpStats.FastRetransmit, "stats.TCP.FastRetransmit", 1},
-		{tcpStats.Retransmits, "stats.TCP.Retransmits", 1},
-		{tcpStats.SACKRecovery, "stats.TCP.SACKRecovery", 1},
-		{tcpStats.FastRecovery, "stats.TCP.FastRecovery", 0},
-	}
-	for _, s := range stats {
-		if got, want := s.stat.Value(), s.want; got != want {
-			t.Errorf("got %s.Value() = %v, want = %v", s.name, got, want)
+	metricPollFn := func() error {
+		tcpStats := c.Stack().Stats().TCP
+		stats := []struct {
+			stat *tcpip.StatCounter
+			name string
+			want uint64
+		}{
+			{tcpStats.FastRetransmit, "stats.TCP.FastRetransmit", 1},
+			{tcpStats.Retransmits, "stats.TCP.Retransmits", 1},
+			{tcpStats.SACKRecovery, "stats.TCP.SACKRecovery", 1},
+			{tcpStats.FastRecovery, "stats.TCP.FastRecovery", 0},
 		}
+		for _, s := range stats {
+			if got, want := s.stat.Value(), s.want; got != want {
+				return fmt.Errorf("got %s.Value() = %v, want = %v", s.name, got, want)
+			}
+		}
+		return nil
+	}
+
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 
 	// Now send 7 mode duplicate ACKs. In SACK TCP dupAcks do not cause
@@ -514,22 +525,28 @@ func TestSACKRecovery(t *testing.T) {
 		bytesRead += maxPayload
 	}
 
-	// In SACK recovery only the first segment is fast retransmitted when
-	// entering recovery.
-	if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
-		t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
-	}
+	metricPollFn = func() error {
+		// In SACK recovery only the first segment is fast retransmitted when
+		// entering recovery.
+		if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.FastRetransmit.Value(), uint64(1); got != want {
-		t.Errorf("got EP stats SendErrors.FastRetransmit = %v, want = %v", got, want)
-	}
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.FastRetransmit.Value(), uint64(1); got != want {
+			return fmt.Errorf("got EP stats SendErrors.FastRetransmit = %v, want = %v", got, want)
+		}
 
-	if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(4); got != want {
-		t.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
-	}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(4); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
+		}
 
-	if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(4); got != want {
-		t.Errorf("got EP stats Stats.SendErrors.Retransmits = %v, want = %v", got, want)
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(4); got != want {
+			return fmt.Errorf("got EP stats Stats.SendErrors.Retransmits = %v, want = %v", got, want)
+		}
+		return nil
+	}
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 
 	c.CheckNoPacketTimeout("More packets received than expected during recovery after partial ack for this cwnd.", 50*time.Millisecond)
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index cc118c993..6ef32a1b3 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -209,8 +210,15 @@ func TestTCPResetsSentIncrement(t *testing.T) {
 	c.SendPacket(nil, ackHeaders)
 
 	c.GetPacket()
-	if got := stats.TCP.ResetsSent.Value(); got != want {
-		t.Errorf("got stats.TCP.ResetsSent.Value() = %v, want = %v", got, want)
+
+	metricPollFn := func() error {
+		if got := stats.TCP.ResetsSent.Value(); got != want {
+			return fmt.Errorf("got stats.TCP.ResetsSent.Value() = %v, want = %v", got, want)
+		}
+		return nil
+	}
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
 	}
 }
 
@@ -284,7 +292,7 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	// are released instantly on Close.
 	tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond)
 	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil {
-		t.Fatalf("e.stack.SetTransportProtocolOption(%d, %s) = %s", tcp.ProtocolNumber, tcpTW, err)
+		t.Fatalf("e.stack.SetTransportProtocolOption(%d, %v) = %v", tcp.ProtocolNumber, tcpTW, err)
 	}
 
 	c.EP.Close()
@@ -543,8 +551,9 @@ func TestCurrentConnectedIncrement(t *testing.T) {
 		),
 	)
 
-	// Wait for the TIME-WAIT state to transition to CLOSED.
-	time.Sleep(1 * time.Second)
+	// Wait for a little more than the TIME-WAIT duration for the socket to
+	// transition to CLOSED state.
+	time.Sleep(1200 * time.Millisecond)
 
 	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
 		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
@@ -589,6 +598,10 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 		),
 	)
 
+	// Give the stack a few ms to transition the endpoint out of ESTABLISHED
+	// state.
+	time.Sleep(10 * time.Millisecond)
+
 	if got, want := tcp.EndpointState(ep.State()), tcp.StateCloseWait; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
@@ -727,7 +740,7 @@ func TestUserSuppliedMSSOnConnectV4(t *testing.T) {
 	const maxMSS = mtu - header.IPv4MinimumSize - header.TCPMinimumSize
 	tests := []struct {
 		name   string
-		setMSS uint16
+		setMSS int
 		expMSS uint16
 	}{
 		{
@@ -755,15 +768,14 @@ func TestUserSuppliedMSSOnConnectV4(t *testing.T) {
 			c.Create(-1)
 
 			// Set the MSS socket option.
-			opt := tcpip.MaxSegOption(test.setMSS)
-			if err := c.EP.SetSockOpt(opt); err != nil {
-				t.Fatalf("SetSockOpt(%#v) failed: %s", opt, err)
+			if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, test.setMSS); err != nil {
+				t.Fatalf("SetSockOptInt(MaxSegOption, %d) failed: %s", test.setMSS, err)
 			}
 
 			// Get expected window size.
 			rcvBufSize, err := c.EP.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
 			if err != nil {
-				t.Fatalf("GetSockOpt(%v) failed: %s", tcpip.ReceiveBufferSizeOption, err)
+				t.Fatalf("GetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
 			}
 			ws := tcp.FindWndScale(seqnum.Size(rcvBufSize))
 
@@ -817,15 +829,14 @@ func TestUserSuppliedMSSOnConnectV6(t *testing.T) {
 			c.CreateV6Endpoint(true)
 
 			// Set the MSS socket option.
-			opt := tcpip.MaxSegOption(test.setMSS)
-			if err := c.EP.SetSockOpt(opt); err != nil {
-				t.Fatalf("SetSockOpt(%#v) failed: %s", opt, err)
+			if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil {
+				t.Fatalf("SetSockOptInt(MaxSegOption, %d) failed: %s", test.setMSS, err)
 			}
 
 			// Get expected window size.
 			rcvBufSize, err := c.EP.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
 			if err != nil {
-				t.Fatalf("GetSockOpt(%v) failed: %s", tcpip.ReceiveBufferSizeOption, err)
+				t.Fatalf("GetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
 			}
 			ws := tcp.FindWndScale(seqnum.Size(rcvBufSize))
 
@@ -1031,8 +1042,8 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
 		checker.SeqNum(200)))
 }
 
-// TestListenShutdown tests for the listening endpoint not processing
-// any receive when it is on read shutdown.
+// TestListenShutdown tests for the listening endpoint replying with RST
+// on read shutdown.
 func TestListenShutdown(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -1043,7 +1054,7 @@ func TestListenShutdown(t *testing.T) {
 		t.Fatal("Bind failed:", err)
 	}
 
-	if err := c.EP.Listen(10 /* backlog */); err != nil {
+	if err := c.EP.Listen(1 /* backlog */); err != nil {
 		t.Fatal("Listen failed:", err)
 	}
 
@@ -1051,9 +1062,6 @@ func TestListenShutdown(t *testing.T) {
 		t.Fatal("Shutdown failed:", err)
 	}
 
-	// Wait for the endpoint state to be propagated.
-	time.Sleep(10 * time.Millisecond)
-
 	c.SendPacket(nil, &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: context.StackPort,
@@ -1062,7 +1070,49 @@ func TestListenShutdown(t *testing.T) {
 		AckNum:  200,
 	})
 
-	c.CheckNoPacket("Packet received when listening socket was shutdown")
+	// Expect the listening endpoint to reset the connection.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+		))
+}
+
+// TestListenCloseWhileConnect tests for the listening endpoint to
+// drain the accept-queue when closed. This should reset all of the
+// pending connections that are waiting to be accepted.
+func TestListenCloseWhileConnect(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(1 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventIn)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+	// Wait for the new endpoint created because of handshake to be delivered
+	// to the listening endpoint's accept queue.
+	<-notifyCh
+
+	// Close the listening endpoint.
+	c.EP.Close()
+
+	// Expect the listening endpoint to reset the connection.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+		))
 }
 
 func TestTOSV4(t *testing.T) {
@@ -1076,17 +1126,17 @@ func TestTOSV4(t *testing.T) {
 	c.EP = ep
 
 	const tos = 0xC0
-	if err := c.EP.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-		t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv4TOSOption(tos), err)
+	if err := c.EP.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil {
+		t.Errorf("SetSockOptInt(IPv4TOSOption, %d) failed: %s", tos, err)
 	}
 
-	var v tcpip.IPv4TOSOption
-	if err := c.EP.GetSockOpt(&v); err != nil {
-		t.Errorf("GetSockopt failed: %s", err)
+	v, err := c.EP.GetSockOptInt(tcpip.IPv4TOSOption)
+	if err != nil {
+		t.Errorf("GetSockoptInt(IPv4TOSOption) failed: %s", err)
 	}
 
-	if want := tcpip.IPv4TOSOption(tos); v != want {
-		t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+	if v != tos {
+		t.Errorf("got GetSockOptInt(IPv4TOSOption) = %d, want = %d", v, tos)
 	}
 
 	testV4Connect(t, c, checker.TOS(tos, 0))
@@ -1124,17 +1174,17 @@ func TestTrafficClassV6(t *testing.T) {
 	c.CreateV6Endpoint(false)
 
 	const tos = 0xC0
-	if err := c.EP.SetSockOpt(tcpip.IPv6TrafficClassOption(tos)); err != nil {
-		t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv6TrafficClassOption(tos), err)
+	if err := c.EP.SetSockOptInt(tcpip.IPv6TrafficClassOption, tos); err != nil {
+		t.Errorf("SetSockOpInt(IPv6TrafficClassOption, %d) failed: %s", tos, err)
 	}
 
-	var v tcpip.IPv6TrafficClassOption
-	if err := c.EP.GetSockOpt(&v); err != nil {
-		t.Fatalf("GetSockopt failed: %s", err)
+	v, err := c.EP.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+	if err != nil {
+		t.Fatalf("GetSockoptInt(IPv6TrafficClassOption) failed: %s", err)
 	}
 
-	if want := tcpip.IPv6TrafficClassOption(tos); v != want {
-		t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+	if v != tos {
+		t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = %d, want = %d", v, tos)
 	}
 
 	// Test the connection request.
@@ -1710,7 +1760,7 @@ func TestNoWindowShrinking(t *testing.T) {
 	c.CreateConnected(789, 30000, 10)
 
 	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 5); err != nil {
-		t.Fatalf("SetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %v", err)
 	}
 
 	we, ch := waiter.NewChannelEntry(nil)
@@ -1850,7 +1900,7 @@ func TestZeroWindowSend(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	c.CreateConnected(789, 0, -1 /* epRcvBuf */)
+	c.CreateConnected(789 /* iss */, 0 /* rcvWnd */, -1 /* epRcvBuf */)
 
 	data := []byte{1, 2, 3}
 	view := buffer.NewView(len(data))
@@ -1861,8 +1911,17 @@ func TestZeroWindowSend(t *testing.T) {
 		t.Fatalf("Write failed: %v", err)
 	}
 
-	// Since the window is currently zero, check that no packet is received.
-	c.CheckNoPacket("Packet received when window is zero")
+	// Check if we got a zero-window probe.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
 
 	// Open up the window. Data should be received now.
 	c.SendPacket(nil, &context.Headers{
@@ -1875,7 +1934,7 @@ func TestZeroWindowSend(t *testing.T) {
 	})
 
 	// Check that data is received.
-	b := c.GetPacket()
+	b = c.GetPacket()
 	checker.IPv4(t, b,
 		checker.PayloadLen(len(data)+header.TCPMinimumSize),
 		checker.TCP(
@@ -1983,7 +2042,7 @@ func TestScaledWindowAccept(t *testing.T) {
 
 	// Set the window size greater than the maximum non-scaled window.
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil {
-		t.Fatalf("SetSockOpt failed failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed failed: %v", err)
 	}
 
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
@@ -2056,7 +2115,7 @@ func TestNonScaledWindowAccept(t *testing.T) {
 
 	// Set the window size greater than the maximum non-scaled window.
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil {
-		t.Fatalf("SetSockOpt failed failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed failed: %v", err)
 	}
 
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
@@ -2220,10 +2279,10 @@ func TestSegmentMerging(t *testing.T) {
 		{
 			"cork",
 			func(ep tcpip.Endpoint) {
-				ep.SetSockOpt(tcpip.CorkOption(1))
+				ep.SetSockOptBool(tcpip.CorkOption, true)
 			},
 			func(ep tcpip.Endpoint) {
-				ep.SetSockOpt(tcpip.CorkOption(0))
+				ep.SetSockOptBool(tcpip.CorkOption, false)
 			},
 		},
 	}
@@ -2235,9 +2294,18 @@ func TestSegmentMerging(t *testing.T) {
 
 			c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-			// Prevent the endpoint from processing packets.
-			test.stop(c.EP)
+			// Send tcp.InitialCwnd number of segments to fill up
+			// InitialWindow but don't ACK. That should prevent
+			// anymore packets from going out.
+			for i := 0; i < tcp.InitialCwnd; i++ {
+				view := buffer.NewViewFromBytes([]byte{0})
+				if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+					t.Fatalf("Write #%d failed: %s", i+1, err)
+				}
+			}
 
+			// Now send the segments that should get merged as the congestion
+			// window is full and we won't be able to send any more packets.
 			var allData []byte
 			for i, data := range [][]byte{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
 				allData = append(allData, data...)
@@ -2247,8 +2315,29 @@ func TestSegmentMerging(t *testing.T) {
 				}
 			}
 
-			// Let the endpoint process the segments that we just sent.
-			test.resume(c.EP)
+			// Check that we get tcp.InitialCwnd packets.
+			for i := 0; i < tcp.InitialCwnd; i++ {
+				b := c.GetPacket()
+				checker.IPv4(t, b,
+					checker.PayloadLen(header.TCPMinimumSize+1),
+					checker.TCP(
+						checker.DstPort(context.TestPort),
+						checker.SeqNum(uint32(c.IRS)+uint32(i)+1),
+						checker.AckNum(790),
+						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+					),
+				)
+			}
+
+			// Acknowledge the data.
+			c.SendPacket(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: c.Port,
+				Flags:   header.TCPFlagAck,
+				SeqNum:  790,
+				AckNum:  c.IRS.Add(1 + 10), // 10 for the 10 bytes of payload.
+				RcvWnd:  30000,
+			})
 
 			// Check that data is received.
 			b := c.GetPacket()
@@ -2256,7 +2345,7 @@ func TestSegmentMerging(t *testing.T) {
 				checker.PayloadLen(len(allData)+header.TCPMinimumSize),
 				checker.TCP(
 					checker.DstPort(context.TestPort),
-					checker.SeqNum(uint32(c.IRS)+1),
+					checker.SeqNum(uint32(c.IRS)+11),
 					checker.AckNum(790),
 					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
 				),
@@ -2272,7 +2361,7 @@ func TestSegmentMerging(t *testing.T) {
 				DstPort: c.Port,
 				Flags:   header.TCPFlagAck,
 				SeqNum:  790,
-				AckNum:  c.IRS.Add(1 + seqnum.Size(len(allData))),
+				AckNum:  c.IRS.Add(11 + seqnum.Size(len(allData))),
 				RcvWnd:  30000,
 			})
 		})
@@ -2285,7 +2374,7 @@ func TestDelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOptInt(tcpip.DelayOption, 1)
+	c.EP.SetSockOptBool(tcpip.DelayOption, true)
 
 	var allData []byte
 	for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
@@ -2333,7 +2422,7 @@ func TestUndelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOptInt(tcpip.DelayOption, 1)
+	c.EP.SetSockOptBool(tcpip.DelayOption, true)
 
 	allData := [][]byte{{0}, {1, 2, 3}}
 	for i, data := range allData {
@@ -2366,7 +2455,7 @@ func TestUndelay(t *testing.T) {
 	// Check that we don't get the second packet yet.
 	c.CheckNoPacketTimeout("delayed second packet transmitted", 100*time.Millisecond)
 
-	c.EP.SetSockOptInt(tcpip.DelayOption, 0)
+	c.EP.SetSockOptBool(tcpip.DelayOption, false)
 
 	// Check that data is received.
 	second := c.GetPacket()
@@ -2403,8 +2492,8 @@ func TestMSSNotDelayed(t *testing.T) {
 		fn   func(tcpip.Endpoint)
 	}{
 		{"no-op", func(tcpip.Endpoint) {}},
-		{"delay", func(ep tcpip.Endpoint) { ep.SetSockOptInt(tcpip.DelayOption, 1) }},
-		{"cork", func(ep tcpip.Endpoint) { ep.SetSockOpt(tcpip.CorkOption(1)) }},
+		{"delay", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.DelayOption, true) }},
+		{"cork", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.CorkOption, true) }},
 	}
 
 	for _, test := range tests {
@@ -2545,12 +2634,12 @@ func TestSetTTL(t *testing.T) {
 				t.Fatalf("NewEndpoint failed: %v", err)
 			}
 
-			if err := c.EP.SetSockOpt(tcpip.TTLOption(wantTTL)); err != nil {
-				t.Fatalf("SetSockOpt failed: %v", err)
+			if err := c.EP.SetSockOptInt(tcpip.TTLOption, int(wantTTL)); err != nil {
+				t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", wantTTL, err)
 			}
 
 			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
-				t.Fatalf("Unexpected return value from Connect: %v", err)
+				t.Fatalf("Unexpected return value from Connect: %s", err)
 			}
 
 			// Receive SYN packet.
@@ -2590,7 +2679,7 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) {
 	// window scaling option.
 	const rcvBufferSize = 0x20000
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
-		t.Fatalf("SetSockOpt failed failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed failed: %s", rcvBufferSize, err)
 	}
 
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
@@ -2636,26 +2725,24 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 
 	// Set the SynRcvd threshold to zero to force a syn cookie based accept
 	// to happen.
-	saved := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = saved
-	}()
-	tcp.SynRcvdCountThreshold = 0
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+	}
 
 	// Create EP and start listening.
 	wq := &waiter.Queue{}
 	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
 	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
+		t.Fatalf("NewEndpoint failed: %s", err)
 	}
 	defer ep.Close()
 
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
-		t.Fatalf("Bind failed: %v", err)
+		t.Fatalf("Bind failed: %s", err)
 	}
 
 	if err := ep.Listen(10); err != nil {
-		t.Fatalf("Listen failed: %v", err)
+		t.Fatalf("Listen failed: %s", err)
 	}
 
 	// Do 3-way handshake.
@@ -2673,7 +2760,7 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 		case <-ch:
 			c.EP, _, err = ep.Accept()
 			if err != nil {
-				t.Fatalf("Accept failed: %v", err)
+				t.Fatalf("Accept failed: %s", err)
 			}
 
 		case <-time.After(1 * time.Second):
@@ -2734,7 +2821,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
 	const rcvBufferSize = 0x20000
 	const wndScale = 2
 	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
-		t.Fatalf("SetSockOpt failed failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed failed: %s", rcvBufferSize, err)
 	}
 
 	// Start connection attempt.
@@ -2907,6 +2994,101 @@ func TestSendOnResetConnection(t *testing.T) {
 	}
 }
 
+// TestMaxRetransmitsTimeout tests if the connection is timed out after
+// a segment has been retransmitted MaxRetries times.
+func TestMaxRetransmitsTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	const numRetries = 2
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRetriesOption(numRetries)); err != nil {
+		t.Fatalf("could not set protocol option MaxRetries.\n")
+	}
+
+	c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
+
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventHUp)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	_, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{})
+	if err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+
+	// Expect first transmit and MaxRetries retransmits.
+	for i := 0; i < numRetries+1; i++ {
+		checker.IPv4(t, c.GetPacket(),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.TCPFlags(header.TCPFlagAck|header.TCPFlagPsh),
+			),
+		)
+	}
+	// Wait for the connection to timeout after MaxRetries retransmits.
+	initRTO := 1 * time.Second
+	select {
+	case <-notifyCh:
+	case <-time.After((2 << numRetries) * initRTO):
+		t.Fatalf("connection still alive after maximum retransmits.\n")
+	}
+
+	// Send an ACK and expect a RST as the connection would have been closed.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got)
+	}
+}
+
+// TestMaxRTO tests if the retransmit interval caps to MaxRTO.
+func TestMaxRTO(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	rto := 1 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRTOOption(rto)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPMaxRTO(%d) failed: %s", rto, err)
+	}
+
+	c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
+
+	_, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{})
+	if err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+	const numRetransmits = 2
+	for i := 0; i < numRetransmits; i++ {
+		start := time.Now()
+		checker.IPv4(t, c.GetPacket(),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+			),
+		)
+		if time.Since(start).Round(time.Second).Seconds() != rto.Seconds() {
+			t.Errorf("Retransmit interval not capped to MaxRTO.\n")
+		}
+	}
+}
+
 func TestFinImmediately(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -3478,7 +3660,7 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) {
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
-	tcpbuf := vv.First()[header.IPv4MinimumSize:]
+	tcpbuf := vv.ToView()[header.IPv4MinimumSize:]
 	tcpbuf[header.TCPDataOffset] = ((header.TCPMinimumSize - 1) / 4) << 4
 
 	c.SendSegment(vv)
@@ -3505,7 +3687,7 @@ func TestReceivedIncorrectChecksumIncrement(t *testing.T) {
 		AckNum:  c.IRS.Add(1),
 		RcvWnd:  30000,
 	})
-	tcpbuf := vv.First()[header.IPv4MinimumSize:]
+	tcpbuf := vv.ToView()[header.IPv4MinimumSize:]
 	// Overwrite a byte in the payload which should cause checksum
 	// verification to fail.
 	tcpbuf[(tcpbuf[header.TCPDataOffset]>>4)*4] = 0x4
@@ -3851,26 +4033,26 @@ func TestMinMaxBufferSizes(t *testing.T) {
 
 	// Set values below the min.
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 199); err != nil {
-		t.Fatalf("GetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 199) failed: %s", err)
 	}
 
 	checkRecvBufferSize(t, ep, 200)
 
 	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 299); err != nil {
-		t.Fatalf("GetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(SendBufferSizeOption, 299) failed: %s", err)
 	}
 
 	checkSendBufferSize(t, ep, 300)
 
 	// Set values above the max.
 	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 1+tcp.DefaultReceiveBufferSize*20); err != nil {
-		t.Fatalf("GetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
 	}
 
 	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20)
 
 	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 1+tcp.DefaultSendBufferSize*30); err != nil {
-		t.Fatalf("GetSockOpt failed: %v", err)
+		t.Fatalf("SetSockOptInt(SendBufferSizeOption) failed: %s", err)
 	}
 
 	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30)
@@ -4116,11 +4298,11 @@ func TestConnectAvoidsBoundPorts(t *testing.T) {
 												case "ipv4":
 												case "ipv6":
 													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-														t.Fatalf("SetSockOpt(V6OnlyOption(true)) failed: %v", err)
+														t.Fatalf("SetSockOptBool(V6OnlyOption(true)) failed: %s", err)
 													}
 												case "dual":
 													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil {
-														t.Fatalf("SetSockOpt(V6OnlyOption(false)) failed: %v", err)
+														t.Fatalf("SetSockOptBool(V6OnlyOption(false)) failed: %s", err)
 													}
 												default:
 													t.Fatalf("unknown network: '%s'", network)
@@ -4443,11 +4625,11 @@ func TestKeepalive(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	const keepAliveInterval = 10 * time.Millisecond
-	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
+	const keepAliveInterval = 3 * time.Second
+	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond))
 	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
-	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(5))
-	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
+	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5)
+	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
 
 	// 5 unacked keepalives are sent. ACK each one, and check that the
 	// connection stays alive after 5.
@@ -4538,7 +4720,7 @@ func TestKeepalive(t *testing.T) {
 	// Sleep for a litte over the KeepAlive interval to make sure
 	// the timer has time to fire after the last ACK and close the
 	// close the socket.
-	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+	time.Sleep(keepAliveInterval + keepAliveInterval/2)
 
 	// The connection should be terminated after 5 unacked keepalives.
 	// Send an ACK to trigger a RST from the stack as the endpoint should
@@ -4818,6 +5000,8 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) {
 	}
 
 	for _, test := range tests {
+		test := test // capture range variable
+
 		t.Run(test.name, func(t *testing.T) {
 			t.Parallel()
 
@@ -4920,6 +5104,8 @@ func TestListenNoAcceptNonUnicastV6(t *testing.T) {
 	}
 
 	for _, test := range tests {
+		test := test // capture range variable
+
 		t.Run(test.name, func(t *testing.T) {
 			t.Parallel()
 
@@ -5073,25 +5259,23 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 }
 
 func TestListenBacklogFullSynCookieInUse(t *testing.T) {
-	saved := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = saved
-	}()
-	tcp.SynRcvdCountThreshold = 1
-
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(1)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 1 failed: %s", err)
+	}
+
 	// Create TCP endpoint.
 	var err *tcpip.Error
 	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
 	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
+		t.Fatalf("NewEndpoint failed: %s", err)
 	}
 
 	// Bind to wildcard.
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
-		t.Fatalf("Bind failed: %v", err)
+		t.Fatalf("Bind failed: %s", err)
 	}
 
 	// Test acceptance.
@@ -5099,7 +5283,7 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	listenBacklog := 1
 	portOffset := uint16(0)
 	if err := c.EP.Listen(listenBacklog); err != nil {
-		t.Fatalf("Listen failed: %v", err)
+		t.Fatalf("Listen failed: %s", err)
 	}
 
 	executeHandshake(t, c, context.TestPort+portOffset, false)
@@ -5578,7 +5762,7 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 				return
 			}
 			if w := tcp.WindowSize(); w == 0 || w > uint16(wantRcvWnd) {
-				t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w, wantRcvWnd)
+				t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w)
 			}
 		},
 	))
@@ -5685,7 +5869,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 		// Invoke the moderation API. This is required for auto-tuning
 		// to happen. This method is normally expected to be invoked
 		// from a higher layer than tcpip.Endpoint. So we simulate
-		// copying to user-space by invoking it explicitly here.
+		// copying to userspace by invoking it explicitly here.
 		c.EP.ModerateRecvBuf(totalCopied)
 
 		// Now send a keep-alive packet to trigger an ACK so that we can
@@ -5739,14 +5923,14 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 func TestDelayEnabled(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
-	checkDelayOption(t, c, false, 0) // Delay is disabled by default.
+	checkDelayOption(t, c, false, false) // Delay is disabled by default.
 
 	for _, v := range []struct {
 		delayEnabled    tcp.DelayEnabled
-		wantDelayOption int
+		wantDelayOption bool
 	}{
-		{delayEnabled: false, wantDelayOption: 0},
-		{delayEnabled: true, wantDelayOption: 1},
+		{delayEnabled: false, wantDelayOption: false},
+		{delayEnabled: true, wantDelayOption: true},
 	} {
 		c := context.New(t, defaultMTU)
 		defer c.Cleanup()
@@ -5757,7 +5941,7 @@ func TestDelayEnabled(t *testing.T) {
 	}
 }
 
-func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption int) {
+func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption bool) {
 	t.Helper()
 
 	var gotDelayEnabled tcp.DelayEnabled
@@ -5772,12 +5956,12 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.Del
 	if err != nil {
 		t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %v", err)
 	}
-	gotDelayOption, err := ep.GetSockOptInt(tcpip.DelayOption)
+	gotDelayOption, err := ep.GetSockOptBool(tcpip.DelayOption)
 	if err != nil {
-		t.Fatalf("ep.GetSockOptInt(tcpip.DelayOption) failed: %v", err)
+		t.Fatalf("ep.GetSockOptBool(tcpip.DelayOption) failed: %s", err)
 	}
 	if gotDelayOption != wantDelayOption {
-		t.Errorf("ep.GetSockOptInt(tcpip.DelayOption) got: %d, want: %d", gotDelayOption, wantDelayOption)
+		t.Errorf("ep.GetSockOptBool(tcpip.DelayOption) got: %t, want: %t", gotDelayOption, wantDelayOption)
 	}
 }
 
@@ -6516,9 +6700,16 @@ func TestTCPUserTimeout(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventHUp)
+	defer c.WQ.EventUnregister(&waitEntry)
+
 	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
 
-	userTimeout := 50 * time.Millisecond
+	// Ensure that on the next retransmit timer fire, the user timeout has
+	// expired.
+	initRTO := 1 * time.Second
+	userTimeout := initRTO / 2
 	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
 
 	// Send some data and wait before ACKing it.
@@ -6538,9 +6729,13 @@ func TestTCPUserTimeout(t *testing.T) {
 		),
 	)
 
-	// Wait for a little over the minimum retransmit timeout of 200ms for
-	// the retransmitTimer to fire and close the connection.
-	time.Sleep(tcp.MinRTO + 10*time.Millisecond)
+	// Wait for the retransmit timer to be fired and the user timeout to cause
+	// close of the connection.
+	select {
+	case <-notifyCh:
+	case <-time.After(2 * initRTO):
+		t.Fatalf("connection still alive after %s, should have been closed after :%s", 2*initRTO, userTimeout)
+	}
 
 	// No packet should be received as the connection should be silently
 	// closed due to timeout.
@@ -6586,14 +6781,17 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 
 	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
 
-	const keepAliveInterval = 10 * time.Millisecond
-	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
+	const keepAliveInterval = 3 * time.Second
+	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond))
 	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
-	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(10))
-	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
-
-	// Set userTimeout to be the duration for 3 keepalive probes.
-	userTimeout := 30 * time.Millisecond
+	c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10)
+	c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true)
+
+	// Set userTimeout to be the duration to be 1 keepalive
+	// probes. Which means that after the first probe is sent
+	// the second one should cause the connection to be
+	// closed due to userTimeout being hit.
+	userTimeout := 1 * keepAliveInterval
 	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
 
 	// Check that the connection is still alive.
@@ -6601,28 +6799,23 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
 	}
 
-	// Now receive 2 keepalives, but don't ACK them. The connection should
-	// be reset when the 3rd one should be sent due to userTimeout being
-	// 30ms and each keepalive probe should be sent 10ms apart as set above after
-	// the connection has been idle for 10ms.
-	for i := 0; i < 2; i++ {
-		b := c.GetPacket()
-		checker.IPv4(t, b,
-			checker.TCP(
-				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)),
-				checker.AckNum(uint32(790)),
-				checker.TCPFlags(header.TCPFlagAck),
-			),
-		)
-	}
+	// Now receive 1 keepalives, but don't ACK it.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)),
+			checker.AckNum(uint32(790)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
 
 	// Sleep for a litte over the KeepAlive interval to make sure
 	// the timer has time to fire after the last ACK and close the
 	// close the socket.
-	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+	time.Sleep(keepAliveInterval + keepAliveInterval/2)
 
-	// The connection should be terminated after 30ms.
+	// The connection should be closed with a timeout.
 	// Send an ACK to trigger a RST from the stack as the endpoint should
 	// be dead.
 	c.SendPacket(nil, &context.Headers{
diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
index a641e953d..8edbff964 100644
--- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
@@ -127,16 +127,14 @@ func TestTimeStampDisabledConnect(t *testing.T) {
 }
 
 func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndSize uint16) {
-	savedSynCountThreshold := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = savedSynCountThreshold
-	}()
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
 
 	if cookieEnabled {
-		tcp.SynRcvdCountThreshold = 0
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		}
 	}
-	c := context.New(t, defaultMTU)
-	defer c.Cleanup()
 
 	t.Logf("Test w/ CookieEnabled = %v", cookieEnabled)
 	tsVal := rand.Uint32()
@@ -148,7 +146,7 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS
 	copy(view, data)
 
 	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
-		t.Fatalf("Unexpected error from Write: %v", err)
+		t.Fatalf("Unexpected error from Write: %s", err)
 	}
 
 	// Check that data is received and that the timestamp option TSEcr field
@@ -190,17 +188,15 @@ func TestTimeStampEnabledAccept(t *testing.T) {
 }
 
 func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndSize uint16) {
-	savedSynCountThreshold := tcp.SynRcvdCountThreshold
-	defer func() {
-		tcp.SynRcvdCountThreshold = savedSynCountThreshold
-	}()
-	if cookieEnabled {
-		tcp.SynRcvdCountThreshold = 0
-	}
-
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	if cookieEnabled {
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		}
+	}
+
 	t.Logf("Test w/ CookieEnabled = %v", cookieEnabled)
 	c.AcceptWithOptions(wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS})
 
@@ -211,7 +207,7 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd
 	copy(view, data)
 
 	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
-		t.Fatalf("Unexpected error from Write: %v", err)
+		t.Fatalf("Unexpected error from Write: %s", err)
 	}
 
 	// Check that data is received and that the timestamp option is disabled
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 1e9a0dea3..9721f6caf 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -152,6 +152,13 @@ func New(t *testing.T, mtu uint32) *Context {
 		t.Fatalf("SetTransportProtocolOption failed: %v", err)
 	}
 
+	// Increase minimum RTO in tests to avoid test flakes due to early
+	// retransmit in case the test executors are overloaded and cause timers
+	// to fire earlier than expected.
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMinRTOOption(3*time.Second)); err != nil {
+		t.Fatalf("failed to set stack-wide minRTO: %s", err)
+	}
+
 	// Some of the congestion control tests send up to 640 packets, we so
 	// set the channel size to 1000.
 	ep := channel.New(1000, mtu, "")
@@ -204,6 +211,7 @@ func (c *Context) Cleanup() {
 	if c.EP != nil {
 		c.EP.Close()
 	}
+	c.Stack().Close()
 }
 
 // Stack returns a reference to the stack in the Context.
@@ -216,7 +224,8 @@ func (c *Context) Stack() *stack.Stack {
 func (c *Context) CheckNoPacketTimeout(errMsg string, wait time.Duration) {
 	c.t.Helper()
 
-	ctx, _ := context.WithTimeout(context.Background(), wait)
+	ctx, cancel := context.WithTimeout(context.Background(), wait)
+	defer cancel()
 	if _, ok := c.linkEP.ReadContext(ctx); ok {
 		c.t.Fatal(errMsg)
 	}
@@ -234,7 +243,8 @@ func (c *Context) CheckNoPacket(errMsg string) {
 func (c *Context) GetPacket() []byte {
 	c.t.Helper()
 
-	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
 	p, ok := c.linkEP.ReadContext(ctx)
 	if !ok {
 		c.t.Fatalf("Packet wasn't written out")
@@ -306,7 +316,7 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt
 	copy(icmp[header.ICMPv4PayloadOffset:], p2)
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
@@ -362,7 +372,7 @@ func (c *Context) BuildSegmentWithAddrs(payload []byte, h *Headers, src, dst tcp
 // SendSegment sends a TCP segment that has already been built and written to a
 // buffer.VectorisedView.
 func (c *Context) SendSegment(s buffer.VectorisedView) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: s,
 	})
 }
@@ -370,7 +380,7 @@ func (c *Context) SendSegment(s buffer.VectorisedView) {
 // SendPacket builds and sends a TCP segment(with the provided payload & TCP
 // headers) in an IPv4 packet via the link layer endpoint.
 func (c *Context) SendPacket(payload []byte, h *Headers) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: c.BuildSegment(payload, h),
 	})
 }
@@ -379,7 +389,7 @@ func (c *Context) SendPacket(payload []byte, h *Headers) {
 // & TCPheaders) in an IPv4 packet via the link layer endpoint using the
 // provided source and destination IPv4 addresses.
 func (c *Context) SendPacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: c.BuildSegmentWithAddrs(payload, h, src, dst),
 	})
 }
@@ -414,6 +424,8 @@ func (c *Context) SendAckWithSACK(seq seqnum.Value, bytesReceived int, sackBlock
 // verifies that the packet packet payload of packet matches the slice
 // of data indicated by offset & size.
 func (c *Context) ReceiveAndCheckPacket(data []byte, offset, size int) {
+	c.t.Helper()
+
 	c.ReceiveAndCheckPacketWithOptions(data, offset, size, 0)
 }
 
@@ -422,6 +434,8 @@ func (c *Context) ReceiveAndCheckPacket(data []byte, offset, size int) {
 // data indicated by offset & size and skips optlen bytes in addition to the IP
 // TCP headers when comparing the data.
 func (c *Context) ReceiveAndCheckPacketWithOptions(data []byte, offset, size, optlen int) {
+	c.t.Helper()
+
 	b := c.GetPacket()
 	checker.IPv4(c.t, b,
 		checker.PayloadLen(size+header.TCPMinimumSize+optlen),
@@ -444,6 +458,8 @@ func (c *Context) ReceiveAndCheckPacketWithOptions(data []byte, offset, size, op
 // data indicated by offset & size. It returns true if a packet was received and
 // processed.
 func (c *Context) ReceiveNonBlockingAndCheckPacket(data []byte, offset, size int) bool {
+	c.t.Helper()
+
 	b := c.GetPacketNonBlocking()
 	if b == nil {
 		return false
@@ -485,7 +501,8 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
 func (c *Context) GetV6Packet() []byte {
 	c.t.Helper()
 
-	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
 	p, ok := c.linkEP.ReadContext(ctx)
 	if !ok {
 		c.t.Fatalf("Packet wasn't written out")
@@ -547,7 +564,7 @@ func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcp
 	t.SetChecksum(^t.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
@@ -566,6 +583,8 @@ func (c *Context) CreateConnected(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf
 //
 // PreCondition: c.EP must already be created.
 func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte) {
+	c.t.Helper()
+
 	// Start connection attempt.
 	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&waitEntry, waiter.EventOut)
diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
index 93712cd45..12bc1b5b5 100644
--- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
+++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
@@ -311,17 +311,7 @@ type stream struct {
 // the window is zero, if it's a packet with no payload and sequence number
 // equal to una.
 func (s *stream) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
-	wnd := s.una.Size(s.end)
-	if wnd == 0 {
-		return segLen == 0 && segSeq == s.una
-	}
-
-	// Make sure [segSeq, seqSeq+segLen) is non-empty.
-	if segLen == 0 {
-		segLen = 1
-	}
-
-	return seqnum.Overlap(s.una, wnd, segSeq, segLen)
+	return header.Acceptable(segSeq, segLen, s.una, s.end)
 }
 
 // closed determines if the stream has already been closed. This happens when
@@ -347,3 +337,16 @@ func logicalLen(tcp header.TCP) seqnum.Size {
 	}
 	return l
 }
+
+// IsEmpty returns true if tcb is not initialized.
+func (t *TCB) IsEmpty() bool {
+	if t.inbound != (stream{}) || t.outbound != (stream{}) {
+		return false
+	}
+
+	if t.firstFin != nil || t.state != ResultDrop {
+		return false
+	}
+
+	return true
+}
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index adc908e24..b5d2d0ba6 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -32,7 +32,6 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
-        "//pkg/tcpip/iptables",
         "//pkg/tcpip/ports",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 3fe91cac2..8c7895713 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -19,7 +19,6 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -32,7 +31,8 @@ type udpPacket struct {
 	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	tos           uint8
+	// tos stores either the receiveTOS or receiveTClass value.
+	tos uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -106,6 +106,9 @@ type endpoint struct {
 	bindToDevice   tcpip.NICID
 	broadcast      bool
 
+	lastErrorMu sync.Mutex   `state:"nosave"`
+	lastError   *tcpip.Error `state:".(string)"`
+
 	// Values used to reserve a port or register a transport endpoint.
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
@@ -119,6 +122,10 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveTClass determines if the incoming IPv6 TClass header field is
+	// passed as ancillary data to ControlMessages on Read.
+	receiveTClass bool
+
 	// receiveIPPacketInfo determines if the packet info is returned by Read.
 	receiveIPPacketInfo bool
 
@@ -139,6 +146,9 @@ type endpoint struct {
 
 	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
 	stats tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
 }
 
 // +stateify savable
@@ -181,6 +191,20 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
+func (e *endpoint) takeLastError() *tcpip.Error {
+	e.lastErrorMu.Lock()
+	defer e.lastErrorMu.Unlock()
+
+	err := e.lastError
+	e.lastError = nil
+	return err
+}
+
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
@@ -223,14 +247,13 @@ func (e *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (iptables.IPTables, error) {
-	return e.stack.IPTables(), nil
-}
-
 // Read reads data from the endpoint. This method does not block if
 // there is no data pending.
 func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	if err := e.takeLastError(); err != nil {
+		return buffer.View{}, tcpip.ControlMessages{}, err
+	}
+
 	e.rcvMu.Lock()
 
 	if e.rcvList.Empty() {
@@ -258,13 +281,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveTClass := e.receiveTClass
 	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
-
+	if receiveTClass {
+		cm.HasTClass = true
+		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
+		cm.TClass = uint32(p.tos)
+	}
 	if receiveIPPacketInfo {
 		cm.HasIPPacketInfo = true
 		cm.PacketInfo = p.packetInfo
@@ -365,6 +393,10 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 }
 
 func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	if err := e.takeLastError(); err != nil {
+		return 0, nil, err
+	}
+
 	// MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
 	if opts.More {
 		return 0, nil, tcpip.ErrInvalidOptionValue
@@ -428,19 +460,19 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return 0, nil, tcpip.ErrBroadcastDisabled
 		}
 
-		netProto, err := e.checkV4Mapped(to)
+		dst, netProto, err := e.checkV4MappedLocked(*to)
 		if err != nil {
 			return 0, nil, err
 		}
 
-		r, _, err := e.connectRoute(nicID, *to, netProto)
+		r, _, err := e.connectRoute(nicID, dst, netProto)
 		if err != nil {
 			return 0, nil, err
 		}
 		defer r.Release()
 
 		route = &r
-		dstPort = to.Port
+		dstPort = dst.Port
 	}
 
 	if route.IsResolutionRequired() {
@@ -470,7 +502,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		useDefaultTTL = false
 	}
 
-	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS); err != nil {
+	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner); err != nil {
 		return 0, nil, err
 	}
 	return int64(len(v)), nil, nil
@@ -484,11 +516,42 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	switch opt {
+	case tcpip.BroadcastOption:
+		e.mu.Lock()
+		e.broadcast = v
+		e.mu.Unlock()
+
+	case tcpip.MulticastLoopOption:
+		e.mu.Lock()
+		e.multicastLoop = v
+		e.mu.Unlock()
+
 	case tcpip.ReceiveTOSOption:
 		e.mu.Lock()
 		e.receiveTOS = v
 		e.mu.Unlock()
-		return nil
+
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrNotSupported
+		}
+
+		e.mu.Lock()
+		e.receiveTClass = v
+		e.mu.Unlock()
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.Lock()
+		e.receiveIPPacketInfo = v
+		e.mu.Unlock()
+
+	case tcpip.ReuseAddressOption:
+
+	case tcpip.ReusePortOption:
+		e.mu.Lock()
+		e.reusePort = v
+		e.mu.Unlock()
 
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
@@ -505,13 +568,6 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		}
 
 		e.v6only = v
-		return nil
-
-	case tcpip.ReceiveIPPacketInfoOption:
-		e.mu.Lock()
-		e.receiveIPPacketInfo = v
-		e.mu.Unlock()
-		return nil
 	}
 
 	return nil
@@ -519,28 +575,44 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
-	return nil
-}
+	switch opt {
+	case tcpip.MulticastTTLOption:
+		e.mu.Lock()
+		e.multicastTTL = uint8(v)
+		e.mu.Unlock()
 
-// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
-	switch v := opt.(type) {
 	case tcpip.TTLOption:
 		e.mu.Lock()
 		e.ttl = uint8(v)
 		e.mu.Unlock()
 
-	case tcpip.MulticastTTLOption:
+	case tcpip.IPv4TOSOption:
 		e.mu.Lock()
-		e.multicastTTL = uint8(v)
+		e.sendTOS = uint8(v)
 		e.mu.Unlock()
 
+	case tcpip.IPv6TrafficClassOption:
+		e.mu.Lock()
+		e.sendTOS = uint8(v)
+		e.mu.Unlock()
+
+	case tcpip.ReceiveBufferSizeOption:
+	case tcpip.SendBufferSizeOption:
+
+	}
+
+	return nil
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	switch v := opt.(type) {
 	case tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		defer e.mu.Unlock()
 
 		fa := tcpip.FullAddress{Addr: v.InterfaceAddr}
-		netProto, err := e.checkV4Mapped(&fa)
+		fa, netProto, err := e.checkV4MappedLocked(fa)
 		if err != nil {
 			return err
 		}
@@ -658,16 +730,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.multicastMemberships[memToRemoveIndex] = e.multicastMemberships[len(e.multicastMemberships)-1]
 		e.multicastMemberships = e.multicastMemberships[:len(e.multicastMemberships)-1]
 
-	case tcpip.MulticastLoopOption:
-		e.mu.Lock()
-		e.multicastLoop = bool(v)
-		e.mu.Unlock()
-
-	case tcpip.ReusePortOption:
-		e.mu.Lock()
-		e.reusePort = v != 0
-		e.mu.Unlock()
-
 	case tcpip.BindToDeviceOption:
 		id := tcpip.NICID(v)
 		if id != 0 && !e.stack.HasNIC(id) {
@@ -676,26 +738,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Lock()
 		e.bindToDevice = id
 		e.mu.Unlock()
-		return nil
-
-	case tcpip.BroadcastOption:
-		e.mu.Lock()
-		e.broadcast = v != 0
-		e.mu.Unlock()
-
-		return nil
-
-	case tcpip.IPv4TOSOption:
-		e.mu.Lock()
-		e.sendTOS = uint8(v)
-		e.mu.Unlock()
-		return nil
-
-	case tcpip.IPv6TrafficClassOption:
-		e.mu.Lock()
-		e.sendTOS = uint8(v)
-		e.mu.Unlock()
-		return nil
 	}
 	return nil
 }
@@ -703,22 +745,36 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
+	case tcpip.BroadcastOption:
+		e.mu.RLock()
+		v := e.broadcast
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	case tcpip.MulticastLoopOption:
+		e.mu.RLock()
+		v := e.multicastLoop
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.ReceiveTOSOption:
 		e.mu.RLock()
 		v := e.receiveTOS
 		e.mu.RUnlock()
 		return v, nil
 
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
-			return false, tcpip.ErrUnknownProtocolOption
+			return false, tcpip.ErrNotSupported
 		}
 
 		e.mu.RLock()
-		v := e.v6only
+		v := e.receiveTClass
 		e.mu.RUnlock()
-
 		return v, nil
 
 	case tcpip.ReceiveIPPacketInfoOption:
@@ -726,14 +782,55 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		v := e.receiveIPPacketInfo
 		e.mu.RUnlock()
 		return v, nil
-	}
 
-	return false, tcpip.ErrUnknownProtocolOption
+	case tcpip.ReuseAddressOption:
+		return false, nil
+
+	case tcpip.ReusePortOption:
+		e.mu.RLock()
+		v := e.reusePort
+		e.mu.RUnlock()
+
+		return v, nil
+
+	case tcpip.V6OnlyOption:
+		// We only recognize this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrUnknownProtocolOption
+		}
+
+		e.mu.RLock()
+		v := e.v6only
+		e.mu.RUnlock()
+
+		return v, nil
+
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
+	case tcpip.IPv4TOSOption:
+		e.mu.RLock()
+		v := int(e.sendTOS)
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.IPv6TrafficClassOption:
+		e.mu.RLock()
+		v := int(e.sendTOS)
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.MulticastTTLOption:
+		e.mu.Lock()
+		v := int(e.multicastTTL)
+		e.mu.Unlock()
+		return v, nil
+
 	case tcpip.ReceiveQueueSizeOption:
 		v := 0
 		e.rcvMu.Lock()
@@ -755,29 +852,23 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 		v := e.rcvBufSizeMax
 		e.rcvMu.Unlock()
 		return v, nil
-	}
 
-	return -1, tcpip.ErrUnknownProtocolOption
+	case tcpip.TTLOption:
+		e.mu.Lock()
+		v := int(e.ttl)
+		e.mu.Unlock()
+		return v, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
 }
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	switch o := opt.(type) {
 	case tcpip.ErrorOption:
-		return nil
-
-	case *tcpip.TTLOption:
-		e.mu.Lock()
-		*o = tcpip.TTLOption(e.ttl)
-		e.mu.Unlock()
-		return nil
-
-	case *tcpip.MulticastTTLOption:
-		e.mu.Lock()
-		*o = tcpip.MulticastTTLOption(e.multicastTTL)
-		e.mu.Unlock()
-		return nil
-
+		return e.takeLastError()
 	case *tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		*o = tcpip.MulticastInterfaceOption{
@@ -785,72 +876,21 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 			e.multicastAddr,
 		}
 		e.mu.Unlock()
-		return nil
-
-	case *tcpip.MulticastLoopOption:
-		e.mu.RLock()
-		v := e.multicastLoop
-		e.mu.RUnlock()
-
-		*o = tcpip.MulticastLoopOption(v)
-		return nil
-
-	case *tcpip.ReuseAddressOption:
-		*o = 0
-		return nil
-
-	case *tcpip.ReusePortOption:
-		e.mu.RLock()
-		v := e.reusePort
-		e.mu.RUnlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
 
 	case *tcpip.BindToDeviceOption:
 		e.mu.RLock()
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
 		e.mu.RUnlock()
-		return nil
-
-	case *tcpip.KeepaliveEnabledOption:
-		*o = 0
-		return nil
-
-	case *tcpip.BroadcastOption:
-		e.mu.RLock()
-		v := e.broadcast
-		e.mu.RUnlock()
-
-		*o = 0
-		if v {
-			*o = 1
-		}
-		return nil
-
-	case *tcpip.IPv4TOSOption:
-		e.mu.RLock()
-		*o = tcpip.IPv4TOSOption(e.sendTOS)
-		e.mu.RUnlock()
-		return nil
-
-	case *tcpip.IPv6TrafficClassOption:
-		e.mu.RLock()
-		*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
-		e.mu.RUnlock()
-		return nil
 
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
+	return nil
 }
 
 // sendUDP sends a UDP segment via the provided network endpoint and under the
 // provided identity.
-func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8) *tcpip.Error {
+func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner) *tcpip.Error {
 	// Allocate a buffer for the UDP header.
 	hdr := buffer.NewPrependable(header.UDPMinimumSize + int(r.MaxHeaderLength()))
 
@@ -876,10 +916,15 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	if useDefaultTTL {
 		ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, tcpip.PacketBuffer{
+	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: ProtocolNumber,
+		TTL:      ttl,
+		TOS:      tos,
+	}, &stack.PacketBuffer{
 		Header:          hdr,
 		Data:            data,
 		TransportHeader: buffer.View(udp),
+		Owner:           owner,
 	}); err != nil {
 		r.Stats().UDP.PacketSendErrors.Increment()
 		return err
@@ -890,13 +935,14 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	return nil
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+// checkV4MappedLocked determines the effective network protocol and converts
+// addr to its canonical form.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
 	if err != nil {
-		return 0, err
+		return tcpip.FullAddress{}, 0, err
 	}
-	*addr = unwrapped
-	return netProto, nil
+	return unwrapped, netProto, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -944,10 +990,6 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 
 // Connect connects the endpoint to its peer. Specifying a NIC is optional.
 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
-	netProto, err := e.checkV4Mapped(&addr)
-	if err != nil {
-		return err
-	}
 	if addr.Port == 0 {
 		// We don't support connecting to port zero.
 		return tcpip.ErrInvalidEndpointState
@@ -975,6 +1017,11 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
 	r, nicID, err := e.connectRoute(nicID, addr, netProto)
 	if err != nil {
 		return err
@@ -1102,7 +1149,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr)
+	addr, netProto, err := e.checkV4MappedLocked(addr)
 	if err != nil {
 		return err
 	}
@@ -1221,18 +1268,16 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(pkt.Data.First())
-	if int(hdr.Length()) > pkt.Data.Size() {
+	hdr := header.UDP(pkt.TransportHeader)
+	if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize {
 		// Malformed packet.
 		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
 		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 		return
 	}
 
-	pkt.Data.TrimFront(header.UDPMinimumSize)
-
 	e.rcvMu.Lock()
 	e.stack.Stats().UDP.PacketsReceived.Increment()
 	e.stats.PacketsReceived.Increment()
@@ -1259,7 +1304,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		senderAddress: tcpip.FullAddress{
 			NIC:  r.NICID(),
 			Addr: id.RemoteAddress,
-			Port: hdr.SourcePort(),
+			Port: header.UDP(hdr).SourcePort(),
 		},
 	}
 	packet.data = pkt.Data
@@ -1273,6 +1318,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		packet.packetInfo.LocalAddr = r.LocalAddress
 		packet.packetInfo.DestinationAddr = r.RemoteAddress
 		packet.packetInfo.NIC = r.NICID()
+	case header.IPv6ProtocolNumber:
+		packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
@@ -1286,7 +1333,18 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	if typ == stack.ControlPortUnreachable {
+		e.mu.RLock()
+		defer e.mu.RUnlock()
+
+		if e.state == StateConnected {
+			e.lastErrorMu.Lock()
+			defer e.lastErrorMu.Unlock()
+
+			e.lastError = tcpip.ErrConnectionRefused
+		}
+	}
 }
 
 // State implements tcpip.Endpoint.State.
@@ -1316,3 +1374,7 @@ func (*endpoint) Wait() {}
 func isBroadcastOrMulticast(a tcpip.Address) bool {
 	return a == header.IPv4Broadcast || header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a)
 }
+
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner
+}
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 43fb047ed..851e6b635 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -37,6 +37,24 @@ func (u *udpPacket) loadData(data buffer.VectorisedView) {
 	u.data = data
 }
 
+// saveLastError is invoked by stateify.
+func (e *endpoint) saveLastError() string {
+	if e.lastError == nil {
+		return ""
+	}
+
+	return e.lastError.String()
+}
+
+// loadLastError is invoked by stateify.
+func (e *endpoint) loadLastError(s string) {
+	if s == "" {
+		return
+	}
+
+	e.lastError = tcpip.StringToError(s)
+}
+
 // beforeSave is invoked by stateify.
 func (e *endpoint) beforeSave() {
 	// Stop incoming packets from being handled (and mutate endpoint state).
@@ -69,6 +87,9 @@ func (e *endpoint) afterLoad() {
 
 // Resume implements tcpip.ResumableEndpoint.Resume.
 func (e *endpoint) Resume(s *stack.Stack) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
 	e.stack = s
 
 	for _, m := range e.multicastMemberships {
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index fc706ede2..7abfa0ed2 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -43,7 +43,7 @@ func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder {
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
 	f.handler(&ForwarderRequest{
 		stack: f.stack,
 		route: r,
@@ -61,7 +61,7 @@ type ForwarderRequest struct {
 	stack *stack.Stack
 	route *stack.Route
 	id    stack.TransportEndpointID
-	pkt   tcpip.PacketBuffer
+	pkt   *stack.PacketBuffer
 }
 
 // ID returns the 4-tuple (src address, src port, dst address, dst port) that
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 259c3072a..4218e7d03 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -66,10 +66,9 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
-	// Get the header then trim it from the view.
-	hdr := header.UDP(pkt.Data.First())
-	if int(hdr.Length()) > pkt.Data.Size() {
+func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
+	hdr := header.UDP(pkt.TransportHeader)
+	if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize {
 		// Malformed packet.
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
 		return true
@@ -116,7 +115,7 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		}
 		headerLen := int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize
 		available := int(mtu) - headerLen
-		payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size()
+		payloadLen := len(pkt.NetworkHeader) + len(pkt.TransportHeader) + pkt.Data.Size()
 		if payloadLen > available {
 			payloadLen = available
 		}
@@ -125,9 +124,10 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		// For example, a raw or packet socket may use what UDP
 		// considers an unreachable destination. Thus we deep copy pkt
 		// to prevent multiple ownership and SR errors.
-		newNetHeader := append(buffer.View(nil), pkt.NetworkHeader...)
-		payload := newNetHeader.ToVectorisedView()
-		payload.Append(pkt.Data.ToView().ToVectorisedView())
+		newHeader := append(buffer.View(nil), pkt.NetworkHeader...)
+		newHeader = append(newHeader, pkt.TransportHeader...)
+		payload := newHeader.ToVectorisedView()
+		payload.AppendView(pkt.Data.ToView())
 		payload.CapLength(payloadLen)
 
 		hdr := buffer.NewPrependable(headerLen)
@@ -135,9 +135,10 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv4DstUnreachable)
 		pkt.SetCode(header.ICMPv4PortUnreachable)
 		pkt.SetChecksum(header.ICMPv4Checksum(pkt, payload))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
-			Header: hdr,
-			Data:   payload,
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header:          hdr,
+			TransportHeader: buffer.View(pkt),
+			Data:            payload,
 		})
 
 	case header.IPv6AddressSize:
@@ -159,11 +160,11 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		}
 		headerLen := int(r.MaxHeaderLength()) + header.ICMPv6DstUnreachableMinimumSize
 		available := int(mtu) - headerLen
-		payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size()
+		payloadLen := len(pkt.NetworkHeader) + len(pkt.TransportHeader) + pkt.Data.Size()
 		if payloadLen > available {
 			payloadLen = available
 		}
-		payload := buffer.NewVectorisedView(len(pkt.NetworkHeader), []buffer.View{pkt.NetworkHeader})
+		payload := buffer.NewVectorisedView(len(pkt.NetworkHeader)+len(pkt.TransportHeader), []buffer.View{pkt.NetworkHeader, pkt.TransportHeader})
 		payload.Append(pkt.Data)
 		payload.CapLength(payloadLen)
 
@@ -172,24 +173,43 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv6DstUnreachable)
 		pkt.SetCode(header.ICMPv6PortUnreachable)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, payload))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
-			Header: hdr,
-			Data:   payload,
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header:          hdr,
+			TransportHeader: buffer.View(pkt),
+			Data:            payload,
 		})
 	}
 	return true
 }
 
-// SetOption implements TransportProtocol.SetOption.
-func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+// SetOption implements stack.TransportProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
-// Option implements TransportProtocol.Option.
-func (p *protocol) Option(option interface{}) *tcpip.Error {
+// Option implements stack.TransportProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
+// Close implements stack.TransportProtocol.Close.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait.
+func (*protocol) Wait() {}
+
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	h, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+	if !ok {
+		// Packet is too small
+		return false
+	}
+	pkt.TransportHeader = h
+	pkt.Data.TrimFront(header.UDPMinimumSize)
+	return true
+}
+
 // NewProtocol returns a UDP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{}
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index f0ff3fe71..313a3f117 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -343,11 +343,11 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
 	c.createEndpoint(flow.sockProto())
 	if flow.isV6Only() {
 		if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-			c.t.Fatalf("SetSockOpt failed: %v", err)
+			c.t.Fatalf("SetSockOptBool failed: %s", err)
 		}
 	} else if flow.isBroadcast() {
-		if err := c.ep.SetSockOpt(tcpip.BroadcastOption(1)); err != nil {
-			c.t.Fatal("SetSockOpt failed:", err)
+		if err := c.ep.SetSockOptBool(tcpip.BroadcastOption, true); err != nil {
+			c.t.Fatalf("SetSockOptBool failed: %s", err)
 		}
 	}
 }
@@ -358,7 +358,8 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
 func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.NetworkChecker) []byte {
 	c.t.Helper()
 
-	ctx, _ := context.WithTimeout(context.Background(), 2*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
 	p, ok := c.linkEP.ReadContext(ctx)
 	if !ok {
 		c.t.Fatalf("Packet wasn't written out")
@@ -409,6 +410,7 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
@@ -438,10 +440,8 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
-		Data:            buf.ToVectorisedView(),
-		NetworkHeader:   buffer.View(ip),
-		TransportHeader: buffer.View(u),
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
 	})
 }
 
@@ -485,10 +485,8 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 
 	// Inject packet.
 
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
-		Data:            buf.ToVectorisedView(),
-		NetworkHeader:   buffer.View(ip),
-		TransportHeader: buffer.View(u),
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
 	})
 }
 
@@ -606,7 +604,7 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	// Check the peer address.
 	h := flow.header4Tuple(incoming)
 	if addr.Addr != h.srcAddr.Addr {
-		c.t.Fatalf("unexpected remote address: got %s, want %s", addr.Addr, h.srcAddr)
+		c.t.Fatalf("unexpected remote address: got %s, want %v", addr.Addr, h.srcAddr)
 	}
 
 	// Check the payload.
@@ -1270,8 +1268,8 @@ func TestTTL(t *testing.T) {
 			c.createEndpointForFlow(flow)
 
 			const multicastTTL = 42
-			if err := c.ep.SetSockOpt(tcpip.MulticastTTLOption(multicastTTL)); err != nil {
-				c.t.Fatalf("SetSockOpt failed: %v", err)
+			if err := c.ep.SetSockOptInt(tcpip.MulticastTTLOption, multicastTTL); err != nil {
+				c.t.Fatalf("SetSockOptInt failed: %s", err)
 			}
 
 			var wantTTL uint8
@@ -1310,8 +1308,8 @@ func TestSetTTL(t *testing.T) {
 
 					c.createEndpointForFlow(flow)
 
-					if err := c.ep.SetSockOpt(tcpip.TTLOption(wantTTL)); err != nil {
-						c.t.Fatalf("SetSockOpt failed: %v", err)
+					if err := c.ep.SetSockOptInt(tcpip.TTLOption, int(wantTTL)); err != nil {
+						c.t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", wantTTL, err)
 					}
 
 					var p stack.NetworkProtocol
@@ -1336,7 +1334,7 @@ func TestSetTTL(t *testing.T) {
 	}
 }
 
-func TestTOSV4(t *testing.T) {
+func TestSetTOS(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4, multicastV4, broadcast} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1345,25 +1343,26 @@ func TestTOSV4(t *testing.T) {
 			c.createEndpointForFlow(flow)
 
 			const tos = testTOS
-			var v tcpip.IPv4TOSOption
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+			v, err := c.ep.GetSockOptInt(tcpip.IPv4TOSOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv4TOSOption) failed: %s", err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv4TOSOption(tos), err)
+			if err := c.ep.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil {
+				c.t.Errorf("SetSockOptInt(IPv4TOSOption, 0x%x) failed: %s", tos, err)
 			}
 
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+			v, err = c.ep.GetSockOptInt(tcpip.IPv4TOSOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv4TOSOption) failed: %s", err)
 			}
 
-			if want := tcpip.IPv4TOSOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+			if v != tos {
+				c.t.Errorf("got GetSockOptInt(IPv4TOSOption) = 0x%x, want = 0x%x", v, tos)
 			}
 
 			testWrite(c, flow, checker.TOS(tos, 0))
@@ -1371,7 +1370,7 @@ func TestTOSV4(t *testing.T) {
 	}
 }
 
-func TestTOSV6(t *testing.T) {
+func TestSetTClass(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, broadcastIn6} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1379,71 +1378,93 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
-			var v tcpip.IPv6TrafficClassOption
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+			const tClass = testTOS
+			v, err := c.ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv6TrafficClassOption) failed: %s", err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = 0x%x, want = 0x%x", v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt failed: %s", err)
+			if err := c.ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, tClass); err != nil {
+				c.t.Errorf("SetSockOptInt(IPv6TrafficClassOption, 0x%x) failed: %s", tClass, err)
 			}
 
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+			v, err = c.ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv6TrafficClassOption) failed: %s", err)
 			}
 
-			if want := tcpip.IPv6TrafficClassOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+			if v != tClass {
+				c.t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = 0x%x, want = 0x%x", v, tClass)
 			}
 
-			testWrite(c, flow, checker.TOS(tos, 0))
+			// The header getter for TClass is called TOS, so use that checker.
+			testWrite(c, flow, checker.TOS(tClass, 0))
 		})
 	}
 }
 
-func TestReceiveTOSV4(t *testing.T) {
-	for _, flow := range []testFlow{unicastV4, broadcast} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
+func TestReceiveTosTClass(t *testing.T) {
+	testCases := []struct {
+		name             string
+		getReceiveOption tcpip.SockOptBool
+		tests            []testFlow
+	}{
+		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
+		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+	}
+	for _, testCase := range testCases {
+		for _, flow := range testCase.tests {
+			t.Run(fmt.Sprintf("%s:flow:%s", testCase.name, flow), func(t *testing.T) {
+				c := newDualTestContext(t, defaultMTU)
+				defer c.cleanup()
 
-			c.createEndpointForFlow(flow)
+				c.createEndpointForFlow(flow)
+				option := testCase.getReceiveOption
+				name := testCase.name
 
-			// Verify that setting and reading the option works.
-			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			// Test for expected default value.
-			if v != false {
-				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
-			}
+				// Verify that setting and reading the option works.
+				v, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
+				}
+				// Test for expected default value.
+				if v != false {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
+				}
 
-			want := true
-			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
-				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
-			}
+				want := true
+				if err := c.ep.SetSockOptBool(option, want); err != nil {
+					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
+				}
 
-			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			if got != want {
-				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
-			}
+				got, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
+				}
 
-			// Verify that the correct received TOS is handed through as
-			// ancillary data to the ControlMessages struct.
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				c.t.Fatal("Bind failed:", err)
-			}
-			testRead(c, flow, checker.ReceiveTOS(testTOS))
-		})
+				if got != want {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
+				}
+
+				// Verify that the correct received TOS or TClass is handed through as
+				// ancillary data to the ControlMessages struct.
+				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+					c.t.Fatalf("Bind failed: %s", err)
+				}
+				switch option {
+				case tcpip.ReceiveTClassOption:
+					testRead(c, flow, checker.ReceiveTClass(testTOS))
+				case tcpip.ReceiveTOSOption:
+					testRead(c, flow, checker.ReceiveTOS(testTOS))
+				default:
+					t.Fatalf("unknown test variant: %s", name)
+				}
+			})
+		}
 	}
 }
 
@@ -1541,7 +1562,8 @@ func TestV4UnknownDestination(t *testing.T) {
 			}
 			c.injectPacket(tc.flow, payload)
 			if !tc.icmpRequired {
-				ctx, _ := context.WithTimeout(context.Background(), time.Second)
+				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+				defer cancel()
 				if p, ok := c.linkEP.ReadContext(ctx); ok {
 					t.Fatalf("unexpected packet received: %+v", p)
 				}
@@ -1549,7 +1571,8 @@ func TestV4UnknownDestination(t *testing.T) {
 			}
 
 			// ICMP required.
-			ctx, _ := context.WithTimeout(context.Background(), time.Second)
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			defer cancel()
 			p, ok := c.linkEP.ReadContext(ctx)
 			if !ok {
 				t.Fatalf("packet wasn't written out")
@@ -1617,7 +1640,8 @@ func TestV6UnknownDestination(t *testing.T) {
 			}
 			c.injectPacket(tc.flow, payload)
 			if !tc.icmpRequired {
-				ctx, _ := context.WithTimeout(context.Background(), time.Second)
+				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+				defer cancel()
 				if p, ok := c.linkEP.ReadContext(ctx); ok {
 					t.Fatalf("unexpected packet received: %+v", p)
 				}
@@ -1625,7 +1649,8 @@ func TestV6UnknownDestination(t *testing.T) {
 			}
 
 			// ICMP required.
-			ctx, _ := context.WithTimeout(context.Background(), time.Second)
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			defer cancel()
 			p, ok := c.linkEP.ReadContext(ctx)
 			if !ok {
 				t.Fatalf("packet wasn't written out")
@@ -1691,6 +1716,58 @@ func TestIncrementMalformedPacketsReceived(t *testing.T) {
 	}
 }
 
+// TestShortHeader verifies that when a packet with a too-short UDP header is
+// received, the malformed received global stat gets incremented.
+func TestShortHeader(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	c.t.Helper()
+	h := unicastV6.header4Tuple(incoming)
+
+	// Allocate a buffer for an IPv6 and too-short UDP header.
+	const udpSize = header.UDPMinimumSize - 1
+	buf := buffer.NewView(header.IPv6MinimumSize + udpSize)
+	// Initialize the IP header.
+	ip := header.IPv6(buf)
+	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
+		PayloadLength: uint16(udpSize),
+		NextHeader:    uint8(udp.ProtocolNumber),
+		HopLimit:      65,
+		SrcAddr:       h.srcAddr.Addr,
+		DstAddr:       h.dstAddr.Addr,
+	})
+
+	// Initialize the UDP header.
+	udpHdr := header.UDP(buffer.NewView(header.UDPMinimumSize))
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: h.srcAddr.Port,
+		DstPort: h.dstAddr.Port,
+		Length:  header.UDPMinimumSize,
+	})
+	// Calculate the UDP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, h.srcAddr.Addr, h.dstAddr.Addr, uint16(len(udpHdr)))
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(xsum))
+	// Copy all but the last byte of the UDP header into the packet.
+	copy(buf[header.IPv6MinimumSize:], udpHdr)
+
+	// Inject packet.
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	if got, want := c.s.Stats().MalformedRcvdPackets.Value(), uint64(1); got != want {
+		t.Errorf("got c.s.Stats().MalformedRcvdPackets.Value() = %d, want = %d", got, want)
+	}
+}
+
 // TestShutdownRead verifies endpoint read shutdown and error
 // stats increment on packet receive.
 func TestShutdownRead(t *testing.T) {
diff --git a/runsc/criutil/BUILD b/pkg/test/criutil/BUILD
index 8a571a000..a7b082cee 100644
--- a/runsc/criutil/BUILD
+++ b/pkg/test/criutil/BUILD
@@ -7,5 +7,8 @@ go_library(
     testonly = 1,
     srcs = ["criutil.go"],
     visibility = ["//:sandbox"],
-    deps = ["//runsc/testutil"],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
+    ],
 )
diff --git a/runsc/criutil/criutil.go b/pkg/test/criutil/criutil.go
index 773f5a1c4..bebebb48e 100644
--- a/runsc/criutil/criutil.go
+++ b/pkg/test/criutil/criutil.go
@@ -25,40 +25,45 @@ import (
 	"strings"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
-const endpointPrefix = "unix://"
-
 // Crictl contains information required to run the crictl utility.
 type Crictl struct {
-	executable      string
-	timeout         time.Duration
-	imageEndpoint   string
-	runtimeEndpoint string
+	logger   testutil.Logger
+	endpoint string
+	cleanup  []func()
+}
+
+// resolvePath attempts to find binary paths. It may set the path to invalid,
+// which will cause the execution to fail with a sensible error.
+func resolvePath(executable string) string {
+	guess, err := exec.LookPath(executable)
+	if err != nil {
+		guess = fmt.Sprintf("/usr/local/bin/%s", executable)
+	}
+	return guess
 }
 
 // NewCrictl returns a Crictl configured with a timeout and an endpoint over
 // which it will talk to containerd.
-func NewCrictl(timeout time.Duration, endpoint string) *Crictl {
-	// Bazel doesn't pass PATH through, assume the location of crictl
-	// unless specified by environment variable.
-	executable := os.Getenv("CRICTL_PATH")
-	if executable == "" {
-		executable = "/usr/local/bin/crictl"
-	}
+func NewCrictl(logger testutil.Logger, endpoint string) *Crictl {
+	// Attempt to find the executable, but don't bother propagating the
+	// error at this point. The first command executed will return with a
+	// binary not found error.
 	return &Crictl{
-		executable:      executable,
-		timeout:         timeout,
-		imageEndpoint:   endpointPrefix + endpoint,
-		runtimeEndpoint: endpointPrefix + endpoint,
+		logger:   logger,
+		endpoint: endpoint,
 	}
 }
 
-// Pull pulls an container image. It corresponds to `crictl pull`.
-func (cc *Crictl) Pull(imageName string) error {
-	_, err := cc.run("pull", imageName)
-	return err
+// CleanUp executes cleanup functions.
+func (cc *Crictl) CleanUp() {
+	for _, c := range cc.cleanup {
+		c()
+	}
+	cc.cleanup = nil
 }
 
 // RunPod creates a sandbox. It corresponds to `crictl runp`.
@@ -157,27 +162,66 @@ func (cc *Crictl) RmPod(podID string) error {
 	return err
 }
 
+// Import imports the given container from the local Docker instance.
+func (cc *Crictl) Import(image string) error {
+	// Note that we provide a 10 minute timeout after connect because we may
+	// be pushing a lot of bytes in order to import the image. The connect
+	// timeout stays the same and is inherited from the Crictl instance.
+	cmd := testutil.Command(cc.logger,
+		resolvePath("ctr"),
+		fmt.Sprintf("--connect-timeout=%s", 30*time.Second),
+		fmt.Sprintf("--address=%s", cc.endpoint),
+		"-n", "k8s.io", "images", "import", "-")
+	cmd.Stderr = os.Stderr // Pass through errors.
+
+	// Create a pipe and start the program.
+	w, err := cmd.StdinPipe()
+	if err != nil {
+		return err
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+
+	// Save the image on the other end.
+	if err := dockerutil.Save(cc.logger, image, w); err != nil {
+		cmd.Wait()
+		return err
+	}
+
+	// Close our pipe reference & see if it was loaded.
+	if err := w.Close(); err != nil {
+		return w.Close()
+	}
+
+	return cmd.Wait()
+}
+
 // StartContainer pulls the given image ands starts the container in the
 // sandbox with the given podID.
+//
+// Note that the image will always be imported from the local docker daemon.
 func (cc *Crictl) StartContainer(podID, image, sbSpec, contSpec string) (string, error) {
+	if err := cc.Import(image); err != nil {
+		return "", err
+	}
+
 	// Write the specs to files that can be read by crictl.
-	sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
+	sbSpecFile, cleanup, err := testutil.WriteTmpFile("sbSpec", sbSpec)
 	if err != nil {
 		return "", fmt.Errorf("failed to write sandbox spec: %v", err)
 	}
-	contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec)
+	cc.cleanup = append(cc.cleanup, cleanup)
+	contSpecFile, cleanup, err := testutil.WriteTmpFile("contSpec", contSpec)
 	if err != nil {
 		return "", fmt.Errorf("failed to write container spec: %v", err)
 	}
+	cc.cleanup = append(cc.cleanup, cleanup)
 
 	return cc.startContainer(podID, image, sbSpecFile, contSpecFile)
 }
 
 func (cc *Crictl) startContainer(podID, image, sbSpecFile, contSpecFile string) (string, error) {
-	if err := cc.Pull(image); err != nil {
-		return "", fmt.Errorf("failed to pull %s: %v", image, err)
-	}
-
 	contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
 	if err != nil {
 		return "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
@@ -203,18 +247,24 @@ func (cc *Crictl) StopContainer(contID string) error {
 	return nil
 }
 
-// StartPodAndContainer pulls an image, then starts a sandbox and container in
-// that sandbox. It returns the pod ID and container ID.
+// StartPodAndContainer starts a sandbox and container in that sandbox. It
+// returns the pod ID and container ID.
 func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
+	if err := cc.Import(image); err != nil {
+		return "", "", err
+	}
+
 	// Write the specs to files that can be read by crictl.
-	sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
+	sbSpecFile, cleanup, err := testutil.WriteTmpFile("sbSpec", sbSpec)
 	if err != nil {
 		return "", "", fmt.Errorf("failed to write sandbox spec: %v", err)
 	}
-	contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec)
+	cc.cleanup = append(cc.cleanup, cleanup)
+	contSpecFile, cleanup, err := testutil.WriteTmpFile("contSpec", contSpec)
 	if err != nil {
 		return "", "", fmt.Errorf("failed to write container spec: %v", err)
 	}
+	cc.cleanup = append(cc.cleanup, cleanup)
 
 	podID, err := cc.RunPod(sbSpecFile)
 	if err != nil {
@@ -243,35 +293,14 @@ func (cc *Crictl) StopPodAndContainer(podID, contID string) error {
 	return nil
 }
 
-// run runs crictl with the given args and returns an error if it takes longer
-// than cc.Timeout to run.
+// run runs crictl with the given args.
 func (cc *Crictl) run(args ...string) (string, error) {
 	defaultArgs := []string{
-		"--image-endpoint", cc.imageEndpoint,
-		"--runtime-endpoint", cc.runtimeEndpoint,
-	}
-	cmd := exec.Command(cc.executable, append(defaultArgs, args...)...)
-
-	// Run the command with a timeout.
-	done := make(chan string)
-	errCh := make(chan error)
-	go func() {
-		output, err := cmd.CombinedOutput()
-		if err != nil {
-			errCh <- fmt.Errorf("error: \"%v\", output: %s", err, string(output))
-			return
-		}
-		done <- string(output)
-	}()
-	select {
-	case output := <-done:
-		return output, nil
-	case err := <-errCh:
-		return "", err
-	case <-time.After(cc.timeout):
-		if err := testutil.KillCommand(cmd); err != nil {
-			return "", fmt.Errorf("timed out, then couldn't kill process %+v: %v", cmd, err)
-		}
-		return "", fmt.Errorf("timed out: %+v", cmd)
+		resolvePath("crictl"),
+		"--image-endpoint", fmt.Sprintf("unix://%s", cc.endpoint),
+		"--runtime-endpoint", fmt.Sprintf("unix://%s", cc.endpoint),
 	}
+	fullArgs := append(defaultArgs, args...)
+	out, err := testutil.Command(cc.logger, fullArgs...).CombinedOutput()
+	return string(out), err
 }
diff --git a/runsc/dockerutil/BUILD b/pkg/test/dockerutil/BUILD
index 8621af901..7c8758e35 100644
--- a/runsc/dockerutil/BUILD
+++ b/pkg/test/dockerutil/BUILD
@@ -8,7 +8,7 @@ go_library(
     srcs = ["dockerutil.go"],
     visibility = ["//:sandbox"],
     deps = [
-        "//runsc/testutil",
+        "//pkg/test/testutil",
         "@com_github_kr_pty//:go_default_library",
     ],
 )
diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go
new file mode 100644
index 000000000..c45d2ecbc
--- /dev/null
+++ b/pkg/test/dockerutil/dockerutil.go
@@ -0,0 +1,696 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package dockerutil is a collection of utility functions.
+package dockerutil
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net"
+	"os"
+	"os/exec"
+	"path"
+	"regexp"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/kr/pty"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+var (
+	// runtime is the runtime to use for tests. This will be applied to all
+	// containers. Note that the default here ("runsc") corresponds to the
+	// default used by the installations. This is important, because the
+	// default installer for vm_tests (in tools/installers:head, invoked
+	// via tools/vm:defs.bzl) will install with this name. So without
+	// changing anything, tests should have a runsc runtime available to
+	// them. Otherwise installers should update the existing runtime
+	// instead of installing a new one.
+	runtime = flag.String("runtime", "runsc", "specify which runtime to use")
+
+	// config is the default Docker daemon configuration path.
+	config = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths")
+)
+
+// EnsureSupportedDockerVersion checks if correct docker is installed.
+//
+// This logs directly to stderr, as it is typically called from a Main wrapper.
+func EnsureSupportedDockerVersion() {
+	cmd := exec.Command("docker", "version")
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		log.Fatalf("error running %q: %v", "docker version", err)
+	}
+	re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`)
+	matches := re.FindStringSubmatch(string(out))
+	if len(matches) != 3 {
+		log.Fatalf("Invalid docker output: %s", out)
+	}
+	major, _ := strconv.Atoi(matches[1])
+	minor, _ := strconv.Atoi(matches[2])
+	if major < 17 || (major == 17 && minor < 9) {
+		log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor)
+	}
+}
+
+// RuntimePath returns the binary path for the current runtime.
+func RuntimePath() (string, error) {
+	// Read the configuration data; the file must exist.
+	configBytes, err := ioutil.ReadFile(*config)
+	if err != nil {
+		return "", err
+	}
+
+	// Unmarshal the configuration.
+	c := make(map[string]interface{})
+	if err := json.Unmarshal(configBytes, &c); err != nil {
+		return "", err
+	}
+
+	// Decode the expected configuration.
+	r, ok := c["runtimes"]
+	if !ok {
+		return "", fmt.Errorf("no runtimes declared: %v", c)
+	}
+	rs, ok := r.(map[string]interface{})
+	if !ok {
+		// The runtimes are not a map.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	r, ok = rs[*runtime]
+	if !ok {
+		// The expected runtime is not declared.
+		return "", fmt.Errorf("runtime %q not found: %v", *runtime, c)
+	}
+	rs, ok = r.(map[string]interface{})
+	if !ok {
+		// The runtime is not a map.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	p, ok := rs["path"].(string)
+	if !ok {
+		// The runtime does not declare a path.
+		return "", fmt.Errorf("unexpected format: %v", c)
+	}
+	return p, nil
+}
+
+// Save exports a container image to the given Writer.
+//
+// Note that the writer should be actively consuming the output, otherwise it
+// is not guaranteed that the Save will make any progress and the call may
+// stall indefinitely.
+//
+// This is called by criutil in order to import imports.
+func Save(logger testutil.Logger, image string, w io.Writer) error {
+	cmd := testutil.Command(logger, "docker", "save", testutil.ImageByName(image))
+	cmd.Stdout = w // Send directly to the writer.
+	return cmd.Run()
+}
+
+// MountMode describes if the mount should be ro or rw.
+type MountMode int
+
+const (
+	// ReadOnly is what the name says.
+	ReadOnly MountMode = iota
+	// ReadWrite is what the name says.
+	ReadWrite
+)
+
+// String returns the mount mode argument for this MountMode.
+func (m MountMode) String() string {
+	switch m {
+	case ReadOnly:
+		return "ro"
+	case ReadWrite:
+		return "rw"
+	}
+	panic(fmt.Sprintf("invalid mode: %d", m))
+}
+
+// DockerNetwork contains the name of a docker network.
+type DockerNetwork struct {
+	logger     testutil.Logger
+	Name       string
+	Subnet     *net.IPNet
+	containers []*Docker
+}
+
+// NewDockerNetwork sets up the struct for a Docker network. Names of networks
+// will be unique.
+func NewDockerNetwork(logger testutil.Logger) *DockerNetwork {
+	return &DockerNetwork{
+		logger: logger,
+		Name:   testutil.RandomID(logger.Name()),
+	}
+}
+
+// Create calls 'docker network create'.
+func (n *DockerNetwork) Create(args ...string) error {
+	a := []string{"docker", "network", "create"}
+	if n.Subnet != nil {
+		a = append(a, fmt.Sprintf("--subnet=%s", n.Subnet))
+	}
+	a = append(a, args...)
+	a = append(a, n.Name)
+	return testutil.Command(n.logger, a...).Run()
+}
+
+// Connect calls 'docker network connect' with the arguments provided.
+func (n *DockerNetwork) Connect(container *Docker, args ...string) error {
+	a := []string{"docker", "network", "connect"}
+	a = append(a, args...)
+	a = append(a, n.Name, container.Name)
+	if err := testutil.Command(n.logger, a...).Run(); err != nil {
+		return err
+	}
+	n.containers = append(n.containers, container)
+	return nil
+}
+
+// Cleanup cleans up the docker network and all the containers attached to it.
+func (n *DockerNetwork) Cleanup() error {
+	for _, c := range n.containers {
+		// Don't propagate the error, it might be that the container
+		// was already cleaned up.
+		if err := c.Kill(); err != nil {
+			n.logger.Logf("unable to kill container during cleanup: %s", err)
+		}
+	}
+
+	if err := testutil.Command(n.logger, "docker", "network", "rm", n.Name).Run(); err != nil {
+		return err
+	}
+	return nil
+}
+
+// Docker contains the name and the runtime of a docker container.
+type Docker struct {
+	logger   testutil.Logger
+	Runtime  string
+	Name     string
+	copyErr  error
+	mounts   []string
+	cleanups []func()
+}
+
+// MakeDocker sets up the struct for a Docker container.
+//
+// Names of containers will be unique.
+func MakeDocker(logger testutil.Logger) *Docker {
+	// Slashes are not allowed in container names.
+	name := testutil.RandomID(logger.Name())
+	name = strings.ReplaceAll(name, "/", "-")
+
+	return &Docker{
+		logger:  logger,
+		Name:    name,
+		Runtime: *runtime,
+	}
+}
+
+// Mount mounts the given source and makes it available in the container.
+func (d *Docker) Mount(target, source string, mode MountMode) {
+	d.mounts = append(d.mounts, fmt.Sprintf("-v=%s:%s:%v", source, target, mode))
+}
+
+// CopyFiles copies in and mounts the given files. They are always ReadOnly.
+func (d *Docker) CopyFiles(target string, sources ...string) {
+	dir, err := ioutil.TempDir("", d.Name)
+	if err != nil {
+		d.copyErr = fmt.Errorf("ioutil.TempDir failed: %v", err)
+		return
+	}
+	d.cleanups = append(d.cleanups, func() { os.RemoveAll(dir) })
+	if err := os.Chmod(dir, 0755); err != nil {
+		d.copyErr = fmt.Errorf("os.Chmod(%q, 0755) failed: %v", dir, err)
+		return
+	}
+	for _, name := range sources {
+		src, err := testutil.FindFile(name)
+		if err != nil {
+			d.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %v", name, err)
+			return
+		}
+		dst := path.Join(dir, path.Base(name))
+		if err := testutil.Copy(src, dst); err != nil {
+			d.copyErr = fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
+			return
+		}
+		d.logger.Logf("copy: %s -> %s", src, dst)
+	}
+	d.Mount(target, dir, ReadOnly)
+}
+
+// Link links the given target.
+func (d *Docker) Link(target string, source *Docker) {
+	d.mounts = append(d.mounts, fmt.Sprintf("--link=%s:%s", source.Name, target))
+}
+
+// RunOpts are options for running a container.
+type RunOpts struct {
+	// Image is the image relative to images/. This will be mangled
+	// appropriately, to ensure that only first-party images are used.
+	Image string
+
+	// Memory is the memory limit in kB.
+	Memory int
+
+	// Ports are the ports to be allocated.
+	Ports []int
+
+	// WorkDir sets the working directory.
+	WorkDir string
+
+	// ReadOnly sets the read-only flag.
+	ReadOnly bool
+
+	// Env are additional environment variables.
+	Env []string
+
+	// User is the user to use.
+	User string
+
+	// Privileged enables privileged mode.
+	Privileged bool
+
+	// CapAdd are the extra set of capabilities to add.
+	CapAdd []string
+
+	// CapDrop are the extra set of capabilities to drop.
+	CapDrop []string
+
+	// Pty indicates that a pty will be allocated. If this is non-nil, then
+	// this will run after start-up with the *exec.Command and Pty file
+	// passed in to the function.
+	Pty func(*exec.Cmd, *os.File)
+
+	// Foreground indicates that the container should be run in the
+	// foreground. If this is true, then the output will be available as a
+	// return value from the Run function.
+	Foreground bool
+
+	// Extra are extra arguments that may be passed.
+	Extra []string
+}
+
+// args returns common arguments.
+//
+// Note that this does not define the complete behavior.
+func (d *Docker) argsFor(r *RunOpts, command string, p []string) (rv []string) {
+	isExec := command == "exec"
+	isRun := command == "run"
+
+	if isRun || isExec {
+		rv = append(rv, "-i")
+	}
+	if r.Pty != nil {
+		rv = append(rv, "-t")
+	}
+	if r.User != "" {
+		rv = append(rv, fmt.Sprintf("--user=%s", r.User))
+	}
+	if r.Privileged {
+		rv = append(rv, "--privileged")
+	}
+	for _, c := range r.CapAdd {
+		rv = append(rv, fmt.Sprintf("--cap-add=%s", c))
+	}
+	for _, c := range r.CapDrop {
+		rv = append(rv, fmt.Sprintf("--cap-drop=%s", c))
+	}
+	for _, e := range r.Env {
+		rv = append(rv, fmt.Sprintf("--env=%s", e))
+	}
+	if r.WorkDir != "" {
+		rv = append(rv, fmt.Sprintf("--workdir=%s", r.WorkDir))
+	}
+	if !isExec {
+		if r.Memory != 0 {
+			rv = append(rv, fmt.Sprintf("--memory=%dk", r.Memory))
+		}
+		for _, p := range r.Ports {
+			rv = append(rv, fmt.Sprintf("--publish=%d", p))
+		}
+		if r.ReadOnly {
+			rv = append(rv, fmt.Sprintf("--read-only"))
+		}
+		if len(p) > 0 {
+			rv = append(rv, "--entrypoint=")
+		}
+	}
+
+	// Always attach the test environment & Extra.
+	rv = append(rv, fmt.Sprintf("--env=RUNSC_TEST_NAME=%s", d.Name))
+	rv = append(rv, r.Extra...)
+
+	// Attach necessary bits.
+	if isExec {
+		rv = append(rv, d.Name)
+	} else {
+		rv = append(rv, d.mounts...)
+		if len(d.Runtime) > 0 {
+			rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime))
+		}
+		rv = append(rv, fmt.Sprintf("--name=%s", d.Name))
+		rv = append(rv, testutil.ImageByName(r.Image))
+	}
+
+	// Attach other arguments.
+	rv = append(rv, p...)
+	return rv
+}
+
+// run runs a complete command.
+func (d *Docker) run(r RunOpts, command string, p ...string) (string, error) {
+	if d.copyErr != nil {
+		return "", d.copyErr
+	}
+	basicArgs := []string{"docker"}
+	if command == "spawn" {
+		command = "run"
+		basicArgs = append(basicArgs, command)
+		basicArgs = append(basicArgs, "-d")
+	} else {
+		basicArgs = append(basicArgs, command)
+	}
+	customArgs := d.argsFor(&r, command, p)
+	cmd := testutil.Command(d.logger, append(basicArgs, customArgs...)...)
+	if r.Pty != nil {
+		// If allocating a terminal, then we just ignore the output
+		// from the command.
+		ptmx, err := pty.Start(cmd.Cmd)
+		if err != nil {
+			return "", err
+		}
+		defer cmd.Wait() // Best effort.
+		r.Pty(cmd.Cmd, ptmx)
+	} else {
+		// Can't support PTY or streaming.
+		out, err := cmd.CombinedOutput()
+		return string(out), err
+	}
+	return "", nil
+}
+
+// Create calls 'docker create' with the arguments provided.
+func (d *Docker) Create(r RunOpts, args ...string) error {
+	out, err := d.run(r, "create", args...)
+	if strings.Contains(out, "Unable to find image") {
+		return fmt.Errorf("unable to find image, did you remember to `make load-%s`: %w", r.Image, err)
+	}
+	return err
+}
+
+// Start calls 'docker start'.
+func (d *Docker) Start() error {
+	return testutil.Command(d.logger, "docker", "start", d.Name).Run()
+}
+
+// Stop calls 'docker stop'.
+func (d *Docker) Stop() error {
+	return testutil.Command(d.logger, "docker", "stop", d.Name).Run()
+}
+
+// Run calls 'docker run' with the arguments provided.
+func (d *Docker) Run(r RunOpts, args ...string) (string, error) {
+	return d.run(r, "run", args...)
+}
+
+// Spawn starts the container and detaches.
+func (d *Docker) Spawn(r RunOpts, args ...string) error {
+	_, err := d.run(r, "spawn", args...)
+	return err
+}
+
+// Logs calls 'docker logs'.
+func (d *Docker) Logs() (string, error) {
+	// Don't capture the output; since it will swamp the logs.
+	out, err := exec.Command("docker", "logs", d.Name).CombinedOutput()
+	return string(out), err
+}
+
+// Exec calls 'docker exec' with the arguments provided.
+func (d *Docker) Exec(r RunOpts, args ...string) (string, error) {
+	return d.run(r, "exec", args...)
+}
+
+// Pause calls 'docker pause'.
+func (d *Docker) Pause() error {
+	return testutil.Command(d.logger, "docker", "pause", d.Name).Run()
+}
+
+// Unpause calls 'docker pause'.
+func (d *Docker) Unpause() error {
+	return testutil.Command(d.logger, "docker", "unpause", d.Name).Run()
+}
+
+// Checkpoint calls 'docker checkpoint'.
+func (d *Docker) Checkpoint(name string) error {
+	return testutil.Command(d.logger, "docker", "checkpoint", "create", d.Name, name).Run()
+}
+
+// Restore calls 'docker start --checkname [name]'.
+func (d *Docker) Restore(name string) error {
+	return testutil.Command(d.logger, "docker", "start", fmt.Sprintf("--checkpoint=%s", name), d.Name).Run()
+}
+
+// Kill calls 'docker kill'.
+func (d *Docker) Kill() error {
+	// Skip logging this command, it will likely be an error.
+	out, err := exec.Command("docker", "kill", d.Name).CombinedOutput()
+	if err != nil && !strings.Contains(string(out), "is not running") {
+		return err
+	}
+	return nil
+}
+
+// Remove calls 'docker rm'.
+func (d *Docker) Remove() error {
+	return testutil.Command(d.logger, "docker", "rm", d.Name).Run()
+}
+
+// CleanUp kills and deletes the container (best effort).
+func (d *Docker) CleanUp() {
+	// Kill the container.
+	if err := d.Kill(); err != nil {
+		// Just log; can't do anything here.
+		d.logger.Logf("error killing container %q: %v", d.Name, err)
+	}
+	// Remove the image.
+	if err := d.Remove(); err != nil {
+		d.logger.Logf("error removing container %q: %v", d.Name, err)
+	}
+	// Forget all mounts.
+	d.mounts = nil
+	// Execute all cleanups.
+	for _, c := range d.cleanups {
+		c()
+	}
+	d.cleanups = nil
+}
+
+// FindPort returns the host port that is mapped to 'sandboxPort'. This calls
+// docker to allocate a free port in the host and prevent conflicts.
+func (d *Docker) FindPort(sandboxPort int) (int, error) {
+	format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort)
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
+	if err != nil {
+		return -1, fmt.Errorf("error retrieving port: %v", err)
+	}
+	port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+	if err != nil {
+		return -1, fmt.Errorf("error parsing port %q: %v", out, err)
+	}
+	return port, nil
+}
+
+// FindIP returns the IP address of the container.
+func (d *Docker) FindIP() (net.IP, error) {
+	const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
+	if err != nil {
+		return net.IP{}, fmt.Errorf("error retrieving IP: %v", err)
+	}
+	ip := net.ParseIP(strings.TrimSpace(string(out)))
+	if ip == nil {
+		return net.IP{}, fmt.Errorf("invalid IP: %q", string(out))
+	}
+	return ip, nil
+}
+
+// A NetworkInterface is container's network interface information.
+type NetworkInterface struct {
+	IPv4 net.IP
+	MAC  net.HardwareAddr
+}
+
+// ListNetworks returns the network interfaces of the container, keyed by
+// Docker network name.
+func (d *Docker) ListNetworks() (map[string]NetworkInterface, error) {
+	const format = `{{json .NetworkSettings.Networks}}`
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("error network interfaces: %q: %w", string(out), err)
+	}
+
+	networks := map[string]map[string]string{}
+	if err := json.Unmarshal(out, &networks); err != nil {
+		return nil, fmt.Errorf("error decoding network interfaces: %w", err)
+	}
+
+	interfaces := map[string]NetworkInterface{}
+	for name, iface := range networks {
+		var netface NetworkInterface
+
+		rawIP := strings.TrimSpace(iface["IPAddress"])
+		if rawIP != "" {
+			ip := net.ParseIP(rawIP)
+			if ip == nil {
+				return nil, fmt.Errorf("invalid IP: %q", rawIP)
+			}
+			// Docker's IPAddress field is IPv4. The IPv6 address
+			// is stored in the GlobalIPv6Address field.
+			netface.IPv4 = ip
+		}
+
+		rawMAC := strings.TrimSpace(iface["MacAddress"])
+		if rawMAC != "" {
+			mac, err := net.ParseMAC(rawMAC)
+			if err != nil {
+				return nil, fmt.Errorf("invalid MAC: %q: %w", rawMAC, err)
+			}
+			netface.MAC = mac
+		}
+
+		interfaces[name] = netface
+	}
+
+	return interfaces, nil
+}
+
+// SandboxPid returns the PID to the sandbox process.
+func (d *Docker) SandboxPid() (int, error) {
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.State.Pid}}", d.Name).CombinedOutput()
+	if err != nil {
+		return -1, fmt.Errorf("error retrieving pid: %v", err)
+	}
+	pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+	if err != nil {
+		return -1, fmt.Errorf("error parsing pid %q: %v", out, err)
+	}
+	return pid, nil
+}
+
+// ID returns the container ID.
+func (d *Docker) ID() (string, error) {
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.Id}}", d.Name).CombinedOutput()
+	if err != nil {
+		return "", fmt.Errorf("error retrieving ID: %v", err)
+	}
+	return strings.TrimSpace(string(out)), nil
+}
+
+// Wait waits for container to exit, up to the given timeout. Returns error if
+// wait fails or timeout is hit. Returns the application return code otherwise.
+// Note that the application may have failed even if err == nil, always check
+// the exit code.
+func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) {
+	timeoutChan := time.After(timeout)
+	waitChan := make(chan (syscall.WaitStatus))
+	errChan := make(chan (error))
+
+	go func() {
+		out, err := testutil.Command(d.logger, "docker", "wait", d.Name).CombinedOutput()
+		if err != nil {
+			errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err)
+		}
+		exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
+		if err != nil {
+			errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err)
+		}
+		waitChan <- syscall.WaitStatus(uint32(exit))
+	}()
+
+	select {
+	case ws := <-waitChan:
+		return ws, nil
+	case err := <-errChan:
+		return syscall.WaitStatus(1), err
+	case <-timeoutChan:
+		return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name)
+	}
+}
+
+// WaitForOutput calls 'docker logs' to retrieve containers output and searches
+// for the given pattern.
+func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) {
+	matches, err := d.WaitForOutputSubmatch(pattern, timeout)
+	if err != nil {
+		return "", err
+	}
+	if len(matches) == 0 {
+		return "", nil
+	}
+	return matches[0], nil
+}
+
+// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and
+// searches for the given pattern. It returns any regexp submatches as well.
+func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) {
+	re := regexp.MustCompile(pattern)
+	var (
+		lastOut string
+		stopped bool
+	)
+	for exp := time.Now().Add(timeout); time.Now().Before(exp); {
+		out, err := d.Logs()
+		if err != nil {
+			return nil, err
+		}
+		if out != lastOut {
+			if lastOut == "" {
+				d.logger.Logf("output (start): %s", out)
+			} else if strings.HasPrefix(out, lastOut) {
+				d.logger.Logf("output (contn): %s", out[len(lastOut):])
+			} else {
+				d.logger.Logf("output (trunc): %s", out)
+			}
+			lastOut = out // Save for future.
+			if matches := re.FindStringSubmatch(lastOut); matches != nil {
+				return matches, nil // Success!
+			}
+		} else if stopped {
+			// The sandbox stopped and we looked at the
+			// logs at least once since determining that.
+			return nil, fmt.Errorf("no longer running: %v", err)
+		} else if pid, err := d.SandboxPid(); pid == 0 || err != nil {
+			// The sandbox may have stopped, but it's
+			// possible that it has emitted the terminal
+			// line between the last call to Logs and here.
+			stopped = true
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+	return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), lastOut)
+}
diff --git a/runsc/testutil/BUILD b/pkg/test/testutil/BUILD
index 945405303..03b1b4677 100644
--- a/runsc/testutil/BUILD
+++ b/pkg/test/testutil/BUILD
@@ -11,7 +11,6 @@ go_library(
     ],
     visibility = ["//:sandbox"],
     deps = [
-        "//pkg/log",
         "//pkg/sync",
         "//runsc/boot",
         "//runsc/specutils",
diff --git a/runsc/testutil/testutil.go b/pkg/test/testutil/testutil.go
index 92d677e71..ee8c78014 100644
--- a/runsc/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -25,22 +25,24 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"log"
 	"math"
 	"math/rand"
 	"net/http"
 	"os"
 	"os/exec"
 	"os/signal"
+	"path"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"sync/atomic"
 	"syscall"
+	"testing"
 	"time"
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -50,15 +52,27 @@ var (
 	checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support")
 )
 
-func init() {
-	rand.Seed(time.Now().UnixNano())
-}
-
 // IsCheckpointSupported returns the relevant command line flag.
 func IsCheckpointSupported() bool {
 	return *checkpoint
 }
 
+// ImageByName mangles the image name used locally. This depends on the image
+// build infrastructure in images/ and tools/vm.
+func ImageByName(name string) string {
+	return fmt.Sprintf("gvisor.dev/images/%s", name)
+}
+
+// ConfigureExePath configures the executable for runsc in the test environment.
+func ConfigureExePath() error {
+	path, err := FindFile("runsc/runsc")
+	if err != nil {
+		return err
+	}
+	specutils.ExePath = path
+	return nil
+}
+
 // TmpDir returns the absolute path to a writable directory that can be used as
 // scratch by the test.
 func TmpDir() string {
@@ -69,36 +83,74 @@ func TmpDir() string {
 	return dir
 }
 
-// ConfigureExePath configures the executable for runsc in the test environment.
-func ConfigureExePath() error {
-	path, err := FindFile("runsc/runsc")
+// Logger is a simple logging wrapper.
+//
+// This is designed to be implemented by *testing.T.
+type Logger interface {
+	Name() string
+	Logf(fmt string, args ...interface{})
+}
+
+// DefaultLogger logs using the log package.
+type DefaultLogger string
+
+// Name implements Logger.Name.
+func (d DefaultLogger) Name() string {
+	return string(d)
+}
+
+// Logf implements Logger.Logf.
+func (d DefaultLogger) Logf(fmt string, args ...interface{}) {
+	log.Printf(fmt, args...)
+}
+
+// Cmd is a simple wrapper.
+type Cmd struct {
+	logger Logger
+	*exec.Cmd
+}
+
+// CombinedOutput returns the output and logs.
+func (c *Cmd) CombinedOutput() ([]byte, error) {
+	out, err := c.Cmd.CombinedOutput()
+	if len(out) > 0 {
+		c.logger.Logf("output: %s", string(out))
+	}
 	if err != nil {
-		return err
+		c.logger.Logf("error: %v", err)
+	}
+	return out, err
+}
+
+// Command is a simple wrapper around exec.Command, that logs.
+func Command(logger Logger, args ...string) *Cmd {
+	logger.Logf("command: %s", strings.Join(args, " "))
+	return &Cmd{
+		logger: logger,
+		Cmd:    exec.Command(args[0], args[1:]...),
 	}
-	specutils.ExePath = path
-	return nil
 }
 
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
-func TestConfig() *boot.Config {
-	logDir := ""
+func TestConfig(t *testing.T) *boot.Config {
+	logDir := os.TempDir()
 	if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
 		logDir = dir + "/"
 	}
 	return &boot.Config{
-		Debug:           true,
-		DebugLog:        logDir,
-		LogFormat:       "text",
-		DebugLogFormat:  "text",
-		AlsoLogToStderr: true,
-		LogPackets:      true,
-		Network:         boot.NetworkNone,
-		Strace:          true,
-		Platform:        "ptrace",
-		FileAccess:      boot.FileAccessExclusive,
+		Debug:              true,
+		DebugLog:           path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"),
+		LogFormat:          "text",
+		DebugLogFormat:     "text",
+		LogPackets:         true,
+		Network:            boot.NetworkNone,
+		Strace:             true,
+		Platform:           "ptrace",
+		FileAccess:         boot.FileAccessExclusive,
+		NumNetworkChannels: 1,
+
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
-		NumNetworkChannels:                         1,
 	}
 }
 
@@ -141,37 +193,45 @@ func NewSpecWithArgs(args ...string) *specs.Spec {
 }
 
 // SetupRootDir creates a root directory for containers.
-func SetupRootDir() (string, error) {
+func SetupRootDir() (string, func(), error) {
 	rootDir, err := ioutil.TempDir(TmpDir(), "containers")
 	if err != nil {
-		return "", fmt.Errorf("error creating root dir: %v", err)
+		return "", nil, fmt.Errorf("error creating root dir: %v", err)
 	}
-	return rootDir, nil
+	return rootDir, func() { os.RemoveAll(rootDir) }, nil
 }
 
 // SetupContainer creates a bundle and root dir for the container, generates a
 // test config, and writes the spec to config.json in the bundle dir.
-func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) {
-	rootDir, err = SetupRootDir()
+func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, cleanup func(), err error) {
+	rootDir, rootCleanup, err := SetupRootDir()
 	if err != nil {
-		return "", "", err
+		return "", "", nil, err
 	}
 	conf.RootDir = rootDir
-	bundleDir, err = SetupBundleDir(spec)
-	return rootDir, bundleDir, err
+	bundleDir, bundleCleanup, err := SetupBundleDir(spec)
+	if err != nil {
+		rootCleanup()
+		return "", "", nil, err
+	}
+	return rootDir, bundleDir, func() {
+		bundleCleanup()
+		rootCleanup()
+	}, err
 }
 
 // SetupBundleDir creates a bundle dir and writes the spec to config.json.
-func SetupBundleDir(spec *specs.Spec) (bundleDir string, err error) {
-	bundleDir, err = ioutil.TempDir(TmpDir(), "bundle")
+func SetupBundleDir(spec *specs.Spec) (string, func(), error) {
+	bundleDir, err := ioutil.TempDir(TmpDir(), "bundle")
 	if err != nil {
-		return "", fmt.Errorf("error creating bundle dir: %v", err)
+		return "", nil, fmt.Errorf("error creating bundle dir: %v", err)
 	}
-
-	if err = writeSpec(bundleDir, spec); err != nil {
-		return "", fmt.Errorf("error writing spec: %v", err)
+	cleanup := func() { os.RemoveAll(bundleDir) }
+	if err := writeSpec(bundleDir, spec); err != nil {
+		cleanup()
+		return "", nil, fmt.Errorf("error writing spec: %v", err)
 	}
-	return bundleDir, nil
+	return bundleDir, cleanup, nil
 }
 
 // writeSpec writes the spec to disk in the given directory.
@@ -183,22 +243,25 @@ func writeSpec(dir string, spec *specs.Spec) error {
 	return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755)
 }
 
-// UniqueContainerID generates a unique container id for each test.
-//
-// The container id is used to create an abstract unix domain socket, which must
-// be unique.  While the container forbids creating two containers with the same
-// name, sometimes between test runs the socket does not get cleaned up quickly
-// enough, causing container creation to fail.
-func UniqueContainerID() string {
+// RandomID returns 20 random bytes following the given prefix.
+func RandomID(prefix string) string {
 	// Read 20 random bytes.
 	b := make([]byte, 20)
 	// "[Read] always returns len(p) and a nil error." --godoc
 	if _, err := rand.Read(b); err != nil {
 		panic("rand.Read failed: " + err.Error())
 	}
-	// base32 encode the random bytes, so that the name is a valid
-	// container id and can be used as a socket name in the filesystem.
-	return fmt.Sprintf("test-container-%s", base32.StdEncoding.EncodeToString(b))
+	return fmt.Sprintf("%s-%s", prefix, base32.StdEncoding.EncodeToString(b))
+}
+
+// RandomContainerID generates a random container id for each test.
+//
+// The container id is used to create an abstract unix domain socket, which
+// must be unique. While the container forbids creating two containers with the
+// same name, sometimes between test runs the socket does not get cleaned up
+// quickly enough, causing container creation to fail.
+func RandomContainerID() string {
+	return RandomID("test-container-")
 }
 
 // Copy copies file from src to dst.
@@ -209,12 +272,39 @@ func Copy(src, dst string) error {
 	}
 	defer in.Close()
 
-	out, err := os.Create(dst)
+	st, err := in.Stat()
+	if err != nil {
+		return err
+	}
+
+	out, err := os.OpenFile(dst, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, st.Mode().Perm())
 	if err != nil {
 		return err
 	}
 	defer out.Close()
 
+	// Mirror the local user's permissions across all users. This is
+	// because as we inject things into the container, the UID/GID will
+	// change. Also, the build system may generate artifacts with different
+	// modes. At the top-level (volume mapping) we have a big read-only
+	// knob that can be applied to prevent modifications.
+	//
+	// Note that this must be done via a separate Chmod call, otherwise the
+	// current process's umask will get in the way.
+	var mode os.FileMode
+	if st.Mode()&0100 != 0 {
+		mode |= 0111
+	}
+	if st.Mode()&0200 != 0 {
+		mode |= 0222
+	}
+	if st.Mode()&0400 != 0 {
+		mode |= 0444
+	}
+	if err := os.Chmod(dst, mode); err != nil {
+		return err
+	}
+
 	_, err = io.Copy(out, in)
 	return err
 }
@@ -237,7 +327,7 @@ func WaitForHTTP(port int, timeout time.Duration) error {
 		url := fmt.Sprintf("http://localhost:%d/", port)
 		resp, err := c.Get(url)
 		if err != nil {
-			log.Infof("Waiting %s: %v", url, err)
+			log.Printf("Waiting %s: %v", url, err)
 			return err
 		}
 		resp.Body.Close()
@@ -347,6 +437,8 @@ func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time
 // KillCommand kills the process running cmd unless it hasn't been started. It
 // returns an error if it cannot kill the process unless the reason is that the
 // process has already exited.
+//
+// KillCommand will also reap the process.
 func KillCommand(cmd *exec.Cmd) error {
 	if cmd.Process == nil {
 		return nil
@@ -356,26 +448,21 @@ func KillCommand(cmd *exec.Cmd) error {
 			return fmt.Errorf("failed to kill process %v: %v", cmd, err)
 		}
 	}
-	return nil
+	return cmd.Wait()
 }
 
 // WriteTmpFile writes text to a temporary file, closes the file, and returns
-// the name of the file.
-func WriteTmpFile(pattern, text string) (string, error) {
+// the name of the file. A cleanup function is also returned.
+func WriteTmpFile(pattern, text string) (string, func(), error) {
 	file, err := ioutil.TempFile(TmpDir(), pattern)
 	if err != nil {
-		return "", err
+		return "", nil, err
 	}
 	defer file.Close()
 	if _, err := file.Write([]byte(text)); err != nil {
-		return "", err
+		return "", nil, err
 	}
-	return file.Name(), nil
-}
-
-// RandomName create a name with a 6 digit random number appended to it.
-func RandomName(prefix string) string {
-	return fmt.Sprintf("%s-%06d", prefix, rand.Int31n(1000000))
+	return file.Name(), func() { os.RemoveAll(file.Name()) }, nil
 }
 
 // IsStatic returns true iff the given file is a static binary.
diff --git a/runsc/testutil/testutil_runfiles.go b/pkg/test/testutil/testutil_runfiles.go
index ece9ea9a1..ece9ea9a1 100644
--- a/runsc/testutil/testutil_runfiles.go
+++ b/pkg/test/testutil/testutil_runfiles.go
diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD
deleted file mode 100644
index 2dcba84ae..000000000
--- a/pkg/tmutex/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-load("//tools:defs.bzl", "go_library", "go_test")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "tmutex",
-    srcs = ["tmutex.go"],
-    visibility = ["//:sandbox"],
-)
-
-go_test(
-    name = "tmutex_test",
-    size = "medium",
-    srcs = ["tmutex_test.go"],
-    library = ":tmutex",
-    deps = ["//pkg/sync"],
-)
diff --git a/pkg/tmutex/tmutex.go b/pkg/tmutex/tmutex.go
deleted file mode 100644
index c4685020d..000000000
--- a/pkg/tmutex/tmutex.go
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package tmutex provides the implementation of a mutex that implements an
-// efficient TryLock function in addition to Lock and Unlock.
-package tmutex
-
-import (
-	"sync/atomic"
-)
-
-// Mutex is a mutual exclusion primitive that implements TryLock in addition
-// to Lock and Unlock.
-type Mutex struct {
-	v  int32
-	ch chan struct{}
-}
-
-// Init initializes the mutex.
-func (m *Mutex) Init() {
-	m.v = 1
-	m.ch = make(chan struct{}, 1)
-}
-
-// Lock acquires the mutex. If it is currently held by another goroutine, Lock
-// will wait until it has a chance to acquire it.
-func (m *Mutex) Lock() {
-	// Uncontended case.
-	if atomic.AddInt32(&m.v, -1) == 0 {
-		return
-	}
-
-	for {
-		// Try to acquire the mutex again, at the same time making sure
-		// that m.v is negative, which indicates to the owner of the
-		// lock that it is contended, which will force it to try to wake
-		// someone up when it releases the mutex.
-		if v := atomic.LoadInt32(&m.v); v >= 0 && atomic.SwapInt32(&m.v, -1) == 1 {
-			return
-		}
-
-		// Wait for the mutex to be released before trying again.
-		<-m.ch
-	}
-}
-
-// TryLock attempts to acquire the mutex without blocking. If the mutex is
-// currently held by another goroutine, it fails to acquire it and returns
-// false.
-func (m *Mutex) TryLock() bool {
-	v := atomic.LoadInt32(&m.v)
-	if v <= 0 {
-		return false
-	}
-	return atomic.CompareAndSwapInt32(&m.v, 1, 0)
-}
-
-// Unlock releases the mutex.
-func (m *Mutex) Unlock() {
-	if atomic.SwapInt32(&m.v, 1) == 0 {
-		// There were no pending waiters.
-		return
-	}
-
-	// Wake some waiter up.
-	select {
-	case m.ch <- struct{}{}:
-	default:
-	}
-}
diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go
deleted file mode 100644
index 05540696a..000000000
--- a/pkg/tmutex/tmutex_test.go
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tmutex
-
-import (
-	"fmt"
-	"runtime"
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-func TestBasicLock(t *testing.T) {
-	var m Mutex
-	m.Init()
-
-	m.Lock()
-
-	// Try blocking lock the mutex from a different goroutine. This must
-	// not block because the mutex is held.
-	ch := make(chan struct{}, 1)
-	go func() {
-		m.Lock()
-		ch <- struct{}{}
-		m.Unlock()
-		ch <- struct{}{}
-	}()
-
-	select {
-	case <-ch:
-		t.Fatalf("Lock succeeded on locked mutex")
-	case <-time.After(100 * time.Millisecond):
-	}
-
-	// Unlock the mutex and make sure that the goroutine waiting on Lock()
-	// unblocks and succeeds.
-	m.Unlock()
-
-	select {
-	case <-ch:
-	case <-time.After(100 * time.Millisecond):
-		t.Fatalf("Lock failed to acquire unlocked mutex")
-	}
-
-	// Make sure we can lock and unlock again.
-	m.Lock()
-	m.Unlock()
-}
-
-func TestTryLock(t *testing.T) {
-	var m Mutex
-	m.Init()
-
-	// Try to lock. It should succeed.
-	if !m.TryLock() {
-		t.Fatalf("TryLock failed on unlocked mutex")
-	}
-
-	// Try to lock again, it should now fail.
-	if m.TryLock() {
-		t.Fatalf("TryLock succeeded on locked mutex")
-	}
-
-	// Try blocking lock the mutex from a different goroutine. This must
-	// not block because the mutex is held.
-	ch := make(chan struct{}, 1)
-	go func() {
-		m.Lock()
-		ch <- struct{}{}
-		m.Unlock()
-	}()
-
-	select {
-	case <-ch:
-		t.Fatalf("Lock succeeded on locked mutex")
-	case <-time.After(100 * time.Millisecond):
-	}
-
-	// Unlock the mutex and make sure that the goroutine waiting on Lock()
-	// unblocks and succeeds.
-	m.Unlock()
-
-	select {
-	case <-ch:
-	case <-time.After(100 * time.Millisecond):
-		t.Fatalf("Lock failed to acquire unlocked mutex")
-	}
-}
-
-func TestMutualExclusion(t *testing.T) {
-	var m Mutex
-	m.Init()
-
-	// Test mutual exclusion by running "gr" goroutines concurrently, and
-	// have each one increment a counter "iters" times within the critical
-	// section established by the mutex.
-	//
-	// If at the end the counter is not gr * iters, then we know that
-	// goroutines ran concurrently within the critical section.
-	//
-	// If one of the goroutines doesn't complete, it's likely a bug that
-	// causes to it to wait forever.
-	const gr = 1000
-	const iters = 100000
-	v := 0
-	var wg sync.WaitGroup
-	for i := 0; i < gr; i++ {
-		wg.Add(1)
-		go func() {
-			for j := 0; j < iters; j++ {
-				m.Lock()
-				v++
-				m.Unlock()
-			}
-			wg.Done()
-		}()
-	}
-
-	wg.Wait()
-
-	if v != gr*iters {
-		t.Fatalf("Bad count: got %v, want %v", v, gr*iters)
-	}
-}
-
-func TestMutualExclusionWithTryLock(t *testing.T) {
-	var m Mutex
-	m.Init()
-
-	// Similar to the previous, with the addition of some goroutines that
-	// only increment the count if TryLock succeeds.
-	const gr = 1000
-	const iters = 100000
-	total := int64(gr * iters)
-	var tryTotal int64
-	v := int64(0)
-	var wg sync.WaitGroup
-	for i := 0; i < gr; i++ {
-		wg.Add(2)
-		go func() {
-			for j := 0; j < iters; j++ {
-				m.Lock()
-				v++
-				m.Unlock()
-			}
-			wg.Done()
-		}()
-		go func() {
-			local := int64(0)
-			for j := 0; j < iters; j++ {
-				if m.TryLock() {
-					v++
-					m.Unlock()
-					local++
-				}
-			}
-			atomic.AddInt64(&tryTotal, local)
-			wg.Done()
-		}()
-	}
-
-	wg.Wait()
-
-	t.Logf("tryTotal = %d", tryTotal)
-	total += tryTotal
-
-	if v != total {
-		t.Fatalf("Bad count: got %v, want %v", v, total)
-	}
-}
-
-// BenchmarkTmutex is equivalent to TestMutualExclusion, with the following
-// differences:
-//
-// - The number of goroutines is variable, with the maximum value depending on
-// GOMAXPROCS.
-//
-// - The number of iterations per benchmark is controlled by the benchmarking
-// framework.
-//
-// - Care is taken to ensure that all goroutines participating in the benchmark
-// have been created before the benchmark begins.
-func BenchmarkTmutex(b *testing.B) {
-	for n, max := 1, 4*runtime.GOMAXPROCS(0); n > 0 && n <= max; n *= 2 {
-		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
-			var m Mutex
-			m.Init()
-
-			var ready sync.WaitGroup
-			begin := make(chan struct{})
-			var end sync.WaitGroup
-			for i := 0; i < n; i++ {
-				ready.Add(1)
-				end.Add(1)
-				go func() {
-					ready.Done()
-					<-begin
-					for j := 0; j < b.N; j++ {
-						m.Lock()
-						m.Unlock()
-					}
-					end.Done()
-				}()
-			}
-
-			ready.Wait()
-			b.ResetTimer()
-			close(begin)
-			end.Wait()
-		})
-	}
-}
-
-// BenchmarkSyncMutex is equivalent to BenchmarkTmutex, but uses sync.Mutex as
-// a comparison point.
-func BenchmarkSyncMutex(b *testing.B) {
-	for n, max := 1, 4*runtime.GOMAXPROCS(0); n > 0 && n <= max; n *= 2 {
-		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
-			var m sync.Mutex
-
-			var ready sync.WaitGroup
-			begin := make(chan struct{})
-			var end sync.WaitGroup
-			for i := 0; i < n; i++ {
-				ready.Add(1)
-				end.Add(1)
-				go func() {
-					ready.Done()
-					<-begin
-					for j := 0; j < b.N; j++ {
-						m.Lock()
-						m.Unlock()
-					}
-					end.Done()
-				}()
-			}
-
-			ready.Wait()
-			b.ResetTimer()
-			close(begin)
-			end.Wait()
-		})
-	}
-}
diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD
index ff8b9e91a..6c9ada9c7 100644
--- a/pkg/usermem/BUILD
+++ b/pkg/usermem/BUILD
@@ -25,7 +25,6 @@ go_library(
         "bytes_io_unsafe.go",
         "usermem.go",
         "usermem_arm64.go",
-        "usermem_unsafe.go",
         "usermem_x86.go",
     ],
     visibility = ["//:sandbox"],
@@ -33,6 +32,7 @@ go_library(
         "//pkg/atomicbitops",
         "//pkg/binary",
         "//pkg/context",
+        "//pkg/gohacks",
         "//pkg/log",
         "//pkg/safemem",
         "//pkg/syserror",
diff --git a/pkg/usermem/addr.go b/pkg/usermem/addr.go
index e79210804..c4100481e 100644
--- a/pkg/usermem/addr.go
+++ b/pkg/usermem/addr.go
@@ -106,3 +106,20 @@ func (ar AddrRange) IsPageAligned() bool {
 func (ar AddrRange) String() string {
 	return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End)
 }
+
+// PageRoundDown/Up are equivalent to Addr.RoundDown/Up, but without the
+// potentially truncating conversion from uint64 to Addr. This is necessary
+// because there is no way to define generic "PageRoundDown/Up" functions in Go.
+
+// PageRoundDown returns x rounded down to the nearest page boundary.
+func PageRoundDown(x uint64) uint64 {
+	return x &^ (PageSize - 1)
+}
+
+// PageRoundUp returns x rounded up to the nearest page boundary.
+// ok is true iff rounding up did not wrap around.
+func PageRoundUp(x uint64) (addr uint64, ok bool) {
+	addr = PageRoundDown(x + PageSize - 1)
+	ok = addr >= x
+	return
+}
diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go
index 71fd4e155..cd6a0ea6b 100644
--- a/pkg/usermem/usermem.go
+++ b/pkg/usermem/usermem.go
@@ -23,14 +23,12 @@ import (
 
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // IO provides access to the contents of a virtual memory space.
-//
-// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any
-// meaningful data.
 type IO interface {
 	// CopyOut copies len(src) bytes from src to the memory mapped at addr. It
 	// returns the number of bytes copied. If the number of bytes copied is <
@@ -251,7 +249,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 		}
 		end, ok := addr.AddLength(uint64(readlen))
 		if !ok {
-			return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
+			return gohacks.StringFromImmutableBytes(buf[:done]), syserror.EFAULT
 		}
 		// Shorten the read to avoid crossing page boundaries, since faulting
 		// in a page unnecessarily is expensive. This also ensures that partial
@@ -272,16 +270,16 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
 		// Look for the terminating zero byte, which may have occurred before
 		// hitting err.
 		if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 {
-			return stringFromImmutableBytes(buf[:done+i]), nil
+			return gohacks.StringFromImmutableBytes(buf[:done+i]), nil
 		}
 
 		done += n
 		if err != nil {
-			return stringFromImmutableBytes(buf[:done]), err
+			return gohacks.StringFromImmutableBytes(buf[:done]), err
 		}
 		addr = end
 	}
-	return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG
+	return gohacks.StringFromImmutableBytes(buf), syserror.ENAMETOOLONG
 }
 
 // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The
diff --git a/pkg/usermem/usermem_unsafe.go b/pkg/usermem/usermem_unsafe.go
deleted file mode 100644
index 876783e78..000000000
--- a/pkg/usermem/usermem_unsafe.go
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package usermem
-
-import (
-	"unsafe"
-)
-
-// stringFromImmutableBytes is equivalent to string(bs), except that it never
-// copies even if escape analysis can't prove that bs does not escape. This is
-// only valid if bs is never mutated after stringFromImmutableBytes returns.
-func stringFromImmutableBytes(bs []byte) string {
-	// Compare strings.Builder.String().
-	return *(*string)(unsafe.Pointer(&bs))
-}
diff --git a/pkg/usermem/usermem_x86.go b/pkg/usermem/usermem_x86.go
index 8059b72d2..d96f829fb 100644
--- a/pkg/usermem/usermem_x86.go
+++ b/pkg/usermem/usermem_x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64 i386
+// +build amd64 386
 
 package usermem
 
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index ae4dd102a..a907c103b 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -12,18 +12,15 @@ go_library(
         "controller.go",
         "debug.go",
         "events.go",
-        "fds.go",
         "fs.go",
         "limits.go",
         "loader.go",
-        "loader_amd64.go",
-        "loader_arm64.go",
         "network.go",
-        "pprof.go",
         "strace.go",
-        "user.go",
+        "vfs.go",
     ],
     visibility = [
+        "//pkg/test:__subpackages__",
         "//runsc:__subpackages__",
         "//test:__subpackages__",
     ],
@@ -34,6 +31,7 @@ go_library(
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
         "//pkg/rand",
@@ -41,6 +39,8 @@ go_library(
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/control",
+        "//pkg/sentry/devices/memdev",
+        "//pkg/sentry/fdimport",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/dev",
         "//pkg/sentry/fs/gofer",
@@ -50,6 +50,14 @@ go_library(
         "//pkg/sentry/fs/sys",
         "//pkg/sentry/fs/tmpfs",
         "//pkg/sentry/fs/tty",
+        "//pkg/sentry/fs/user",
+        "//pkg/sentry/fsimpl/devpts",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/gofer",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/fsimpl/proc",
+        "//pkg/sentry/fsimpl/sys",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel:uncaught_signal_go_proto",
@@ -67,17 +75,18 @@ go_library(
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/state",
         "//pkg/sentry/strace",
-        "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/syscalls/linux/vfs2",
         "//pkg/sentry/time",
         "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
         "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/tcpip/link/fdbased",
         "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/qdisc/fifo",
         "//pkg/tcpip/link/sniffer",
         "//pkg/tcpip/network/arp",
         "//pkg/tcpip/network/ipv4",
@@ -88,9 +97,9 @@ go_library(
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/urpc",
-        "//pkg/usermem",
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
+        "//runsc/boot/pprof",
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
@@ -105,19 +114,20 @@ go_test(
         "compat_test.go",
         "fs_test.go",
         "loader_test.go",
-        "user_test.go",
     ],
     library = ":boot",
     deps = [
         "//pkg/control/server",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
-        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/unet",
         "//runsc/fsgofer",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 8995d678e..84c67cbc2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -65,7 +65,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) {
 
 	if logFD > 0 {
 		f := os.NewFile(uintptr(logFD), "user log file")
-		target := &log.MultiEmitter{c.sink, &log.K8sJSONEmitter{log.Writer{Next: f}}}
+		target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}}
 		c.sink = &log.BasicLogger{Level: log.Info, Emitter: target}
 	}
 	return c, nil
@@ -119,7 +119,13 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
 	}
 
 	if tr.shouldReport(regs) {
-		c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
+		name := c.nameMap.Name(uintptr(sysnr))
+		c.sink.Infof("Unsupported syscall %s(%#x,%#x,%#x,%#x,%#x,%#x). It is "+
+			"likely that you can safely ignore this message and that this is not "+
+			"the cause of any error. Please, refer to %s/%s for more information.",
+			name, argVal(0, regs), argVal(1, regs), argVal(2, regs), argVal(3, regs),
+			argVal(4, regs), argVal(5, regs), syscallLink, name)
+
 		tr.onReported(regs)
 	}
 }
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 42b0ca8b0..8eb76b2ba 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -24,8 +24,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/strace"
 )
 
-// reportLimit is the max number of events that should be reported per tracker.
-const reportLimit = 100
+const (
+	// reportLimit is the max number of events that should be reported per
+	// tracker.
+	reportLimit = 100
+	syscallLink = "https://gvisor.dev/c/linux/amd64"
+)
 
 // newRegs create a empty Registers instance.
 func newRegs() *rpb.Registers {
@@ -36,22 +40,22 @@ func newRegs() *rpb.Registers {
 	}
 }
 
-func argVal(argIdx int, regs *rpb.Registers) uint32 {
+func argVal(argIdx int, regs *rpb.Registers) uint64 {
 	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
 
 	switch argIdx {
 	case 0:
-		return uint32(amd64Regs.Rdi)
+		return amd64Regs.Rdi
 	case 1:
-		return uint32(amd64Regs.Rsi)
+		return amd64Regs.Rsi
 	case 2:
-		return uint32(amd64Regs.Rdx)
+		return amd64Regs.Rdx
 	case 3:
-		return uint32(amd64Regs.R10)
+		return amd64Regs.R10
 	case 4:
-		return uint32(amd64Regs.R8)
+		return amd64Regs.R8
 	case 5:
-		return uint32(amd64Regs.R9)
+		return amd64Regs.R9
 	}
 	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
index f784cd237..bce9d95b3 100644
--- a/runsc/boot/compat_arm64.go
+++ b/runsc/boot/compat_arm64.go
@@ -23,8 +23,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/strace"
 )
 
-// reportLimit is the max number of events that should be reported per tracker.
-const reportLimit = 100
+const (
+	// reportLimit is the max number of events that should be reported per
+	// tracker.
+	reportLimit = 100
+	syscallLink = "https://gvisor.dev/c/linux/arm64"
+)
 
 // newRegs create a empty Registers instance.
 func newRegs() *rpb.Registers {
@@ -35,22 +39,22 @@ func newRegs() *rpb.Registers {
 	}
 }
 
-func argVal(argIdx int, regs *rpb.Registers) uint32 {
+func argVal(argIdx int, regs *rpb.Registers) uint64 {
 	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
 
 	switch argIdx {
 	case 0:
-		return uint32(arm64Regs.R0)
+		return arm64Regs.R0
 	case 1:
-		return uint32(arm64Regs.R1)
+		return arm64Regs.R1
 	case 2:
-		return uint32(arm64Regs.R2)
+		return arm64Regs.R2
 	case 3:
-		return uint32(arm64Regs.R3)
+		return arm64Regs.R3
 	case 4:
-		return uint32(arm64Regs.R4)
+		return arm64Regs.R4
 	case 5:
-		return uint32(arm64Regs.R5)
+		return arm64Regs.R5
 	}
 	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 35391030f..bcec7e4db 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -158,6 +158,9 @@ type Config struct {
 	// DebugLog is the path to log debug information to, if not empty.
 	DebugLog string
 
+	// PanicLog is the path to log GO's runtime messages, if not empty.
+	PanicLog string
+
 	// DebugLogFormat is the log format for debug.
 	DebugLogFormat string
 
@@ -184,6 +187,10 @@ type Config struct {
 	// SoftwareGSO indicates that software segmentation offload is enabled.
 	SoftwareGSO bool
 
+	// QDisc indicates the type of queuening discipline to use by default
+	// for non-loopback interfaces.
+	QDisc QueueingDiscipline
+
 	// LogPackets indicates that all network packets should be logged.
 	LogPackets bool
 
@@ -234,8 +241,10 @@ type Config struct {
 	// ReferenceLeakMode sets reference leak check mode
 	ReferenceLeakMode refs.LeakMode
 
-	// OverlayfsStaleRead causes cached FDs to reopen after a file is opened for
-	// write to workaround overlayfs limitation on kernels before 4.19.
+	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
+	// is on a Linux overlayfs mount, which does not necessarily preserve
+	// coherence between read-only and subsequent writable file descriptors
+	// representing the "same" file.
 	OverlayfsStaleRead bool
 
 	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
@@ -269,6 +278,7 @@ func (c *Config) ToFlags() []string {
 		"--log=" + c.LogFilename,
 		"--log-format=" + c.LogFormat,
 		"--debug-log=" + c.DebugLog,
+		"--panic-log=" + c.PanicLog,
 		"--debug-log-format=" + c.DebugLogFormat,
 		"--file-access=" + c.FileAccess.String(),
 		"--overlay=" + strconv.FormatBool(c.Overlay),
@@ -290,6 +300,7 @@ func (c *Config) ToFlags() []string {
 		"--gso=" + strconv.FormatBool(c.HardwareGSO),
 		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
 		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
+		"--qdisc=" + c.QDisc.String(),
 	}
 	if c.CPUNumFromQuota {
 		f = append(f, "--cpu-num-from-quota")
@@ -301,5 +312,10 @@ func (c *Config) ToFlags() []string {
 	if len(c.TestOnlyTestNameEnv) != 0 {
 		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
 	}
+
+	if c.VFS2 {
+		f = append(f, "--vfs2=true")
+	}
+
 	return f
 }
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 9c9e94864..8125d5061 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -100,11 +101,14 @@ const (
 
 // Profiling related commands (see pprof.go for more details).
 const (
-	StartCPUProfile = "Profile.StartCPUProfile"
-	StopCPUProfile  = "Profile.StopCPUProfile"
-	HeapProfile     = "Profile.HeapProfile"
-	StartTrace      = "Profile.StartTrace"
-	StopTrace       = "Profile.StopTrace"
+	StartCPUProfile  = "Profile.StartCPUProfile"
+	StopCPUProfile   = "Profile.StopCPUProfile"
+	HeapProfile      = "Profile.HeapProfile"
+	GoroutineProfile = "Profile.GoroutineProfile"
+	BlockProfile     = "Profile.BlockProfile"
+	MutexProfile     = "Profile.MutexProfile"
+	StartTrace       = "Profile.StartTrace"
+	StopTrace        = "Profile.StopTrace"
 )
 
 // Logging related commands (see logging.go for more details).
@@ -142,7 +146,7 @@ func newController(fd int, l *Loader) (*controller, error) {
 	}
 	srv.Register(manager)
 
-	if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
+	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
@@ -341,7 +345,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("creating memory file: %v", err)
 	}
 	k.SetMemoryFile(mf)
-	networkStack := cm.l.k.NetworkStack()
+	networkStack := cm.l.k.RootNetworkNamespace().Stack()
 	cm.l.k = k
 
 	// Set up the restore environment.
@@ -365,9 +369,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	}
 
 	if cm.l.conf.ProfileEnable {
-		// initializePProf opens /proc/self/maps, so has to be
-		// called before installing seccomp filters.
-		initializePProf()
+		// pprof.Initialize opens /proc/self/maps, so has to be called before
+		// installing seccomp filters.
+		pprof.Initialize()
 	}
 
 	// Seccomp filters have to be applied before parsing the state file.
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
deleted file mode 100644
index 417d2d5fb..000000000
--- a/runsc/boot/fds.go
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/host"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-)
-
-// createFDTable creates an FD table that contains stdin, stdout, and stderr.
-// If console is true, then ioctl calls will be passed through to the host FD.
-// Upon success, createFDMap dups then closes stdioFDs.
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
-	if len(stdioFDs) != 3 {
-		return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
-	}
-
-	k := kernel.KernelFromContext(ctx)
-	fdTable := k.NewFDTable()
-	defer fdTable.DecRef()
-	mounter := fs.FileOwnerFromContext(ctx)
-
-	var ttyFile *fs.File
-	for appFD, hostFD := range stdioFDs {
-		var appFile *fs.File
-
-		if console && appFD < 3 {
-			// Import the file as a host TTY file.
-			if ttyFile == nil {
-				var err error
-				appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */)
-				if err != nil {
-					return nil, err
-				}
-				defer appFile.DecRef()
-
-				// Remember this in the TTY file, as we will
-				// use it for the other stdio FDs.
-				ttyFile = appFile
-			} else {
-				// Re-use the existing TTY file, as all three
-				// stdio FDs must point to the same fs.File in
-				// order to share TTY state, specifically the
-				// foreground process group id.
-				appFile = ttyFile
-			}
-		} else {
-			// Import the file as a regular host file.
-			var err error
-			appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */)
-			if err != nil {
-				return nil, err
-			}
-			defer appFile.DecRef()
-		}
-
-		// Add the file to the FD map.
-		if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
-			return nil, err
-		}
-	}
-
-	fdTable.IncRef()
-	return fdTable, nil
-}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index c69f4c602..60e33425f 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -44,7 +44,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 		{
 			seccomp.AllowAny{},
 			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.AllowValue(syscall.O_CLOEXEC),
 		},
 	},
 	syscall.SYS_EPOLL_CREATE1: {},
@@ -229,7 +229,11 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_NANOSLEEP: {},
 	syscall.SYS_PPOLL:     {},
 	syscall.SYS_PREAD64:   {},
+	syscall.SYS_PREADV:    {},
+	unix.SYS_PREADV2:      {},
 	syscall.SYS_PWRITE64:  {},
+	syscall.SYS_PWRITEV:   {},
+	unix.SYS_PWRITEV2:     {},
 	syscall.SYS_READ:      {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
@@ -282,12 +286,29 @@ var allowedSyscalls = seccomp.SyscallRules{
 		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
+	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
+	syscall.SYS_TEE: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(1),                      /* len */
+			seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+		},
+	},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
 			seccomp.AllowValue(uint64(os.Getpid())),
 		},
 	},
+	syscall.SYS_UTIMENSAT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* null pathname */
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* flags */
+		},
+	},
 	syscall.SYS_WRITE: {},
 	// The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
 	// values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
index 5e5a3c998..209e646a7 100644
--- a/runsc/boot/filter/extra_filters_msan.go
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -26,6 +26,8 @@ import (
 func instrumentationFilters() seccomp.SyscallRules {
 	Report("MSAN is enabled: syscall filters less restrictive!")
 	return seccomp.SyscallRules{
+		syscall.SYS_CLONE:             {},
+		syscall.SYS_MMAP:              {},
 		syscall.SYS_SCHED_GETAFFINITY: {},
 		syscall.SYS_SET_ROBUST_LIST:   {},
 	}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 0f62842ea..b98a1eb50 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -37,6 +37,13 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	procvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	sysvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -44,27 +51,19 @@ import (
 )
 
 const (
-	// Filesystem name for 9p gofer mounts.
-	rootFsName = "9p"
-
 	// Device name for root mount.
 	rootDevice = "9pfs-/"
 
 	// MountPrefix is the annotation prefix for mount hints.
 	MountPrefix = "dev.gvisor.spec.mount."
 
-	// Filesystems that runsc supports.
-	bind     = "bind"
-	devpts   = "devpts"
-	devtmpfs = "devtmpfs"
-	proc     = "proc"
-	sysfs    = "sysfs"
-	tmpfs    = "tmpfs"
-	nonefs   = "none"
+	// Supported filesystems that map to different internal filesystem.
+	bind   = "bind"
+	nonefs = "none"
 )
 
 // tmpfs has some extra supported options that we must pass through.
-var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
+var tmpfsAllowedData = []string{"mode", "uid", "gid"}
 
 func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
@@ -108,12 +107,12 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 
 	// Always mount /dev.
 	mounts = append(mounts, specs.Mount{
-		Type:        devtmpfs,
+		Type:        devtmpfs.Name,
 		Destination: "/dev",
 	})
 
 	mounts = append(mounts, specs.Mount{
-		Type:        devpts,
+		Type:        devpts.Name,
 		Destination: "/dev/pts",
 	})
 
@@ -137,13 +136,13 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 	var mandatoryMounts []specs.Mount
 	if !procMounted {
 		mandatoryMounts = append(mandatoryMounts, specs.Mount{
-			Type:        proc,
+			Type:        procvfs2.Name,
 			Destination: "/proc",
 		})
 	}
 	if !sysMounted {
 		mandatoryMounts = append(mandatoryMounts, specs.Mount{
-			Type:        sysfs,
+			Type:        sysvfs2.Name,
 			Destination: "/sys",
 		})
 	}
@@ -155,13 +154,17 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 	return mounts
 }
 
-// p9MountOptions creates a slice of options for a p9 mount.
-func p9MountOptions(fd int, fa FileAccessType) []string {
+// p9MountData creates a slice of p9 mount data.
+func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
 	opts := []string{
 		"trans=fd",
 		"rfdno=" + strconv.Itoa(fd),
 		"wfdno=" + strconv.Itoa(fd),
-		"privateunixsocket=true",
+	}
+	if !vfs2 {
+		// privateunixsocket is always enabled in VFS2. VFS1 requires explicit
+		// enablement.
+		opts = append(opts, "privateunixsocket=true")
 	}
 	if fa == FileAccessShared {
 		opts = append(opts, "cache=remote_revalidating")
@@ -231,8 +234,8 @@ func isSupportedMountFlag(fstype, opt string) bool {
 	case "rw", "ro", "noatime", "noexec":
 		return true
 	}
-	if fstype == tmpfs {
-		ok, err := parseMountOption(opt, tmpfsAllowedOptions...)
+	if fstype == tmpfsvfs2.Name {
+		ok, err := parseMountOption(opt, tmpfsAllowedData...)
 		return ok && err == nil
 	}
 	return false
@@ -278,6 +281,9 @@ func subtargets(root string, mnts []specs.Mount) []string {
 }
 
 func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if conf.VFS2 {
+		return setupContainerVFS2(ctx, conf, mntr, procArgs)
+	}
 	mns, err := mntr.setupFS(conf, procArgs)
 	if err != nil {
 		return err
@@ -286,17 +292,10 @@ func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter,
 	// Set namespace here so that it can be found in ctx.
 	procArgs.MountNamespace = mns
 
-	return setExecutablePath(ctx, procArgs)
-}
-
-// setExecutablePath sets the procArgs.Filename by searching the PATH for an
-// executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
-	paths := fs.GetPath(procArgs.Envv)
-	exe := procArgs.Argv[0]
-	f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
+	// Resolve the executable path from working dir and environment.
+	f, err := user.ResolveExecutablePath(ctx, procArgs.Credentials, procArgs.MountNamespace, procArgs.Envv, procArgs.WorkingDirectory, procArgs.Argv[0])
 	if err != nil {
-		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+		return fmt.Errorf("searching for executable %q, cwd: %q, envv: %q: %v", procArgs.Argv[0], procArgs.WorkingDirectory, procArgs.Envv, err)
 	}
 	procArgs.Filename = f
 	return nil
@@ -438,7 +437,7 @@ func (m *mountHint) setOptions(val string) error {
 }
 
 func (m *mountHint) isSupported() bool {
-	return m.mount.Type == tmpfs && m.share == pod
+	return m.mount.Type == tmpfsvfs2.Name && m.share == pod
 }
 
 // checkCompatible verifies that shared mount is compatible with master.
@@ -573,11 +572,14 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
 func (c *containerMounter) processHints(conf *Config) error {
+	if conf.VFS2 {
+		return nil
+	}
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
 		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
 		// common gofer to mount all shared volumes.
-		if hint.mount.Type != tmpfs {
+		if hint.mount.Type != tmpfsvfs2.Name {
 			continue
 		}
 		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
@@ -714,7 +716,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	fd := c.fds.remove()
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
 	p9FS := mustFindFilesystem("9p")
-	opts := p9MountOptions(fd, conf.FileAccess)
+	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
 
 	if conf.OverlayfsStaleRead {
 		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
@@ -760,30 +762,27 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	)
 
 	switch m.Type {
-	case devpts, devtmpfs, proc, sysfs:
+	case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
 		fsName = m.Type
 	case nonefs:
-		fsName = sysfs
-	case tmpfs:
+		fsName = sysvfs2.Name
+	case tmpfsvfs2.Name:
 		fsName = m.Type
 
 		var err error
-		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
 			return "", nil, false, err
 		}
 
 	case bind:
 		fd := c.fds.remove()
-		fsName = "9p"
-		opts = p9MountOptions(fd, c.getMountAccessType(m))
+		fsName = gofervfs2.Name
+		opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2)
 		// If configured, add overlay to all writable mounts.
 		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
 	default:
-		// TODO(nlacasse): Support all the mount types and make this a fatal error.
-		// Most applications will "just work" without them, so this is a warning
-		// for now.
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
 	}
 	return fsName, opts, useOverlay, nil
@@ -824,7 +823,20 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 
 	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
 	if err != nil {
-		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		// Check to see if this is a common error due to a Linux bug.
+		// This error is generated here in order to cause it to be
+		// printed to the user using Docker via 'runsc create' etc. rather
+		// than simply printed to the logs for the 'runsc boot' command.
+		//
+		// We check the error message string rather than type because the
+		// actual error types (syscall.EIO, syscall.EPIPE) are lost by file system
+		// implementation (e.g. p9).
+		// TODO(gvisor.dev/issue/1765): Remove message when bug is resolved.
+		if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) {
+			return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
+		}
+		return err
 	}
 
 	// If there are submounts, we need to overlay the mount on top of a ramfs
@@ -919,7 +931,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 
 	// Add root mount.
 	fd := c.fds.remove()
-	opts := p9MountOptions(fd, conf.FileAccess)
+	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
 
 	mf := fs.MountSourceFlags{}
 	if c.root.Readonly || conf.Overlay {
@@ -931,7 +943,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 		Flags:      mf,
 		DataString: strings.Join(opts, ","),
 	}
-	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+	renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount)
 
 	// Add submounts.
 	var tmpMounted bool
@@ -947,7 +959,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
 	if !tmpMounted {
 		tmpMount := specs.Mount{
-			Type:        tmpfs,
+			Type:        tmpfsvfs2.Name,
 			Destination: "/tmp",
 		}
 		if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
@@ -1003,11 +1015,11 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
 		// No '/tmp' found (or fallthrough from above). Safe to mount internal
 		// tmpfs.
 		tmpMount := specs.Mount{
-			Type:        tmpfs,
+			Type:        tmpfsvfs2.Name,
 			Destination: "/tmp",
 			// Sticky bit is added to prevent accidental deletion of files from
 			// another user. This is normally done for /tmp.
-			Options: []string{"mode=1777"},
+			Options: []string{"mode=01777"},
 		}
 		return c.mountSubmount(ctx, conf, mns, root, tmpMount)
 
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 9f0d5d7af..002479612 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -26,16 +26,19 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/fdimport"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -46,9 +49,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -60,6 +65,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
@@ -137,6 +143,9 @@ type execProcess struct {
 	// tty will be nil if the process is not attached to a terminal.
 	tty *host.TTYFileOperations
 
+	// tty will be nil if the process is not attached to a terminal.
+	ttyVFS2 *hostvfs2.TTYFileDescription
+
 	// pidnsPath is the pid namespace path in spec
 	pidnsPath string
 }
@@ -154,13 +163,17 @@ type Args struct {
 	Spec *specs.Spec
 	// Conf is the system configuration.
 	Conf *Config
-	// ControllerFD is the FD to the URPC controller.
+	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
+	// of this FD and may close it at any time.
 	ControllerFD int
-	// Device is an optional argument that is passed to the platform.
+	// Device is an optional argument that is passed to the platform. The Loader
+	// takes ownership of this file and may close it at any time.
 	Device *os.File
-	// GoferFDs is an array of FDs used to connect with the Gofer.
+	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
+	// takes ownership of these FDs and may close them at any time.
 	GoferFDs []int
-	// StdioFDs is the stdio for the application.
+	// StdioFDs is the stdio for the application. The Loader takes ownership of
+	// these FDs and may close them at any time.
 	StdioFDs []int
 	// Console is set to true if using TTY.
 	Console bool
@@ -173,6 +186,9 @@ type Args struct {
 	UserLogFD int
 }
 
+// make sure stdioFDs are always the same on initial start and on restore
+const startingStdioFD = 64
+
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
 func New(args Args) (*Loader, error) {
@@ -186,11 +202,10 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("setting up memory usage: %v", err)
 	}
 
+	// Is this a VFSv2 kernel?
 	if args.Conf.VFS2 {
-		st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host)
-		if ok {
-			vfs2.Override(st.Table)
-		}
+		kernel.VFS2Enabled = true
+		vfs2.Override()
 	}
 
 	// Create kernel and platform.
@@ -230,11 +245,8 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("enabling strace: %v", err)
 	}
 
-	// Create an empty network stack because the network namespace may be empty at
-	// this point. Netns is configured before Run() is called. Netstack is
-	// configured using a control uRPC message. Host network is configured inside
-	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
+	// Create root network namespace/stack.
+	netns, err := newRootNetworkNamespace(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -277,7 +289,7 @@ func New(args Args) (*Loader, error) {
 		FeatureSet:                  cpuid.HostFeatureSet(),
 		Timekeeper:                  tk,
 		RootUserNamespace:           creds.UserNamespace,
-		NetworkStack:                networkStack,
+		RootNetworkNamespace:        netns,
 		ApplicationCores:            uint(args.NumCPU),
 		Vdso:                        vdso,
 		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
@@ -320,6 +332,38 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("creating pod mount hints: %v", err)
 	}
 
+	if kernel.VFS2Enabled {
+		// Set up host mount that will be used for imported fds.
+		hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
+		if err != nil {
+			return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
+		}
+		defer hostFilesystem.DecRef()
+		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
+		}
+		k.SetHostMount(hostMount)
+	}
+
+	// Make host FDs stable between invocations. Host FDs must map to the exact
+	// same number when the sandbox is restored. Otherwise the wrong FD will be
+	// used.
+	var stdioFDs []int
+	newfd := startingStdioFD
+	for _, fd := range args.StdioFDs {
+		err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
+		if err != nil {
+			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+		}
+		stdioFDs = append(stdioFDs, newfd)
+		err = syscall.Close(fd)
+		if err != nil {
+			return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+		}
+		newfd++
+	}
+
 	eid := execID{cid: args.ID}
 	l := &Loader{
 		k:            k,
@@ -328,7 +372,7 @@ func New(args Args) (*Loader, error) {
 		watchdog:     dog,
 		spec:         args.Spec,
 		goferFDs:     args.GoferFDs,
-		stdioFDs:     args.StdioFDs,
+		stdioFDs:     stdioFDs,
 		rootProcArgs: procArgs,
 		sandboxID:    args.ID,
 		processes:    map[execID]*execProcess{eid: {}},
@@ -368,11 +412,16 @@ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.
 		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
 	}
 
+	wd := spec.Process.Cwd
+	if wd == "" {
+		wd = "/"
+	}
+
 	// Create the process arguments.
 	procArgs := kernel.CreateProcessArgs{
 		Argv:                    spec.Process.Args,
 		Envv:                    spec.Process.Env,
-		WorkingDirectory:        spec.Process.Cwd, // Defaults to '/' if empty.
+		WorkingDirectory:        wd,
 		Credentials:             creds,
 		Umask:                   0022,
 		Limits:                  ls,
@@ -466,7 +515,7 @@ func (l *Loader) run() error {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
-		stack := l.k.NetworkStack().(*hostinet.Stack)
+		stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
 		if err := stack.Configure(); err != nil {
 			return err
 		}
@@ -483,9 +532,11 @@ func (l *Loader) run() error {
 
 	// If we are restoring, we do not want to create a process.
 	// l.restore is set by the container manager when a restore call is made.
+	var ttyFile *host.TTYFileOperations
+	var ttyFileVFS2 *hostvfs2.TTYFileDescription
 	if !l.restore {
 		if l.conf.ProfileEnable {
-			initializePProf()
+			pprof.Initialize()
 		}
 
 		// Finally done with all configuration. Setup filters before user code
@@ -497,13 +548,14 @@ func (l *Loader) run() error {
 		// Create the FD map, which will set stdin, stdout, and stderr.  If console
 		// is true, then ioctl calls will be passed through to the host fd.
 		ctx := l.rootProcArgs.NewContext(l.k)
-		fdTable, err := createFDTable(ctx, l.console, l.stdioFDs)
+		var err error
+
+		// CreateProcess takes a reference on FDMap if successful. We won't need
+		// ours either way.
+		l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
 		if err != nil {
 			return fmt.Errorf("importing fds: %v", err)
 		}
-		// CreateProcess takes a reference on FDMap if successful. We won't need
-		// ours either way.
-		l.rootProcArgs.FDTable = fdTable
 
 		// Setup the root container file system.
 		l.startGoferMonitor(l.sandboxID, l.goferFDs)
@@ -517,7 +569,15 @@ func (l *Loader) run() error {
 		}
 
 		// Add the HOME enviroment variable if it is not already set.
-		envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		var envv []string
+		if kernel.VFS2Enabled {
+			envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+
+		} else {
+			envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		}
 		if err != nil {
 			return err
 		}
@@ -538,14 +598,16 @@ func (l *Loader) run() error {
 		ep.pidnsPath = ns.Path
 	}
 	if l.console {
-		ttyFile, _ := l.rootProcArgs.FDTable.Get(0)
-		defer ttyFile.DecRef()
-		ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations)
-
-		// Set the foreground process group on the TTY to the global
-		// init process group, since that is what we are about to
-		// start running.
-		ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+		// Set the foreground process group on the TTY to the global init process
+		// group, since that is what we are about to start running.
+		switch {
+		case ttyFileVFS2 != nil:
+			ep.ttyVFS2 = ttyFileVFS2
+			ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+		case ttyFile != nil:
+			ep.tty = ttyFile
+			ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+		}
 	}
 
 	// Handle signals by forwarding them to the root container process
@@ -570,6 +632,19 @@ func (l *Loader) run() error {
 		}
 	})
 
+	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
+	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
+	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
+	// passed FDs, so only close for VFS1.
+	if !kernel.VFS2Enabled {
+		for _, fd := range l.stdioFDs {
+			err := syscall.Close(fd)
+			if err != nil {
+				return fmt.Errorf("close dup()ed stdioFDs: %v", err)
+			}
+		}
+	}
+
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
 	return l.k.Start()
@@ -653,7 +728,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 
 	// Create the FD map, which will set stdin, stdout, and stderr.
 	ctx := procArgs.NewContext(l.k)
-	fdTable, err := createFDTable(ctx, false, stdioFDs)
+	fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
 	if err != nil {
 		return fmt.Errorf("importing fds: %v", err)
 	}
@@ -738,14 +813,14 @@ func (l *Loader) destroyContainer(cid string) error {
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	_, _, started, err := l.threadGroupFromIDLocked(execID{cid: cid})
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
 	if err != nil {
 		// Container doesn't exist.
 		return err
 	}
 
-	// The container exists, has it been started?
-	if started {
+	// The container exists, but has it been started?
+	if tg != nil {
 		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
 			return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
 		}
@@ -787,45 +862,65 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	tg, _, started, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
 	if err != nil {
 		return 0, err
 	}
-	if !started {
+	if tg == nil {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
 	// Get the container MountNamespace from the Task.
-	tg.Leader().WithMuLocked(func(t *kernel.Task) {
-		// task.MountNamespace() does not take a ref, so we must do so
-		// ourselves.
-		args.MountNamespace = t.MountNamespace()
-		args.MountNamespace.IncRef()
-	})
-	defer args.MountNamespace.DecRef()
+	if kernel.VFS2Enabled {
+		// task.MountNamespace() does not take a ref, so we must do so ourselves.
+		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
+		args.MountNamespaceVFS2.IncRef()
+	} else {
+		tg.Leader().WithMuLocked(func(t *kernel.Task) {
+			// task.MountNamespace() does not take a ref, so we must do so ourselves.
+			args.MountNamespace = t.MountNamespace()
+			args.MountNamespace.IncRef()
+		})
+	}
 
-	// Add the HOME enviroment varible if it is not already set.
-	root := args.MountNamespace.Root()
-	defer root.DecRef()
-	ctx := fs.WithRoot(l.k.SupervisorContext(), root)
-	envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
-	if err != nil {
-		return 0, err
+	// Add the HOME environment variable if it is not already set.
+	if kernel.VFS2Enabled {
+		defer args.MountNamespaceVFS2.DecRef()
+
+		root := args.MountNamespaceVFS2.Root()
+		defer root.DecRef()
+		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
+		if err != nil {
+			return 0, err
+		}
+		args.Envv = envv
+	} else {
+		defer args.MountNamespace.DecRef()
+
+		root := args.MountNamespace.Root()
+		defer root.DecRef()
+		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+		if err != nil {
+			return 0, err
+		}
+		args.Envv = envv
 	}
-	args.Envv = envv
 
 	// Start the process.
 	proc := control.Proc{Kernel: l.k}
 	args.PIDNamespace = tg.PIDNamespace()
-	newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
+	newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args)
 	if err != nil {
 		return 0, err
 	}
 
 	eid := execID{cid: args.ContainerID, pid: tgid}
 	l.processes[eid] = &execProcess{
-		tg:  newTG,
-		tty: ttyFile,
+		tg:      newTG,
+		tty:     ttyFile,
+		ttyVFS2: ttyFileVFS2,
 	}
 	log.Debugf("updated processes: %v", l.processes)
 
@@ -836,7 +931,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
 	// Don't defer unlock, as doing so would make it impossible for
 	// multiple clients to wait on the same container.
-	tg, _, err := l.threadGroupFromID(execID{cid: cid})
+	tg, err := l.threadGroupFromID(execID{cid: cid})
 	if err != nil {
 		return fmt.Errorf("can't wait for container %q: %v", cid, err)
 	}
@@ -855,7 +950,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
 
 	// Try to find a process that was exec'd
 	eid := execID{cid: cid, pid: tgid}
-	execTG, _, err := l.threadGroupFromID(eid)
+	execTG, err := l.threadGroupFromID(eid)
 	if err == nil {
 		ws := l.wait(execTG)
 		*waitStatus = ws
@@ -869,7 +964,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
 
 	// The caller may be waiting on a process not started directly via exec.
 	// In this case, find the process in the container's PID namespace.
-	initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
 	if err != nil {
 		return fmt.Errorf("waiting for PID %d: %v", tgid, err)
 	}
@@ -905,48 +1000,92 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
 	switch conf.Network {
 	case NetworkHost:
-		return hostinet.NewStack(), nil
+		// No network namespacing support for hostinet yet, hence creator is nil.
+		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
 	case NetworkNone, NetworkSandbox:
-		// NetworkNone sets up loopback using netstack.
-		netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-		transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
-		s := netstack.Stack{stack.New(stack.Options{
-			NetworkProtocols:   netProtos,
-			TransportProtocols: transProtos,
-			Clock:              clock,
-			Stats:              netstack.Metrics,
-			HandleLocal:        true,
-			// Enable raw sockets for users with sufficient
-			// privileges.
-			RawFactory: raw.EndpointFactory{},
-			UniqueID:   uniqueID,
-		})}
-
-		// Enable SACK Recovery.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-			return nil, fmt.Errorf("failed to enable SACK: %v", err)
+		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+		if err != nil {
+			return nil, err
+		}
+		creator := &sandboxNetstackCreator{
+			clock:    clock,
+			uniqueID: uniqueID,
 		}
+		return inet.NewRootNamespace(s, creator), nil
 
-		// Set default TTLs as required by socket/netstack.
-		s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-		s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	default:
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
 
-		// Enable Receive Buffer Auto-Tuning.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
-		}
+}
+
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	s := netstack.Stack{stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+		Clock:              clock,
+		Stats:              netstack.Metrics,
+		HandleLocal:        true,
+		// Enable raw sockets for users with sufficient
+		// privileges.
+		RawFactory: raw.EndpointFactory{},
+		UniqueID:   uniqueID,
+	})}
 
-		s.FillDefaultIPTables()
+	// Enable SACK Recovery.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+		return nil, fmt.Errorf("failed to enable SACK: %v", err)
+	}
 
-		return &s, nil
+	// Set default TTLs as required by socket/netstack.
+	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
 
-	default:
-		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	}
+
+	s.FillIPTablesMetadata()
+
+	return &s, nil
+}
+
+// sandboxNetstackCreator implements kernel.NetworkStackCreator.
+//
+// +stateify savable
+type sandboxNetstackCreator struct {
+	clock    tcpip.Clock
+	uniqueID stack.UniqueID
+}
+
+// CreateStack implements kernel.NetworkStackCreator.CreateStack.
+func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
+	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	if err != nil {
+		return nil, err
+	}
+
+	// Setup loopback.
+	n := &Network{Stack: s.(*netstack.Stack).Stack}
+	nicID := tcpip.NICID(f.uniqueID.UniqueID())
+	link := DefaultLoopbackLink
+	linkEP := loopback.New()
+	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+		return nil, err
 	}
+
+	return s, nil
 }
 
 // signal sends a signal to one or more processes in a container. If PID is 0,
@@ -976,8 +1115,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e
 			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
 		}
 		// Check that the container has actually started before signaling it.
-		_, _, err := l.threadGroupFromID(execID{cid: cid})
-		if err != nil {
+		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
 			return err
 		}
 		if err := l.signalAllProcesses(cid, signo); err != nil {
@@ -991,16 +1129,16 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e
 }
 
 func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
-	execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
 	if err == nil {
 		// Send signal directly to the identified process.
-		return execTG.SendSignal(&arch.SignalInfo{Signo: signo})
+		return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo})
 	}
 
 	// The caller may be signaling a process not started directly via exec.
 	// In this case, find the process in the container's PID namespace and
 	// signal it.
-	initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
 	if err != nil {
 		return fmt.Errorf("no thread group found: %v", err)
 	}
@@ -1011,25 +1149,43 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er
 	if tg.Leader().ContainerID() != cid {
 		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
 	}
-	return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+	return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
 }
 
+// signalForegrondProcessGroup looks up foreground process group from the TTY
+// for the given "tgid" inside container "cid", and send the signal to it.
 func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
-	// Lookup foreground process group from the TTY for the given process,
-	// and send the signal to it.
-	tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+	l.mu.Lock()
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
 	if err != nil {
+		l.mu.Unlock()
 		return fmt.Errorf("no thread group found: %v", err)
 	}
-	if tty == nil {
+	if tg == nil {
+		l.mu.Unlock()
+		return fmt.Errorf("container %q not started", cid)
+	}
+
+	tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
+	l.mu.Unlock()
+	if err != nil {
+		return fmt.Errorf("no thread group found: %v", err)
+	}
+
+	var pg *kernel.ProcessGroup
+	switch {
+	case ttyVFS2 != nil:
+		pg = ttyVFS2.ForegroundProcessGroup()
+	case tty != nil:
+		pg = tty.ForegroundProcessGroup()
+	default:
 		return fmt.Errorf("no TTY attached")
 	}
-	pg := tty.ForegroundProcessGroup()
 	if pg == nil {
 		// No foreground process group has been set. Signal the
 		// original thread group.
 		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
-		return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+		return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
 	}
 	// Send the signal to all processes in the process group.
 	var lastErr error
@@ -1037,7 +1193,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s
 		if tg.ProcessGroup() != pg {
 			continue
 		}
-		if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil {
+		if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil {
 			lastErr = err
 		}
 	}
@@ -1055,33 +1211,57 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
 	return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
 }
 
-// threadGroupFromID same as threadGroupFromIDLocked except that it acquires
-// mutex before calling it.
-func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
+// acquires mutex before calling it and fails in case container hasn't started
+// yet.
+func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
 	l.mu.Lock()
 	defer l.mu.Unlock()
-	tg, tty, ok, err := l.threadGroupFromIDLocked(key)
+	tg, err := l.tryThreadGroupFromIDLocked(key)
 	if err != nil {
-		return nil, nil, err
+		return nil, err
 	}
-	if !ok {
-		return nil, nil, fmt.Errorf("container %q not started", key.cid)
+	if tg == nil {
+		return nil, fmt.Errorf("container %q not started", key.cid)
 	}
-	return tg, tty, nil
+	return tg, nil
 }
 
-// threadGroupFromIDLocked returns the thread group and TTY for the given
-// execution ID. TTY may be nil if the process is not attached to a terminal.
-// Also returns a boolean indicating whether the container has already started.
-// Returns error if execution ID is invalid or if the container cannot be
-// found (maybe it has been deleted). Caller must hold 'mu'.
-func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, bool, error) {
+// tryThreadGroupFromIDLocked returns the thread group for the given execution
+// ID. It may return nil in case the container has not started yet. Returns
+// error if execution ID is invalid or if the container cannot be found (maybe
+// it has been deleted). Caller must hold 'mu'.
+func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
 	ep := l.processes[key]
 	if ep == nil {
-		return nil, nil, false, fmt.Errorf("container %q not found", key.cid)
+		return nil, fmt.Errorf("container %q not found", key.cid)
 	}
-	if ep.tg == nil {
-		return nil, nil, false, nil
+	return ep.tg, nil
+}
+
+// ttyFromIDLocked returns the TTY files for the given execution ID. It may
+// return nil in case the container has not started yet. Returns error if
+// execution ID is invalid or if the container cannot be found (maybe it has
+// been deleted). Caller must hold 'mu'.
+func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	ep := l.processes[key]
+	if ep == nil {
+		return nil, nil, fmt.Errorf("container %q not found", key.cid)
+	}
+	return ep.tty, ep.ttyVFS2, nil
+}
+
+func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	if len(stdioFDs) != 3 {
+		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
+	}
+
+	k := kernel.KernelFromContext(ctx)
+	fdTable := k.NewFDTable()
+	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
+	if err != nil {
+		fdTable.DecRef()
+		return nil, nil, nil, err
 	}
-	return ep.tg, ep.tty, true, nil
+	return fdTable, ttyFile, ttyFileVFS2, nil
 }
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 44aa63196..e448fd773 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -24,11 +24,14 @@ import (
 	"time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/fsgofer"
@@ -100,20 +103,29 @@ func startGofer(root string) (int, func(), error) {
 	return sandboxEnd, cleanup, nil
 }
 
-func createLoader() (*Loader, func(), error) {
+func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) {
 	fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10]))
 	if err != nil {
 		return nil, nil, err
 	}
 	conf := testConfig()
-	spec := testSpec()
+	conf.VFS2 = vfsEnabled
 
 	sandEnd, cleanup, err := startGofer(spec.Root.Path)
 	if err != nil {
 		return nil, nil, err
 	}
 
-	stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())}
+	// Loader takes ownership of stdio.
+	var stdio []int
+	for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
+		newFd, err := unix.Dup(int(f.Fd()))
+		if err != nil {
+			return nil, nil, err
+		}
+		stdio = append(stdio, newFd)
+	}
+
 	args := Args{
 		ID:           "foo",
 		Spec:         spec,
@@ -132,10 +144,20 @@ func createLoader() (*Loader, func(), error) {
 
 // TestRun runs a simple application in a sandbox and checks that it succeeds.
 func TestRun(t *testing.T) {
-	l, cleanup, err := createLoader()
+	doRun(t, false)
+}
+
+// TestRunVFS2 runs TestRun in VFSv2.
+func TestRunVFS2(t *testing.T) {
+	doRun(t, true)
+}
+
+func doRun(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
+
 	defer l.Destroy()
 	defer cleanup()
 
@@ -169,7 +191,16 @@ func TestRun(t *testing.T) {
 // TestStartSignal tests that the controller Start message will cause
 // WaitForStartSignal to return.
 func TestStartSignal(t *testing.T) {
-	l, cleanup, err := createLoader()
+	doStartSignal(t, false)
+}
+
+// TestStartSignalVFS2 does TestStartSignal with VFS2.
+func TestStartSignalVFS2(t *testing.T) {
+	doStartSignal(t, true)
+}
+
+func doStartSignal(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
@@ -217,18 +248,19 @@ func TestStartSignal(t *testing.T) {
 
 }
 
-// Test that MountNamespace can be created with various specs.
-func TestCreateMountNamespace(t *testing.T) {
-	testCases := []struct {
-		name string
-		// Spec that will be used to create the mount manager.  Note
-		// that we can't mount procfs without a kernel, so each spec
-		// MUST contain something other than procfs mounted at /proc.
-		spec specs.Spec
-		// Paths that are expected to exist in the resulting fs.
-		expectedPaths []string
-	}{
-		{
+type CreateMountTestcase struct {
+	name string
+	// Spec that will be used to create the mount manager.  Note
+	// that we can't mount procfs without a kernel, so each spec
+	// MUST contain something other than procfs mounted at /proc.
+	spec specs.Spec
+	// Paths that are expected to exist in the resulting fs.
+	expectedPaths []string
+}
+
+func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+	testCases := []*CreateMountTestcase{
+		&CreateMountTestcase{
 			// Only proc.
 			name: "only proc mount",
 			spec: specs.Spec{
@@ -270,7 +302,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			// /dev, and /sys.
 			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		{
+		&CreateMountTestcase{
 			// Mounts are nested inside each other.
 			name: "nested mounts",
 			spec: specs.Spec{
@@ -314,7 +346,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
 				"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		{
+		&CreateMountTestcase{
 			name: "mount inside /dev",
 			spec: specs.Spec{
 				Root: &specs.Root{
@@ -357,40 +389,46 @@ func TestCreateMountNamespace(t *testing.T) {
 			},
 			expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
 		},
-		{
-			name: "mounts inside mandatory mounts",
-			spec: specs.Spec{
-				Root: &specs.Root{
-					Path:     os.TempDir(),
-					Readonly: true,
+	}
+
+	vfsCase := &CreateMountTestcase{
+		name: "mounts inside mandatory mounts",
+		spec: specs.Spec{
+			Root: &specs.Root{
+				Path:     os.TempDir(),
+				Readonly: true,
+			},
+			Mounts: []specs.Mount{
+				{
+					Destination: "/proc",
+					Type:        "tmpfs",
 				},
-				Mounts: []specs.Mount{
-					{
-						Destination: "/proc",
-						Type:        "tmpfs",
-					},
-					// We don't include /sys, and /tmp in
-					// the spec, since they will be added
-					// automatically.
-					//
-					// Instead, add submounts inside these
-					// directories and make sure they are
-					// visible under the mandatory mounts.
-					{
-						Destination: "/sys/bar",
-						Type:        "tmpfs",
-					},
-					{
-						Destination: "/tmp/baz",
-						Type:        "tmpfs",
-					},
+				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
+				//  MkDirAt in VFS2 (and remove the reduntant append).
+				// {
+				//		Destination: "/sys/bar",
+				//		Type:        "tmpfs",
+				//	},
+				//
+				{
+					Destination: "/tmp/baz",
+					Type:        "tmpfs",
 				},
 			},
-			expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
 		},
+		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
 	}
 
-	for _, tc := range testCases {
+	if !vfs2 {
+		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
+		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
+	}
+	return append(testCases, vfsCase)
+}
+
+// Test that MountNamespace can be created with various specs.
+func TestCreateMountNamespace(t *testing.T) {
+	for _, tc := range createMountTestcases(false /* vfs2 */) {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -425,6 +463,52 @@ func TestCreateMountNamespace(t *testing.T) {
 	}
 }
 
+// Test that MountNamespace can be created with various specs.
+func TestCreateMountNamespaceVFS2(t *testing.T) {
+	for _, tc := range createMountTestcases(true /* vfs2 */) {
+		t.Run(tc.name, func(t *testing.T) {
+			spec := testSpec()
+			spec.Mounts = tc.spec.Mounts
+			spec.Root = tc.spec.Root
+
+			t.Logf("Using root: %q", spec.Root.Path)
+			l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec)
+			if err != nil {
+				t.Fatalf("failed to create loader: %v", err)
+			}
+			defer l.Destroy()
+			defer loaderCleanup()
+
+			mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+			if err := mntr.processHints(l.conf); err != nil {
+				t.Fatalf("failed process hints: %v", err)
+			}
+
+			ctx := l.k.SupervisorContext()
+			mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+			if err != nil {
+				t.Fatalf("failed to setupVFS2: %v", err)
+			}
+
+			root := mns.Root()
+			defer root.DecRef()
+			for _, p := range tc.expectedPaths {
+				target := &vfs.PathOperation{
+					Root:  root,
+					Start: root,
+					Path:  fspath.Parse(p),
+				}
+
+				if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+				} else {
+					d.DecRef()
+				}
+			}
+		})
+	}
+}
+
 // TestRestoreEnvironment tests that the correct mounts are collected from the spec and config
 // in order to build the environment for restoring.
 func TestRestoreEnvironment(t *testing.T) {
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 6a8765ec8..0af30456e 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -17,12 +17,15 @@ package boot
 import (
 	"fmt"
 	"net"
+	"runtime"
+	"strings"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -31,6 +34,32 @@ import (
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
+var (
+	// DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and
+	// "::1/8" on "lo" interface.
+	DefaultLoopbackLink = LoopbackLink{
+		Name: "lo",
+		Addresses: []net.IP{
+			net.IP("\x7f\x00\x00\x01"),
+			net.IPv6loopback,
+		},
+		Routes: []Route{
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv4(0x7f, 0, 0, 0),
+					Mask: net.IPv4Mask(0xff, 0, 0, 0),
+				},
+			},
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv6loopback,
+					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+				},
+			},
+		},
+	}
+)
+
 // Network exposes methods that can be used to configure a network stack.
 type Network struct {
 	Stack *stack.Stack
@@ -48,6 +77,44 @@ type DefaultRoute struct {
 	Name  string
 }
 
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a give FDBasedLink.
+type QueueingDiscipline int
+
+const (
+	// QDiscNone disables any queueing for the underlying FD.
+	QDiscNone QueueingDiscipline = iota
+
+	// QDiscFIFO applies a simple fifo based queue to the underlying
+	// FD.
+	QDiscFIFO
+)
+
+// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
+// else returns an error.
+func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
+	switch s {
+	case "none":
+		return QDiscNone, nil
+	case "fifo":
+		return QDiscFIFO, nil
+	default:
+		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
+	}
+}
+
+// String implements fmt.Stringer.
+func (q QueueingDiscipline) String() string {
+	switch q {
+	case QDiscNone:
+		return "none"
+	case QDiscFIFO:
+		return "fifo"
+	default:
+		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
+	}
+}
+
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
 	Name               string
@@ -57,6 +124,7 @@ type FDBasedLink struct {
 	GSOMaxSize         uint32
 	SoftwareGSOEnabled bool
 	LinkAddress        net.HardwareAddr
+	QDisc              QueueingDiscipline
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -158,6 +226,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		mac := tcpip.LinkAddress(link.LinkAddress)
+		log.Infof("gso max size is: %d", link.GSOMaxSize)
+
 		linkEP, err := fdbased.New(&fdbased.Options{
 			FDs:                FDs,
 			MTU:                uint32(link.MTU),
@@ -172,6 +242,13 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 			return err
 		}
 
+		switch link.QDisc {
+		case QDiscNone:
+		case QDiscFIFO:
+			log.Infof("Enabling FIFO QDisc on %q", link.Name)
+			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
+		}
+
 		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
 		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
 			return err
diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD
new file mode 100644
index 000000000..29cb42b2f
--- /dev/null
+++ b/runsc/boot/pprof/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pprof",
+    srcs = ["pprof.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+)
diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof/pprof.go
index 463362f02..1ded20dee 100644
--- a/runsc/boot/pprof.go
+++ b/runsc/boot/pprof/pprof.go
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package boot
+// Package pprof provides a stub to initialize custom profilers.
+package pprof
 
-func initializePProf() {
+// Initialize will be called at boot for initializing custom profilers.
+func Initialize() {
 }
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
new file mode 100644
index 000000000..7ed6801b4
--- /dev/null
+++ b/runsc/boot/vfs.go
@@ -0,0 +1,375 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"path"
+	"sort"
+	"strings"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+		// TODO(b/29356795): Users may mount this once the terminals are in a
+		//  usable state.
+		AllowUserMount: false,
+	})
+	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+	})
+	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	// Setup files in devtmpfs.
+	if err := memdev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering memdev: %w", err)
+	}
+	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
+	if err != nil {
+		return fmt.Errorf("creating devtmpfs accessor: %w", err)
+	}
+	defer a.Release()
+
+	if err := a.UserspaceInit(ctx); err != nil {
+		return fmt.Errorf("initializing userspace: %w", err)
+	}
+	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating devtmpfs files: %w", err)
+	}
+	return nil
+}
+
+func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if err := mntr.k.VFS().Init(); err != nil {
+		return fmt.Errorf("failed to initialize VFS: %w", err)
+	}
+	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+	if err != nil {
+		return fmt.Errorf("failed to setupFS: %w", err)
+	}
+	procArgs.MountNamespaceVFS2 = mns
+
+	// Resolve the executable path from working dir and environment.
+	f, err := user.ResolveExecutablePathVFS2(ctx, procArgs.Credentials, procArgs.MountNamespaceVFS2, procArgs.Envv, procArgs.WorkingDirectory, procArgs.Argv[0])
+	if err != nil {
+		return fmt.Errorf("searching for executable %q, cwd: %q, envv: %q: %v", procArgs.Argv[0], procArgs.WorkingDirectory, procArgs.Envv, err)
+	}
+	procArgs.Filename = f
+	return nil
+}
+
+func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+	log.Infof("Configuring container's file system with VFS2")
+
+	// Create context with root credentials to mount the filesystem (the current
+	// user may not be privileged enough).
+	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = rootCreds
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	rootCtx := procArgs.NewContext(c.k)
+
+	if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
+		return nil, fmt.Errorf("register filesystems: %w", err)
+	}
+
+	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount namespace: %w", err)
+	}
+	rootProcArgs.MountNamespaceVFS2 = mns
+
+	// Mount submounts.
+	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
+		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+	fd := c.fds.remove()
+	opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
+
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+	if err != nil {
+		return nil, fmt.Errorf("setting up mount namespace: %w", err)
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+	mounts, err := c.prepareMountsVFS2()
+	if err != nil {
+		return err
+	}
+
+	for i := range mounts {
+		submount := &mounts[i]
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+			return err
+		}
+	}
+
+	if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
+		return fmt.Errorf(`mount submount "\tmp": %w`, err)
+	}
+	return nil
+}
+
+type mountAndFD struct {
+	specs.Mount
+	fd int
+}
+
+func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
+	// Associate bind mounts with their FDs before sorting since there is an
+	// undocumented assumption that FDs are dispensed in the order in which
+	// they are required by mounts.
+	var mounts []mountAndFD
+	for _, m := range c.mounts {
+		fd := -1
+		// Only bind mounts use host FDs; see
+		// containerMounter.getMountNameAndOptionsVFS2.
+		if m.Type == bind {
+			fd = c.fds.remove()
+		}
+		mounts = append(mounts, mountAndFD{
+			Mount: m,
+			fd:    fd,
+		})
+	}
+	if err := c.checkDispenser(); err != nil {
+		return nil, err
+	}
+
+	// Sort the mounts so that we don't place children before parents.
+	sort.Slice(mounts, func(i, j int) bool {
+		return len(mounts[i].Destination) < len(mounts[j].Destination)
+	})
+
+	return mounts, nil
+}
+
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
+	root := mns.Root()
+	defer root.DecRef()
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(submount.Destination),
+	}
+	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+	if err != nil {
+		return fmt.Errorf("mountOptions failed: %w", err)
+	}
+	if len(fsName) == 0 {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
+		return err
+	}
+	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
+		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+	}
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
+	return nil
+}
+
+// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+	var (
+		fsName string
+		data   []string
+	)
+
+	// Find filesystem name and FS specific data field.
+	switch m.Type {
+	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
+		fsName = m.Type
+	case nonefs:
+		fsName = sys.Name
+	case tmpfs.Name:
+		fsName = m.Type
+
+		var err error
+		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
+		if err != nil {
+			return "", nil, err
+		}
+
+	case bind:
+		fsName = gofer.Name
+		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+
+	default:
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+	}
+
+	opts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+		},
+		InternalMount: true,
+	}
+
+	for _, o := range m.Options {
+		switch o {
+		case "rw":
+			opts.ReadOnly = false
+		case "ro":
+			opts.ReadOnly = true
+		case "noatime":
+			opts.Flags.NoATime = true
+		case "noexec":
+			opts.Flags.NoExec = true
+		default:
+			log.Warningf("ignoring unknown mount option %q", o)
+		}
+	}
+
+	if conf.Overlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		opts.ReadOnly = true
+	}
+	return fsName, opts, nil
+}
+
+func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(currentPath),
+	}
+	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
+	if err == nil {
+		// Mount point exists, nothing else to do.
+		return nil
+	}
+	if err != syserror.ENOENT {
+		return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
+	}
+
+	// Recurse to ensure parent is created and then create the mount point.
+	if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
+		return err
+	}
+	log.Debugf("Creating dir %q for mount point", currentPath)
+	mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+	if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
+	}
+	return nil
+}
+
+// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+//   1. /tmp is mounted explicitly: we should not override user's wish
+//   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+	for _, m := range c.mounts {
+		// m.Destination has been cleaned, so it's to use equality here.
+		if m.Destination == "/tmp" {
+			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
+			return nil
+		}
+	}
+
+	root := mns.Root()
+	defer root.DecRef()
+	pop := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse("/tmp"),
+	}
+	// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
+	statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{})
+	switch err {
+	case nil:
+		// Found '/tmp' in filesystem, check if it's empty.
+		if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory {
+			// Not a dir?! Leave it be.
+			return nil
+		}
+		if statx.Nlink > 2 {
+			// If more than "." and ".." is found, skip internal tmpfs to prevent
+			// hiding existing files.
+			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
+			return nil
+		}
+		log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+		fallthrough
+
+	case syserror.ENOENT:
+		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
+		// tmpfs.
+		tmpMount := specs.Mount{
+			Type:        tmpfs.Name,
+			Destination: "/tmp",
+			// Sticky bit is added to prevent accidental deletion of files from
+			// another user. This is normally done for /tmp.
+			Options: []string{"mode=01777"},
+		}
+		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+
+	default:
+		return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
+	}
+}
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index d4c7bdfbb..c087e1a3c 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -7,8 +7,8 @@ go_library(
     srcs = ["cgroup.go"],
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/cleanup",
         "//pkg/log",
-        "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
     ],
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index 653ca5f52..ef01820ef 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -19,6 +19,7 @@ package cgroup
 import (
 	"bufio"
 	"context"
+	"errors"
 	"fmt"
 	"io/ioutil"
 	"os"
@@ -30,29 +31,31 @@ import (
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 const (
 	cgroupRoot = "/sys/fs/cgroup"
 )
 
-var controllers = map[string]controller{
-	"blkio":    &blockIO{},
-	"cpu":      &cpu{},
-	"cpuset":   &cpuSet{},
-	"memory":   &memory{},
-	"net_cls":  &networkClass{},
-	"net_prio": &networkPrio{},
+var controllers = map[string]config{
+	"blkio":    config{ctrlr: &blockIO{}},
+	"cpu":      config{ctrlr: &cpu{}},
+	"cpuset":   config{ctrlr: &cpuSet{}},
+	"memory":   config{ctrlr: &memory{}},
+	"net_cls":  config{ctrlr: &networkClass{}},
+	"net_prio": config{ctrlr: &networkPrio{}},
+	"pids":     config{ctrlr: &pids{}},
 
 	// These controllers either don't have anything in the OCI spec or is
-	// irrevalant for a sandbox, e.g. pids.
-	"devices":    &noop{},
-	"freezer":    &noop{},
-	"perf_event": &noop{},
-	"pids":       &noop{},
-	"systemd":    &noop{},
+	// irrelevant for a sandbox.
+	"devices":    config{ctrlr: &noop{}},
+	"freezer":    config{ctrlr: &noop{}},
+	"hugetlb":    config{ctrlr: &noop{}, optional: true},
+	"perf_event": config{ctrlr: &noop{}},
+	"rdma":       config{ctrlr: &noop{}, optional: true},
+	"systemd":    config{ctrlr: &noop{}},
 }
 
 func setOptionalValueInt(path, name string, val *int64) error {
@@ -196,8 +199,9 @@ func LoadPaths(pid string) (map[string]string, error) {
 	return paths, nil
 }
 
-// Cgroup represents a group inside all controllers. For example: Name='/foo/bar'
-// maps to /sys/fs/cgroup/<controller>/foo/bar on all controllers.
+// Cgroup represents a group inside all controllers. For example:
+//   Name='/foo/bar' maps to /sys/fs/cgroup/<controller>/foo/bar on
+//   all controllers.
 type Cgroup struct {
 	Name    string            `json:"name"`
 	Parents map[string]string `json:"parents"`
@@ -242,16 +246,20 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error {
 
 	// The Cleanup object cleans up partially created cgroups when an error occurs.
 	// Errors occuring during cleanup itself are ignored.
-	clean := specutils.MakeCleanup(func() { _ = c.Uninstall() })
+	clean := cleanup.Make(func() { _ = c.Uninstall() })
 	defer clean.Clean()
 
-	for key, ctrl := range controllers {
+	for key, cfg := range controllers {
 		path := c.makePath(key)
 		if err := os.MkdirAll(path, 0755); err != nil {
+			if cfg.optional && errors.Is(err, syscall.EROFS) {
+				log.Infof("Skipping cgroup %q", key)
+				continue
+			}
 			return err
 		}
 		if res != nil {
-			if err := ctrl.set(res, path); err != nil {
+			if err := cfg.ctrlr.set(res, path); err != nil {
 				return err
 			}
 		}
@@ -321,10 +329,13 @@ func (c *Cgroup) Join() (func(), error) {
 	}
 
 	// Now join the cgroups.
-	for key := range controllers {
+	for key, cfg := range controllers {
 		path := c.makePath(key)
 		log.Debugf("Joining cgroup %q", path)
 		if err := setValue(path, "cgroup.procs", "0"); err != nil {
+			if cfg.optional && os.IsNotExist(err) {
+				continue
+			}
 			return undo, err
 		}
 	}
@@ -375,6 +386,11 @@ func (c *Cgroup) makePath(controllerName string) string {
 	return filepath.Join(cgroupRoot, controllerName, path)
 }
 
+type config struct {
+	ctrlr    controller
+	optional bool
+}
+
 type controller interface {
 	set(*specs.LinuxResources, string) error
 }
@@ -525,3 +541,13 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
 	}
 	return nil
 }
+
+type pids struct{}
+
+func (*pids) set(spec *specs.LinuxResources, path string) error {
+	if spec.Pids == nil {
+		return nil
+	}
+	val := strconv.FormatInt(spec.Pids.Limit, 10)
+	return setValue(path, "pids.max", val)
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 2a88b85a9..af3538ef0 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -31,6 +31,7 @@ go_library(
         "spec.go",
         "start.go",
         "state.go",
+        "statefile.go",
         "syscalls.go",
         "wait.go",
     ],
@@ -43,11 +44,13 @@ go_library(
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/platform",
+        "//pkg/state",
+        "//pkg/state/statefile",
         "//pkg/sync",
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
-        "//runsc/boot/platforms",
         "//runsc/console",
         "//runsc/container",
         "//runsc/flag",
@@ -79,11 +82,11 @@ go_test(
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/kernel/auth",
+        "//pkg/test/testutil",
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/container",
         "//runsc/specutils",
-        "//runsc/testutil",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 0f3da69a0..01204ab4d 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -23,9 +23,10 @@ import (
 
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -82,8 +83,13 @@ type Boot struct {
 	// sandbox (e.g. gofer) and sent through this FD.
 	mountsFD int
 
-	// pidns is set if the sanadbox is in its own pid namespace.
+	// pidns is set if the sandbox is in its own pid namespace.
 	pidns bool
+
+	// attached is set to true to kill the sandbox process when the parent process
+	// terminates. This flag is set when the command execve's itself because
+	// parent death signal doesn't propagate through execve when uid/gid changes.
+	attached bool
 }
 
 // Name implements subcommands.Command.Name.
@@ -118,6 +124,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) {
 	f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
 	f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
 	f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
+	f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
 }
 
 // Execute implements subcommands.Command.Execute.  It starts a sandbox in a
@@ -129,33 +136,36 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	// Ensure that if there is a panic, all goroutine stacks are printed.
-	debug.SetTraceback("all")
+	debug.SetTraceback("system")
 
 	conf := args[0].(*boot.Config)
 
+	if b.attached {
+		// Ensure this process is killed after parent process terminates when
+		// attached mode is enabled. In the unfortunate event that the parent
+		// terminates before this point, this process leaks.
+		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
+			Fatalf("error setting parent death signal: %v", err)
+		}
+	}
+
 	if b.setUpRoot {
 		if err := setUpChroot(b.pidns); err != nil {
 			Fatalf("error setting up chroot: %v", err)
 		}
 
-		if !b.applyCaps {
-			// Remove --setup-root arg to call myself.
-			var args []string
-			for _, arg := range os.Args {
-				if !strings.Contains(arg, "setup-root") {
-					args = append(args, arg)
-				}
-			}
-			if !conf.Rootless {
-				// Note that we've already read the spec from the spec FD, and
-				// we will read it again after the exec call. This works
-				// because the ReadSpecFromFile function seeks to the beginning
-				// of the file before reading.
-				if err := callSelfAsNobody(args); err != nil {
-					Fatalf("%v", err)
-				}
-				panic("callSelfAsNobody must never return success")
+		if !b.applyCaps && !conf.Rootless {
+			// Remove --apply-caps arg to call myself. It has already been done.
+			args := prepareArgs(b.attached, "setup-root")
+
+			// Note that we've already read the spec from the spec FD, and
+			// we will read it again after the exec call. This works
+			// because the ReadSpecFromFile function seeks to the beginning
+			// of the file before reading.
+			if err := callSelfAsNobody(args); err != nil {
+				Fatalf("%v", err)
 			}
+			panic("callSelfAsNobody must never return success")
 		}
 	}
 
@@ -173,7 +183,12 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		if caps == nil {
 			caps = &specs.LinuxCapabilities{}
 		}
-		if conf.Platform == platforms.Ptrace {
+
+		gPlatform, err := platform.Lookup(conf.Platform)
+		if err != nil {
+			Fatalf("loading platform: %v", err)
+		}
+		if gPlatform.Requirements().RequiresCapSysPtrace {
 			// Ptrace platform requires extra capabilities.
 			const c = "CAP_SYS_PTRACE"
 			caps.Bounding = append(caps.Bounding, c)
@@ -181,13 +196,9 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 			caps.Permitted = append(caps.Permitted, c)
 		}
 
-		// Remove --apply-caps arg to call myself.
-		var args []string
-		for _, arg := range os.Args {
-			if !strings.Contains(arg, "setup-root") && !strings.Contains(arg, "apply-caps") {
-				args = append(args, arg)
-			}
-		}
+		// Remove --apply-caps and --setup-root arg to call myself. Both have
+		// already been done.
+		args := prepareArgs(b.attached, "setup-root", "apply-caps")
 
 		// Note that we've already read the spec from the spec FD, and
 		// we will read it again after the exec call. This works
@@ -258,3 +269,22 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	l.Destroy()
 	return subcommands.ExitSuccess
 }
+
+func prepareArgs(attached bool, exclude ...string) []string {
+	var args []string
+	for _, arg := range os.Args {
+		for _, excl := range exclude {
+			if strings.Contains(arg, excl) {
+				goto skip
+			}
+		}
+		args = append(args, arg)
+		if attached && arg == "boot" {
+			// Strategicaly place "--attached" after the command. This is needed
+			// to ensure the new process is killed when the parent process terminates.
+			args = append(args, "--attached")
+		}
+	skip:
+	}
+	return args
+}
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index 0c27f7313..a84067112 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -23,10 +23,10 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/syndtr/gocapability/capability"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func init() {
@@ -85,21 +85,20 @@ func TestCapabilities(t *testing.T) {
 		Inheritable: caps,
 	}
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 
 	// Use --network=host to make sandbox use spec's capabilities.
 	conf.Network = boot.NetworkHost
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := container.Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go
index b5a0ce17d..189244765 100644
--- a/runsc/cmd/chroot.go
+++ b/runsc/cmd/chroot.go
@@ -50,7 +50,7 @@ func pivotRoot(root string) error {
 	// new_root, so after umounting the old_root, we will see only
 	// the new_root in "/".
 	if err := syscall.PivotRoot(".", "."); err != nil {
-		return fmt.Errorf("error changing root filesystem: %v", err)
+		return fmt.Errorf("pivot_root failed, make sure that the root mount has a parent: %v", err)
 	}
 
 	if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil {
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 79965460e..b5de2588b 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,17 +32,20 @@ import (
 
 // Debug implements subcommands.Command for the "debug" command.
 type Debug struct {
-	pid         int
-	stacks      bool
-	signal      int
-	profileHeap string
-	profileCPU  string
-	trace       string
-	strace      string
-	logLevel    string
-	logPackets  string
-	duration    time.Duration
-	ps          bool
+	pid              int
+	stacks           bool
+	signal           int
+	profileHeap      string
+	profileCPU       string
+	profileGoroutine string
+	profileBlock     string
+	profileMutex     string
+	trace            string
+	strace           string
+	logLevel         string
+	logPackets       string
+	duration         time.Duration
+	ps               bool
 }
 
 // Name implements subcommands.Command.
@@ -66,6 +69,9 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
+	f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.")
+	f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
+	f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
 	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
 	f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
 	f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
@@ -147,6 +153,42 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 		log.Infof("Heap profile written to %q", d.profileHeap)
 	}
+	if d.profileGoroutine != "" {
+		f, err := os.Create(d.profileGoroutine)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.GoroutineProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Goroutine profile written to %q", d.profileGoroutine)
+	}
+	if d.profileBlock != "" {
+		f, err := os.Create(d.profileBlock)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.BlockProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Block profile written to %q", d.profileBlock)
+	}
+	if d.profileMutex != "" {
+		f, err := os.Create(d.profileMutex)
+		if err != nil {
+			return Errorf(err.Error())
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.MutexProfile(f); err != nil {
+			return Errorf(err.Error())
+		}
+		log.Infof("Mutex profile written to %q", d.profileMutex)
+	}
 
 	delay := false
 	if d.profileCPU != "" {
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index b184bd402..7d1310c96 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -166,15 +166,33 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 		return Errorf("Error write spec: %v", err)
 	}
 
-	runArgs := container.Args{
+	containerArgs := container.Args{
 		ID:        cid,
 		Spec:      spec,
 		BundleDir: tmpDir,
 		Attached:  true,
 	}
-	ws, err := container.Run(conf, runArgs)
+	ct, err := container.New(conf, containerArgs)
 	if err != nil {
-		return Errorf("running container: %v", err)
+		return Errorf("creating container: %v", err)
+	}
+	defer ct.Destroy()
+
+	if err := ct.Start(conf); err != nil {
+		return Errorf("starting container: %v", err)
+	}
+
+	// Forward signals to init in the container. Thus if we get SIGINT from
+	// ^C, the container gracefully exit, and we can clean up.
+	//
+	// N.B. There is a still a window before this where a signal may kill
+	// this process, skipping cleanup.
+	stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */)
+	defer stopForwarding()
+
+	ws, err := ct.Wait()
+	if err != nil {
+		return Errorf("waiting for container: %v", err)
 	}
 
 	*waitStatus = ws
@@ -237,20 +255,27 @@ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) {
 	for _, cmd := range cmds {
 		log.Debugf("Run %q", cmd)
 		args := strings.Split(cmd, " ")
-		c := exec.Command(args[0], args[1:]...)
-		if err := c.Run(); err != nil {
+		cmd := exec.Command(args[0], args[1:]...)
+		if err := cmd.Run(); err != nil {
+			c.cleanupNet(cid, dev, "", "", "")
 			return nil, fmt.Errorf("failed to run %q: %v", cmd, err)
 		}
 	}
 
-	if err := makeFile("/etc/resolv.conf", "nameserver 8.8.8.8\n", spec); err != nil {
+	resolvPath, err := makeFile("/etc/resolv.conf", "nameserver 8.8.8.8\n", spec)
+	if err != nil {
+		c.cleanupNet(cid, dev, "", "", "")
 		return nil, err
 	}
-	if err := makeFile("/etc/hostname", cid+"\n", spec); err != nil {
+	hostnamePath, err := makeFile("/etc/hostname", cid+"\n", spec)
+	if err != nil {
+		c.cleanupNet(cid, dev, resolvPath, "", "")
 		return nil, err
 	}
 	hosts := fmt.Sprintf("127.0.0.1\tlocalhost\n%s\t%s\n", c.ip, cid)
-	if err := makeFile("/etc/hosts", hosts, spec); err != nil {
+	hostsPath, err := makeFile("/etc/hosts", hosts, spec)
+	if err != nil {
+		c.cleanupNet(cid, dev, resolvPath, hostnamePath, "")
 		return nil, err
 	}
 
@@ -263,19 +288,22 @@ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) {
 	}
 	spec.Linux.Namespaces = append(spec.Linux.Namespaces, netns)
 
-	return func() { c.cleanNet(cid, dev) }, nil
+	return func() { c.cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath) }, nil
 }
 
-func (c *Do) cleanNet(cid, dev string) {
-	veth, peer := deviceNames(cid)
+// cleanupNet tries to cleanup the network setup in setupNet.
+//
+// It may be called when setupNet is only partially complete, in which case it
+// will cleanup as much as possible, logging warnings for the rest.
+//
+// Unfortunately none of this can be automatically cleaned up on process exit,
+// we must do so explicitly.
+func (c *Do) cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath string) {
+	_, peer := deviceNames(cid)
 
 	cmds := []string{
 		fmt.Sprintf("ip link delete %s", peer),
 		fmt.Sprintf("ip netns delete %s", cid),
-
-		fmt.Sprintf("iptables -t nat -D POSTROUTING -s %s/24 -o %s -j MASQUERADE", c.ip, dev),
-		fmt.Sprintf("iptables -D FORWARD -i %s -o %s -j ACCEPT", dev, veth),
-		fmt.Sprintf("iptables -D FORWARD -o %s -i %s -j ACCEPT", dev, veth),
 	}
 
 	for _, cmd := range cmds {
@@ -286,6 +314,10 @@ func (c *Do) cleanNet(cid, dev string) {
 			log.Warningf("Failed to run %q: %v", cmd, err)
 		}
 	}
+
+	tryRemove(resolvPath)
+	tryRemove(hostnamePath)
+	tryRemove(hostsPath)
 }
 
 func deviceNames(cid string) (string, string) {
@@ -306,13 +338,16 @@ func defaultDevice() (string, error) {
 	return parts[4], nil
 }
 
-func makeFile(dest, content string, spec *specs.Spec) error {
+func makeFile(dest, content string, spec *specs.Spec) (string, error) {
 	tmpFile, err := ioutil.TempFile("", filepath.Base(dest))
 	if err != nil {
-		return err
+		return "", err
 	}
 	if _, err := tmpFile.WriteString(content); err != nil {
-		return err
+		if err := os.Remove(tmpFile.Name()); err != nil {
+			log.Warningf("Failed to remove %q: %v", tmpFile, err)
+		}
+		return "", err
 	}
 	spec.Mounts = append(spec.Mounts, specs.Mount{
 		Source:      tmpFile.Name(),
@@ -320,7 +355,17 @@ func makeFile(dest, content string, spec *specs.Spec) error {
 		Type:        "bind",
 		Options:     []string{"ro"},
 	})
-	return nil
+	return tmpFile.Name(), nil
+}
+
+func tryRemove(path string) {
+	if path == "" {
+		return
+	}
+
+	if err := os.Remove(path); err != nil {
+		log.Warningf("Failed to remove %q: %v", path, err)
+	}
 }
 
 func calculatePeerIP(ip string) (string, error) {
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 6e06f3c0f..10448a759 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -168,7 +168,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Start with root mount, then add any other additional mount as needed.
 	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
 	ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
-		ROMount:      spec.Root.Readonly,
+		ROMount:      spec.Root.Readonly || conf.Overlay,
 		PanicOnWrite: g.panicOnWrite,
 	})
 	if err != nil {
@@ -181,7 +181,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	for _, m := range spec.Mounts {
 		if specutils.Is9PMount(m) {
 			cfg := fsgofer.Config{
-				ROMount:      isReadonlyMount(m.Options),
+				ROMount:      isReadonlyMount(m.Options) || conf.Overlay,
 				PanicOnWrite: g.panicOnWrite,
 				HostUDS:      conf.FSGoferHostUDS,
 			}
@@ -272,9 +272,8 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 
 	root := spec.Root.Path
 	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		// FIXME: runsc can't be re-executed without
-		// /proc, so we create a tmpfs mount, mount ./proc and ./root
-		// there, then move this mount to the root and after
+		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
+		// mount ./proc and ./root there, then move this mount to the root and after
 		// setCapsAndCallSelf, runsc will chroot into /root.
 		//
 		// We need a directory to construct a new root and we know that
@@ -335,7 +334,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 
 	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
 		if err := pivotRoot("/proc"); err != nil {
-			Fatalf("faild to change the root file system: %v", err)
+			Fatalf("failed to change the root file system: %v", err)
 		}
 		if err := os.Chdir("/"); err != nil {
 			Fatalf("failed to change working directory")
diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go
index c7d210140..cd85dabbb 100644
--- a/runsc/cmd/help.go
+++ b/runsc/cmd/help.go
@@ -65,16 +65,10 @@ func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}
 	switch f.NArg() {
 	case 0:
 		fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name())
-		fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open
-Container Initiative (OCI) format. Applications run by runsc are run in an
-isolated gVisor sandbox that emulates a Linux environment.
+		fmt.Fprintf(h.cdr.Output, `runsc is the gVisor container runtime.
 
-gVisor is a user-space kernel, written in Go, that implements a substantial
-portion of the Linux system call interface. It provides an additional layer
-of isolation between running applications and the host operating system.
-
-Functionality is provided by subcommands. For additonal help on individual
-subcommands use "%s %s <subcommand>".
+Functionality is provided by subcommands. For help with a specific subcommand,
+use "%s %s <subcommand>".
 
 `, h.cdr.Name(), h.Name())
 		h.cdr.VisitGroups(func(g *subcommands.CommandGroup) {
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
index 8e2b36e85..a2b0a4b14 100644
--- a/runsc/cmd/spec.go
+++ b/runsc/cmd/spec.go
@@ -16,6 +16,7 @@ package cmd
 
 import (
 	"context"
+	"fmt"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@@ -24,7 +25,8 @@ import (
 	"gvisor.dev/gvisor/runsc/flag"
 )
 
-var specTemplate = []byte(`{
+func genSpec(cwd string) []byte {
+	var template = fmt.Sprintf(`{
 	"ociVersion": "1.0.0",
 	"process": {
 		"terminal": true,
@@ -39,7 +41,7 @@ var specTemplate = []byte(`{
 			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
 			"TERM=xterm"
 		],
-		"cwd": "/",
+		"cwd": "%s",
 		"capabilities": {
 			"bounding": [
 				"CAP_AUDIT_WRITE",
@@ -123,11 +125,15 @@ var specTemplate = []byte(`{
 			}
 		]
 	}
-}`)
+}`, cwd)
+
+	return []byte(template)
+}
 
 // Spec implements subcommands.Command for the "spec" command.
 type Spec struct {
 	bundle string
+	cwd    string
 }
 
 // Name implements subcommands.Command.Name.
@@ -165,6 +171,8 @@ EXAMPLE:
 // SetFlags implements subcommands.Command.SetFlags.
 func (s *Spec) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
+	f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+
+		"this value MUST be an absolute path")
 }
 
 // Execute implements subcommands.Command.Execute.
@@ -174,7 +182,9 @@ func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Fatalf("file %q already exists", confPath)
 	}
 
-	if err := ioutil.WriteFile(confPath, specTemplate, 0664); err != nil {
+	var spec = genSpec(s.cwd)
+
+	if err := ioutil.WriteFile(confPath, spec, 0664); err != nil {
 		Fatalf("writing to %q: %v", confPath, err)
 	}
 
diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go
new file mode 100644
index 000000000..e6f1907da
--- /dev/null
+++ b/runsc/cmd/statefile.go
@@ -0,0 +1,143 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/state/statefile"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Statefile implements subcommands.Command for the "statefile" command.
+type Statefile struct {
+	list   bool
+	get    string
+	key    string
+	output string
+	html   bool
+}
+
+// Name implements subcommands.Command.
+func (*Statefile) Name() string {
+	return "state"
+}
+
+// Synopsis implements subcommands.Command.
+func (*Statefile) Synopsis() string {
+	return "shows information about a statefile"
+}
+
+// Usage implements subcommands.Command.
+func (*Statefile) Usage() string {
+	return `statefile [flags] <statefile>`
+}
+
+// SetFlags implements subcommands.Command.
+func (s *Statefile) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&s.list, "list", false, "lists the metdata in the statefile.")
+	f.StringVar(&s.get, "get", "", "extracts the given metadata key.")
+	f.StringVar(&s.key, "key", "", "the integrity key for the file.")
+	f.StringVar(&s.output, "output", "", "target to write the result.")
+	f.BoolVar(&s.html, "html", false, "outputs in HTML format.")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// Check arguments.
+	if s.list && s.get != "" {
+		Fatalf("error: can't specify -list and -get simultaneously.")
+	}
+
+	// Setup output.
+	var output = os.Stdout // Default.
+	if s.output != "" {
+		f, err := os.OpenFile(s.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
+		if err != nil {
+			Fatalf("error opening output: %v", err)
+		}
+		defer func() {
+			if err := f.Close(); err != nil {
+				Fatalf("error flushing output: %v", err)
+			}
+		}()
+		output = f
+	}
+
+	// Open the file.
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+	input, err := os.Open(f.Arg(0))
+	if err != nil {
+		Fatalf("error opening input: %v\n", err)
+	}
+
+	if s.html {
+		fmt.Fprintf(output, "<html><body>\n")
+		defer fmt.Fprintf(output, "</body></html>\n")
+	}
+
+	// Dump the full file?
+	if !s.list && s.get == "" {
+		var key []byte
+		if s.key != "" {
+			key = []byte(s.key)
+		}
+		rc, _, err := statefile.NewReader(input, key)
+		if err != nil {
+			Fatalf("error parsing statefile: %v", err)
+		}
+		if err := state.PrettyPrint(output, rc, s.html); err != nil {
+			Fatalf("error printing state: %v", err)
+		}
+		return subcommands.ExitSuccess
+	}
+
+	// Load just the metadata.
+	metadata, err := statefile.MetadataUnsafe(input)
+	if err != nil {
+		Fatalf("error reading metadata: %v", err)
+	}
+
+	// Is it a single key?
+	if s.get != "" {
+		val, ok := metadata[s.get]
+		if !ok {
+			Fatalf("metadata key %s: not found", s.get)
+		}
+		fmt.Fprintf(output, "%s\n", val)
+		return subcommands.ExitSuccess
+	}
+
+	// List all keys.
+	if s.html {
+		fmt.Fprintf(output, " <ul>\n")
+		defer fmt.Fprintf(output, " </ul>\n")
+	}
+	for key := range metadata {
+		if s.html {
+			fmt.Fprintf(output, "  <li>%s</li>\n", key)
+		} else {
+			fmt.Fprintf(output, "%s\n", key)
+		}
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go
index 7072547be..a37d66139 100644
--- a/runsc/cmd/syscalls.go
+++ b/runsc/cmd/syscalls.go
@@ -32,9 +32,10 @@ import (
 
 // Syscalls implements subcommands.Command for the "syscalls" command.
 type Syscalls struct {
-	output string
-	os     string
-	arch   string
+	format   string
+	os       string
+	arch     string
+	filename string
 }
 
 // CompatibilityInfo is a map of system and architecture to compatibility doc.
@@ -95,16 +96,17 @@ func (*Syscalls) Usage() string {
 
 // SetFlags implements subcommands.Command.SetFlags.
 func (s *Syscalls) SetFlags(f *flag.FlagSet) {
-	f.StringVar(&s.output, "o", "table", "Output format (table, csv, json).")
+	f.StringVar(&s.format, "format", "table", "Output format (table, csv, json).")
 	f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)")
 	f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).")
+	f.StringVar(&s.filename, "filename", "", "Output filename (otherwise stdout).")
 }
 
 // Execute implements subcommands.Command.Execute.
 func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	out, ok := outputMap[s.output]
+	out, ok := outputMap[s.format]
 	if !ok {
-		Fatalf("Unsupported output format %q", s.output)
+		Fatalf("Unsupported output format %q", s.format)
 	}
 
 	// Build map of all supported architectures.
@@ -124,7 +126,14 @@ func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface
 		Fatalf("%v", err)
 	}
 
-	if err := out(os.Stdout, info); err != nil {
+	w := os.Stdout // Default.
+	if s.filename != "" {
+		w, err = os.OpenFile(s.filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
+		if err != nil {
+			Fatalf("Error opening %q: %v", s.filename, err)
+		}
+	}
+	if err := out(w, info); err != nil {
 		Fatalf("Error writing output: %v", err)
 	}
 
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 0aaeea3a8..49cfb0837 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -15,8 +15,11 @@ go_library(
         "//test:__subpackages__",
     ],
     deps = [
+        "//pkg/abi/linux",
+        "//pkg/cleanup",
         "//pkg/log",
         "//pkg/sentry/control",
+        "//pkg/sentry/sighandling",
         "//pkg/sync",
         "//runsc/boot",
         "//runsc/cgroup",
@@ -33,33 +36,36 @@ go_test(
     size = "large",
     srcs = [
         "console_test.go",
+        "container_norace_test.go",
+        "container_race_test.go",
         "container_test.go",
         "multi_container_test.go",
         "shared_volume_test.go",
     ],
     data = [
         "//runsc",
-        "//runsc/container/test_app",
+        "//test/cmd/test_app",
     ],
     library = ":container",
-    shard_count = 5,
+    shard_count = 10,
     tags = [
         "requires-kvm",
     ],
     deps = [
         "//pkg/abi/linux",
         "//pkg/bits",
+        "//pkg/cleanup",
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sync",
+        "//pkg/test/testutil",
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/boot/platforms",
         "//runsc/specutils",
-        "//runsc/testutil",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index c2518d52b..3813c6b93 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -29,9 +29,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // socketPath creates a path inside bundleDir and ensures that the returned
@@ -58,25 +58,26 @@ func socketPath(bundleDir string) (string, error) {
 }
 
 // createConsoleSocket creates a socket at the given path that will receive a
-// console fd from the sandbox. If no error occurs, it returns the server
-// socket and a cleanup function.
-func createConsoleSocket(path string) (*unet.ServerSocket, func() error, error) {
+// console fd from the sandbox. If an error occurs, t.Fatalf will be called.
+// The function returning should be deferred as cleanup.
+func createConsoleSocket(t *testing.T, path string) (*unet.ServerSocket, func()) {
+	t.Helper()
 	srv, err := unet.BindAndListen(path, false)
 	if err != nil {
-		return nil, nil, fmt.Errorf("error binding and listening to socket %q: %v", path, err)
+		t.Fatalf("error binding and listening to socket %q: %v", path, err)
 	}
 
-	cleanup := func() error {
+	cleanup := func() {
+		// Log errors; nothing can be done.
 		if err := srv.Close(); err != nil {
-			return fmt.Errorf("error closing socket %q: %v", path, err)
+			t.Logf("error closing socket %q: %v", path, err)
 		}
 		if err := os.Remove(path); err != nil {
-			return fmt.Errorf("error removing socket %q: %v", path, err)
+			t.Logf("error removing socket %q: %v", path, err)
 		}
-		return nil
 	}
 
-	return srv, cleanup, nil
+	return srv, cleanup
 }
 
 // receiveConsolePTY accepts a connection on the server socket and reads fds.
@@ -118,63 +119,59 @@ func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) {
 
 // Test that an pty FD is sent over the console socket if one is provided.
 func TestConsoleSocket(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-		spec := testutil.NewSpecWithArgs("true")
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("true")
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		sock, err := socketPath(bundleDir)
-		if err != nil {
-			t.Fatalf("error getting socket path: %v", err)
-		}
-		srv, cleanup, err := createConsoleSocket(sock)
-		if err != nil {
-			t.Fatalf("error creating socket at %q: %v", sock, err)
-		}
-		defer cleanup()
-
-		// Create the container and pass the socket name.
-		args := Args{
-			ID:            testutil.UniqueContainerID(),
-			Spec:          spec,
-			BundleDir:     bundleDir,
-			ConsoleSocket: sock,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
+			sock, err := socketPath(bundleDir)
+			if err != nil {
+				t.Fatalf("error getting socket path: %v", err)
+			}
+			srv, cleanup := createConsoleSocket(t, sock)
+			defer cleanup()
+
+			// Create the container and pass the socket name.
+			args := Args{
+				ID:            testutil.RandomContainerID(),
+				Spec:          spec,
+				BundleDir:     bundleDir,
+				ConsoleSocket: sock,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
 
-		// Make sure we get a console PTY.
-		ptyMaster, err := receiveConsolePTY(srv)
-		if err != nil {
-			t.Fatalf("error receiving console FD: %v", err)
-		}
-		ptyMaster.Close()
+			// Make sure we get a console PTY.
+			ptyMaster, err := receiveConsolePTY(srv)
+			if err != nil {
+				t.Fatalf("error receiving console FD: %v", err)
+			}
+			ptyMaster.Close()
+		})
 	}
 }
 
 // Test that job control signals work on a console created with "exec -ti".
 func TestJobControlSignalExec(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -286,32 +283,28 @@ func TestJobControlSignalExec(t *testing.T) {
 
 // Test that job control signals work on a console created with "run -ti".
 func TestJobControlSignalRootContainer(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	// Don't let bash execute from profile or rc files, otherwise our PID
 	// counts get messed up.
 	spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc")
 	spec.Process.Terminal = true
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	sock, err := socketPath(bundleDir)
 	if err != nil {
 		t.Fatalf("error getting socket path: %v", err)
 	}
-	srv, cleanup, err := createConsoleSocket(sock)
-	if err != nil {
-		t.Fatalf("error creating socket at %q: %v", sock, err)
-	}
+	srv, cleanup := createConsoleSocket(t, sock)
 	defer cleanup()
 
 	// Create the container and pass the socket name.
 	args := Args{
-		ID:            testutil.UniqueContainerID(),
+		ID:            testutil.RandomContainerID(),
 		Spec:          spec,
 		BundleDir:     bundleDir,
 		ConsoleSocket: sock,
@@ -333,13 +326,13 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	// file. Writes after a certain point will block unless we drain the
 	// PTY, so we must continually copy from it.
 	//
-	// We log the output to stdout for debugabilitly, and also to a buffer,
+	// We log the output to stderr for debugabilitly, and also to a buffer,
 	// since we wait on particular output from bash below. We use a custom
 	// blockingBuffer which is thread-safe and also blocks on Read calls,
 	// which makes this a suitable Reader for WaitUntilRead.
 	ptyBuf := newBlockingBuffer()
 	tee := io.TeeReader(ptyMaster, ptyBuf)
-	go io.Copy(os.Stdout, tee)
+	go io.Copy(os.Stderr, tee)
 
 	// Start the container.
 	if err := c.Start(conf); err != nil {
@@ -368,7 +361,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 		{PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
-		t.Fatal(err)
+		t.Fatalf("error waiting for processes: %v", err)
 	}
 
 	// Execute sleep via the terminal.
@@ -377,7 +370,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	// Wait for sleep to start.
 	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
-		t.Fatal(err)
+		t.Fatalf("error waiting for processes: %v", err)
 	}
 
 	// Reset the pty buffer, so there is less output for us to scan later.
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 68782c4be..6d297d0df 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -17,11 +17,11 @@ package container
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"os/exec"
-	"os/signal"
 	"regexp"
 	"strconv"
 	"strings"
@@ -30,8 +30,11 @@ import (
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/sighandling"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/cgroup"
 	"gvisor.dev/gvisor/runsc/sandbox"
@@ -273,7 +276,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	}
 
 	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
-		return nil, fmt.Errorf("creating container root directory: %v", err)
+		return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
 	}
 
 	c := &Container{
@@ -291,7 +294,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	}
 	// The Cleanup object cleans up partially created containers when an error
 	// occurs. Any errors occurring during cleanup itself are ignored.
-	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+	cu := cleanup.Make(func() { _ = c.Destroy() })
 	defer cu.Clean()
 
 	// Lock the container metadata file to prevent concurrent creations of
@@ -400,7 +403,7 @@ func (c *Container) Start(conf *boot.Config) error {
 	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+	unlock := cleanup.Make(func() { c.Saver.unlock() })
 	defer unlock.Clean()
 
 	if err := c.requireStatus("start", Created); err != nil {
@@ -420,7 +423,7 @@ func (c *Container) Start(conf *boot.Config) error {
 			return err
 		}
 	} else {
-		// Join cgroup to strt gofer process to ensure it's part of the cgroup from
+		// Join cgroup to start gofer process to ensure it's part of the cgroup from
 		// the start (and all their children processes).
 		if err := runInCgroup(c.Sandbox.Cgroup, func() error {
 			// Create the gofer process.
@@ -504,7 +507,7 @@ func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
 	}
 	// Clean up partially created container if an error occurs.
 	// Any errors returned by Destroy() itself are ignored.
-	cu := specutils.MakeCleanup(func() {
+	cu := cleanup.Make(func() {
 		c.Destroy()
 	})
 	defer cu.Clean()
@@ -620,21 +623,15 @@ func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error {
 // forwarding signals.
 func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
 	log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
-	sigCh := make(chan os.Signal, 1)
-	signal.Notify(sigCh)
-	go func() {
-		for s := range sigCh {
-			log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", s, c.ID, pid, fgProcess)
-			if err := c.Sandbox.SignalProcess(c.ID, pid, s.(syscall.Signal), fgProcess); err != nil {
-				log.Warningf("error forwarding signal %d to container %q: %v", s, c.ID, err)
-			}
+	stop := sighandling.StartSignalForwarding(func(sig linux.Signal) {
+		log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", sig, c.ID, pid, fgProcess)
+		if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.Signal(sig), fgProcess); err != nil {
+			log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err)
 		}
-		log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
-	}()
-
+	})
 	return func() {
-		signal.Stop(sigCh)
-		close(sigCh)
+		log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+		stop()
 	}
 }
 
@@ -1066,27 +1063,19 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
 
 // adjustGoferOOMScoreAdj sets the oom_store_adj for the container's gofer.
 func (c *Container) adjustGoferOOMScoreAdj() error {
-	if c.GoferPid != 0 && c.Spec.Process.OOMScoreAdj != nil {
-		if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
-			// Ignore NotExist error because it can be returned when the sandbox
-			// exited while OOM score was being adjusted.
-			if !os.IsNotExist(err) {
-				return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
-			}
-			log.Warningf("Gofer process (%d) not found setting oom_score_adj", c.GoferPid)
-		}
+	if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil {
+		return nil
 	}
-
-	return nil
+	return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj)
 }
 
 // adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
 // oom_score_adj is set to the lowest oom_score_adj among the containers
 // running in the sandbox.
 //
-// TODO(gvisor.dev/issue/512): This call could race with other containers being
+// TODO(gvisor.dev/issue/238): This call could race with other containers being
 // created at the same time and end up setting the wrong oom_score_adj to the
-// sandbox.
+// sandbox. Use rpc client to synchronize.
 func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) error {
 	containers, err := loadSandbox(rootDir, s.ID)
 	if err != nil {
@@ -1154,29 +1143,29 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool)
 	}
 
 	// Set the lowest of all containers oom_score_adj to the sandbox.
-	if err := setOOMScoreAdj(s.Pid, lowScore); err != nil {
-		// Ignore NotExist error because it can be returned when the sandbox
-		// exited while OOM score was being adjusted.
-		if !os.IsNotExist(err) {
-			return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err)
-		}
-		log.Warningf("Sandbox process (%d) not found setting oom_score_adj", s.Pid)
-	}
-
-	return nil
+	return setOOMScoreAdj(s.Pid, lowScore)
 }
 
 // setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
 // /proc must be available and mounted read-write. scoreAdj should be between
-// -1000 and 1000.
+// -1000 and 1000. It's a noop if the process has already exited.
 func setOOMScoreAdj(pid int, scoreAdj int) error {
 	f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
 	if err != nil {
+		// Ignore NotExist errors because it can race with process exit.
+		if os.IsNotExist(err) {
+			log.Warningf("Process (%d) not found setting oom_score_adj", pid)
+			return nil
+		}
 		return err
 	}
 	defer f.Close()
 	if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
-		return err
+		if errors.Is(err, syscall.ESRCH) {
+			log.Warningf("Process (%d) exited while setting oom_score_adj", pid)
+			return nil
+		}
+		return fmt.Errorf("setting oom_score_adj to %q: %v", scoreAdj, err)
 	}
 	return nil
 }
diff --git a/test/root/testdata/httpd.go b/runsc/container/container_norace_test.go
index 45d5e33d4..838c1e20a 100644
--- a/test/root/testdata/httpd.go
+++ b/runsc/container/container_norace_test.go
@@ -12,21 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package testdata
+// +build !race
 
-// Httpd is a JSON config for an httpd container.
-const Httpd = `
-{
-  "metadata": {
-    "name": "httpd"
-  },
-  "image":{
-    "image": "httpd"
-  },
-  "mounts": [
-  ],
-  "linux": {
-  },
-  "log_path": "httpd.log"
-}
-`
+package container
+
+// Allow both kvm and ptrace for non-race builds.
+var platformOptions = []configOption{ptrace, kvm}
diff --git a/runsc/container/container_race_test.go b/runsc/container/container_race_test.go
new file mode 100644
index 000000000..9fb4c4fc0
--- /dev/null
+++ b/runsc/container/container_race_test.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package container
+
+// Only enabled ptrace with race builds.
+var platformOptions = []configOption{ptrace}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 04a7dc237..e7715b6f7 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -39,10 +39,10 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // waitForProcessList waits for the given process list to show up in the container.
@@ -71,6 +71,7 @@ func waitForProcessCount(cont *Container, want int) error {
 			return &backoff.PermanentError{Err: err}
 		}
 		if got := len(pss); got != want {
+			log.Infof("Waiting for process count to reach %d. Current: %d", want, got)
 			return fmt.Errorf("wrong process count, got: %d, want: %d", got, want)
 		}
 		return nil
@@ -123,23 +124,6 @@ func procListsEqual(got, want []*control.Process) (bool, error) {
 	return true, nil
 }
 
-// getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the
-// test for equality. This is because we already confirmed that exec occurred.
-func getAndCheckProcLists(cont *Container, want []*control.Process) error {
-	got, err := cont.Processes()
-	if err != nil {
-		return fmt.Errorf("error getting process data from container: %v", err)
-	}
-	equal, err := procListsEqual(got, want)
-	if err != nil {
-		return err
-	}
-	if equal {
-		return nil
-	}
-	return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
-}
-
 func procListToString(pl []*control.Process) string {
 	strs := make([]string, 0, len(pl))
 	for _, p := range pl {
@@ -231,16 +215,15 @@ func readOutputNum(file string, position int) (int, error) {
 // run starts the sandbox and waits for it to exit, checking that the
 // application succeeded.
 func run(spec *specs.Spec, conf *boot.Config) error {
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		return fmt.Errorf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create, start and wait for the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 		Attached:  true,
@@ -259,39 +242,64 @@ type configOption int
 
 const (
 	overlay configOption = iota
+	ptrace
 	kvm
 	nonExclusiveFS
 )
 
-var noOverlay = []configOption{kvm, nonExclusiveFS}
-var all = append(noOverlay, overlay)
+var (
+	noOverlay = append(platformOptions, nonExclusiveFS)
+	all       = append(noOverlay, overlay)
+)
 
 // configs generates different configurations to run tests.
-func configs(opts ...configOption) []*boot.Config {
+func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 	// Always load the default config.
-	cs := []*boot.Config{testutil.TestConfig()}
-
+	cs := make(map[string]*boot.Config)
 	for _, o := range opts {
-		c := testutil.TestConfig()
 		switch o {
 		case overlay:
+			c := testutil.TestConfig(t)
 			c.Overlay = true
+			cs["overlay"] = c
+		case ptrace:
+			c := testutil.TestConfig(t)
+			c.Platform = platforms.Ptrace
+			cs["ptrace"] = c
 		case kvm:
-			// TODO(b/112165693): KVM tests are flaky. Disable until fixed.
-			continue
-
+			c := testutil.TestConfig(t)
 			c.Platform = platforms.KVM
+			cs["kvm"] = c
 		case nonExclusiveFS:
+			c := testutil.TestConfig(t)
 			c.FileAccess = boot.FileAccessShared
+			cs["non-exclusive"] = c
 		default:
 			panic(fmt.Sprintf("unknown config option %v", o))
-
 		}
-		cs = append(cs, c)
 	}
 	return cs
 }
 
+func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
+	vfs1 := configs(t, opts...)
+
+	var optsVFS2 []configOption
+	for _, opt := range opts {
+		// TODO(gvisor.dev/issue/1487): Enable overlay tests.
+		if opt != overlay {
+			optsVFS2 = append(optsVFS2, opt)
+		}
+	}
+
+	for key, value := range configs(t, optsVFS2...) {
+		value.VFS2 = true
+		vfs1[key+"VFS2"] = value
+	}
+
+	return vfs1
+}
+
 // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
 // It verifies after each step that the container can be loaded from disk, and
 // has the correct status.
@@ -301,133 +309,133 @@ func TestLifecycle(t *testing.T) {
 	childReaper.Start()
 	defer childReaper.Stop()
 
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-		// The container will just sleep for a long time.  We will kill it before
-		// it finishes sleeping.
-		spec := testutil.NewSpecWithArgs("sleep", "100")
-
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
-
-		// expectedPL lists the expected process state of the container.
-		expectedPL := []*control.Process{
-			{
-				UID:     0,
-				PID:     1,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{1},
-			},
-		}
-		// Create the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			// The container will just sleep for a long time.  We will kill it before
+			// it finishes sleeping.
+			spec := testutil.NewSpecWithArgs("sleep", "100")
 
-		// Load the container from disk and check the status.
-		c, err = Load(rootDir, args.ID)
-		if err != nil {
-			t.Fatalf("error loading container: %v", err)
-		}
-		if got, want := c.Status, Created; got != want {
-			t.Errorf("container status got %v, want %v", got, want)
-		}
+			rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// List should return the container id.
-		ids, err := List(rootDir)
-		if err != nil {
-			t.Fatalf("error listing containers: %v", err)
-		}
-		if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) {
-			t.Errorf("container list got %v, want %v", got, want)
-		}
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+			}
+			// Create the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
 
-		// Start the container.
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
+			if err != nil {
+				t.Fatalf("error loading container: %v", err)
+			}
+			if got, want := c.Status, Created; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
 
-		// Load the container from disk and check the status.
-		c, err = Load(rootDir, args.ID)
-		if err != nil {
-			t.Fatalf("error loading container: %v", err)
-		}
-		if got, want := c.Status, Running; got != want {
-			t.Errorf("container status got %v, want %v", got, want)
-		}
+			// List should return the container id.
+			ids, err := List(rootDir)
+			if err != nil {
+				t.Fatalf("error listing containers: %v", err)
+			}
+			if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) {
+				t.Errorf("container list got %v, want %v", got, want)
+			}
 
-		// Verify that "sleep 100" is running.
-		if err := waitForProcessList(c, expectedPL); err != nil {
-			t.Error(err)
-		}
+			// Start the container.
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Wait on the container.
-		var wg sync.WaitGroup
-		wg.Add(1)
-		ch := make(chan struct{})
-		go func() {
-			ch <- struct{}{}
-			ws, err := c.Wait()
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
 			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
+				t.Fatalf("error loading container: %v", err)
 			}
-			if got, want := ws.Signal(), syscall.SIGTERM; got != want {
-				t.Fatalf("got signal %v, want %v", got, want)
+			if got, want := c.Status, Running; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
 			}
-			wg.Done()
-		}()
 
-		// Wait a bit to ensure that we've started waiting on the
-		// container before we signal.
-		<-ch
-		time.Sleep(100 * time.Millisecond)
-		// Send the container a SIGTERM which will cause it to stop.
-		if err := c.SignalContainer(syscall.SIGTERM, false); err != nil {
-			t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err)
-		}
-		// Wait for it to die.
-		wg.Wait()
+			// Verify that "sleep 100" is running.
+			if err := waitForProcessList(c, expectedPL); err != nil {
+				t.Error(err)
+			}
 
-		// Load the container from disk and check the status.
-		c, err = Load(rootDir, args.ID)
-		if err != nil {
-			t.Fatalf("error loading container: %v", err)
-		}
-		if got, want := c.Status, Stopped; got != want {
-			t.Errorf("container status got %v, want %v", got, want)
-		}
+			// Wait on the container.
+			ch := make(chan error)
+			go func() {
+				ws, err := c.Wait()
+				if err != nil {
+					ch <- err
+				}
+				if got, want := ws.Signal(), syscall.SIGTERM; got != want {
+					ch <- fmt.Errorf("got signal %v, want %v", got, want)
+				}
+				ch <- nil
+			}()
 
-		// Destroy the container.
-		if err := c.Destroy(); err != nil {
-			t.Fatalf("error destroying container: %v", err)
-		}
+			// Wait a bit to ensure that we've started waiting on
+			// the container before we signal.
+			time.Sleep(time.Second)
 
-		// List should not return the container id.
-		ids, err = List(rootDir)
-		if err != nil {
-			t.Fatalf("error listing containers: %v", err)
-		}
-		if len(ids) != 0 {
-			t.Errorf("expected container list to be empty, but got %v", ids)
-		}
+			// Send the container a SIGTERM which will cause it to stop.
+			if err := c.SignalContainer(syscall.SIGTERM, false); err != nil {
+				t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err)
+			}
 
-		// Loading the container by id should fail.
-		if _, err = Load(rootDir, args.ID); err == nil {
-			t.Errorf("expected loading destroyed container to fail, but it did not")
-		}
+			// Wait for it to die.
+			if err := <-ch; err != nil {
+				t.Fatalf("error waiting for container: %v", err)
+			}
+
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
+			if err != nil {
+				t.Fatalf("error loading container: %v", err)
+			}
+			if got, want := c.Status, Stopped; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
+
+			// Destroy the container.
+			if err := c.Destroy(); err != nil {
+				t.Fatalf("error destroying container: %v", err)
+			}
+
+			// List should not return the container id.
+			ids, err = List(rootDir)
+			if err != nil {
+				t.Fatalf("error listing containers: %v", err)
+			}
+			if len(ids) != 0 {
+				t.Errorf("expected container list to be empty, but got %v", ids)
+			}
+
+			// Loading the container by id should fail.
+			if _, err = Load(rootDir, args.ID); err == nil {
+				t.Errorf("expected loading destroyed container to fail, but it did not")
+			}
+		})
 	}
 }
 
@@ -436,12 +444,14 @@ func TestExePath(t *testing.T) {
 	// Create two directories that will be prepended to PATH.
 	firstPath, err := ioutil.TempDir(testutil.TmpDir(), "first")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error creating temporary directory: %v", err)
 	}
+	defer os.RemoveAll(firstPath)
 	secondPath, err := ioutil.TempDir(testutil.TmpDir(), "second")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error creating temporary directory: %v", err)
 	}
+	defer os.RemoveAll(secondPath)
 
 	// Create two minimal executables in the second path, two of which
 	// will be masked by files in first path.
@@ -449,11 +459,11 @@ func TestExePath(t *testing.T) {
 		path := filepath.Join(secondPath, p)
 		f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0777)
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("error opening path: %v", err)
 		}
 		defer f.Close()
 		if _, err := io.WriteString(f, "#!/bin/true\n"); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error writing contents: %v", err)
 		}
 	}
 
@@ -462,7 +472,7 @@ func TestExePath(t *testing.T) {
 	nonExecutable := filepath.Join(firstPath, "masked1")
 	f2, err := os.OpenFile(nonExecutable, os.O_CREATE|os.O_EXCL, 0666)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error opening file: %v", err)
 	}
 	f2.Close()
 
@@ -470,85 +480,95 @@ func TestExePath(t *testing.T) {
 	// executable in the second.
 	nonRegular := filepath.Join(firstPath, "masked2")
 	if err := os.Mkdir(nonRegular, 0777); err != nil {
-		t.Fatal(err)
-	}
-
-	for _, conf := range configs(overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-		for _, test := range []struct {
-			path    string
-			success bool
-		}{
-			{path: "true", success: true},
-			{path: "bin/true", success: true},
-			{path: "/bin/true", success: true},
-			{path: "thisfiledoesntexit", success: false},
-			{path: "bin/thisfiledoesntexit", success: false},
-			{path: "/bin/thisfiledoesntexit", success: false},
-
-			{path: "unmasked", success: true},
-			{path: filepath.Join(firstPath, "unmasked"), success: false},
-			{path: filepath.Join(secondPath, "unmasked"), success: true},
-
-			{path: "masked1", success: true},
-			{path: filepath.Join(firstPath, "masked1"), success: false},
-			{path: filepath.Join(secondPath, "masked1"), success: true},
-
-			{path: "masked2", success: true},
-			{path: filepath.Join(firstPath, "masked2"), success: false},
-			{path: filepath.Join(secondPath, "masked2"), success: true},
-		} {
-			spec := testutil.NewSpecWithArgs(test.path)
-			spec.Process.Env = []string{
-				fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")),
-			}
-
-			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-			if err != nil {
-				t.Fatalf("exec: %s, error setting up container: %v", test.path, err)
-			}
-
-			args := Args{
-				ID:        testutil.UniqueContainerID(),
-				Spec:      spec,
-				BundleDir: bundleDir,
-				Attached:  true,
-			}
-			ws, err := Run(conf, args)
+		t.Fatalf("error making directory: %v", err)
+	}
+
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			for _, test := range []struct {
+				path    string
+				success bool
+			}{
+				{path: "true", success: true},
+				{path: "bin/true", success: true},
+				{path: "/bin/true", success: true},
+				{path: "thisfiledoesntexit", success: false},
+				{path: "bin/thisfiledoesntexit", success: false},
+				{path: "/bin/thisfiledoesntexit", success: false},
+
+				{path: "unmasked", success: true},
+				{path: filepath.Join(firstPath, "unmasked"), success: false},
+				{path: filepath.Join(secondPath, "unmasked"), success: true},
+
+				{path: "masked1", success: true},
+				{path: filepath.Join(firstPath, "masked1"), success: false},
+				{path: filepath.Join(secondPath, "masked1"), success: true},
+
+				{path: "masked2", success: true},
+				{path: filepath.Join(firstPath, "masked2"), success: false},
+				{path: filepath.Join(secondPath, "masked2"), success: true},
+			} {
+				t.Run(fmt.Sprintf("path=%s,success=%t", test.path, test.success), func(t *testing.T) {
+					spec := testutil.NewSpecWithArgs(test.path)
+					spec.Process.Env = []string{
+						fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")),
+					}
 
-			os.RemoveAll(rootDir)
-			os.RemoveAll(bundleDir)
+					_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+					if err != nil {
+						t.Fatalf("exec: error setting up container: %v", err)
+					}
+					defer cleanup()
 
-			if test.success {
-				if err != nil {
-					t.Errorf("exec: %s, error running container: %v", test.path, err)
-				}
-				if ws.ExitStatus() != 0 {
-					t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0)
-				}
-			} else {
-				if err == nil {
-					t.Errorf("exec: %s, got: no error, want: error", test.path)
-				}
+					args := Args{
+						ID:        testutil.RandomContainerID(),
+						Spec:      spec,
+						BundleDir: bundleDir,
+						Attached:  true,
+					}
+					ws, err := Run(conf, args)
+
+					if test.success {
+						if err != nil {
+							t.Errorf("exec: error running container: %v", err)
+						}
+						if ws.ExitStatus() != 0 {
+							t.Errorf("exec: got exit status %v want %v", ws.ExitStatus(), 0)
+						}
+					} else {
+						if err == nil {
+							t.Errorf("exec: got: no error, want: error")
+						}
+					}
+				})
 			}
-		}
+		})
 	}
 }
 
 // Test the we can retrieve the application exit status from the container.
 func TestAppExitStatus(t *testing.T) {
+	doAppExitStatus(t, false)
+}
+
+// This is TestAppExitStatus for VFSv2.
+func TestAppExitStatusVFS2(t *testing.T) {
+	doAppExitStatus(t, true)
+}
+
+func doAppExitStatus(t *testing.T, vfs2 bool) {
 	// First container will succeed.
 	succSpec := testutil.NewSpecWithArgs("true")
-	conf := testutil.TestConfig()
-	rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf)
+	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
+	_, bundleDir, cleanup, err := testutil.SetupContainer(succSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      succSpec,
 		BundleDir: bundleDir,
 		Attached:  true,
@@ -565,15 +585,14 @@ func TestAppExitStatus(t *testing.T) {
 	wantStatus := 123
 	errSpec := testutil.NewSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus))
 
-	rootDir2, bundleDir2, err := testutil.SetupContainer(errSpec, conf)
+	_, bundleDir2, cleanup2, err := testutil.SetupContainer(errSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir2)
-	defer os.RemoveAll(bundleDir2)
+	defer cleanup2()
 
 	args2 := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      errSpec,
 		BundleDir: bundleDir2,
 		Attached:  true,
@@ -589,166 +608,163 @@ func TestAppExitStatus(t *testing.T) {
 
 // TestExec verifies that a container can exec a new program.
 func TestExec(t *testing.T) {
-	for _, conf := range configs(overlay) {
-		t.Logf("Running test with conf: %+v", conf)
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			const uid = 343
+			spec := testutil.NewSpecWithArgs("sleep", "100")
 
-		const uid = 343
-		spec := testutil.NewSpecWithArgs("sleep", "100")
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+				{
+					UID:     uid,
+					PID:     2,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{2},
+				},
+			}
 
-		// expectedPL lists the expected process state of the container.
-		expectedPL := []*control.Process{
-			{
-				UID:     0,
-				PID:     1,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{1},
-			},
-			{
-				UID:     uid,
-				PID:     2,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{2},
-			},
-		}
+			// Verify that "sleep 100" is running.
+			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+				t.Error(err)
+			}
 
-		// Verify that "sleep 100" is running.
-		if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
-			t.Error(err)
-		}
+			execArgs := &control.ExecArgs{
+				Filename:         "/bin/sleep",
+				Argv:             []string{"/bin/sleep", "5"},
+				WorkingDirectory: "/",
+				KUID:             uid,
+			}
 
-		execArgs := &control.ExecArgs{
-			Filename:         "/bin/sleep",
-			Argv:             []string{"/bin/sleep", "5"},
-			WorkingDirectory: "/",
-			KUID:             uid,
-		}
+			// Verify that "sleep 100" and "sleep 5" are running
+			// after exec.  First, start running exec (whick
+			// blocks).
+			ch := make(chan error)
+			go func() {
+				exitStatus, err := cont.executeSync(execArgs)
+				if err != nil {
+					ch <- err
+				} else if exitStatus != 0 {
+					ch <- fmt.Errorf("failed with exit status: %v", exitStatus)
+				} else {
+					ch <- nil
+				}
+			}()
 
-		// Verify that "sleep 100" and "sleep 5" are running after exec.
-		// First, start running exec (whick blocks).
-		status := make(chan error, 1)
-		go func() {
-			exitStatus, err := cont.executeSync(execArgs)
-			if err != nil {
-				log.Debugf("error executing: %v", err)
-				status <- err
-			} else if exitStatus != 0 {
-				log.Debugf("bad status: %d", exitStatus)
-				status <- fmt.Errorf("failed with exit status: %v", exitStatus)
-			} else {
-				status <- nil
+			if err := waitForProcessList(cont, expectedPL); err != nil {
+				t.Fatalf("error waiting for processes: %v", err)
 			}
-		}()
 
-		if err := waitForProcessList(cont, expectedPL); err != nil {
-			t.Fatal(err)
-		}
-
-		// Ensure that exec finished without error.
-		select {
-		case <-time.After(10 * time.Second):
-			t.Fatalf("container timed out waiting for exec to finish.")
-		case st := <-status:
-			if st != nil {
-				t.Errorf("container failed to exec %v: %v", args, err)
+			// Ensure that exec finished without error.
+			select {
+			case <-time.After(10 * time.Second):
+				t.Fatalf("container timed out waiting for exec to finish.")
+			case err := <-ch:
+				if err != nil {
+					t.Errorf("container failed to exec %v: %v", args, err)
+				}
 			}
-		}
+		})
 	}
 }
 
 // TestKillPid verifies that we can signal individual exec'd processes.
 func TestKillPid(t *testing.T) {
-	for _, conf := range configs(overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		app, err := testutil.FindFile("runsc/container/test_app/test_app")
-		if err != nil {
-			t.Fatal("error finding test_app:", err)
-		}
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			app, err := testutil.FindFile("test/cmd/test_app/test_app")
+			if err != nil {
+				t.Fatal("error finding test_app:", err)
+			}
 
-		const nProcs = 4
-		spec := testutil.NewSpecWithArgs(app, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true")
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			const nProcs = 4
+			spec := testutil.NewSpecWithArgs(app, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true")
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Verify that all processes are running.
-		if err := waitForProcessCount(cont, nProcs); err != nil {
-			t.Fatalf("timed out waiting for processes to start: %v", err)
-		}
+			// Verify that all processes are running.
+			if err := waitForProcessCount(cont, nProcs); err != nil {
+				t.Fatalf("timed out waiting for processes to start: %v", err)
+			}
 
-		// Kill the child process with the largest PID.
-		procs, err := cont.Processes()
-		if err != nil {
-			t.Fatalf("failed to get process list: %v", err)
-		}
-		var pid int32
-		for _, p := range procs {
-			if pid < int32(p.PID) {
-				pid = int32(p.PID)
+			// Kill the child process with the largest PID.
+			procs, err := cont.Processes()
+			if err != nil {
+				t.Fatalf("failed to get process list: %v", err)
+			}
+			var pid int32
+			for _, p := range procs {
+				if pid < int32(p.PID) {
+					pid = int32(p.PID)
+				}
+			}
+			if err := cont.SignalProcess(syscall.SIGKILL, pid); err != nil {
+				t.Fatalf("failed to signal process %d: %v", pid, err)
 			}
-		}
-		if err := cont.SignalProcess(syscall.SIGKILL, pid); err != nil {
-			t.Fatalf("failed to signal process %d: %v", pid, err)
-		}
 
-		// Verify that one process is gone.
-		if err := waitForProcessCount(cont, nProcs-1); err != nil {
-			t.Fatal(err)
-		}
+			// Verify that one process is gone.
+			if err := waitForProcessCount(cont, nProcs-1); err != nil {
+				t.Fatalf("error waiting for processes: %v", err)
+			}
 
-		procs, err = cont.Processes()
-		if err != nil {
-			t.Fatalf("failed to get process list: %v", err)
-		}
-		for _, p := range procs {
-			if pid == int32(p.PID) {
-				t.Fatalf("pid %d is still alive, which should be killed", pid)
+			procs, err = cont.Processes()
+			if err != nil {
+				t.Fatalf("failed to get process list: %v", err)
 			}
-		}
+			for _, p := range procs {
+				if pid == int32(p.PID) {
+					t.Fatalf("pid %d is still alive, which should be killed", pid)
+				}
+			}
+		})
 	}
 }
 
@@ -759,160 +775,160 @@ func TestKillPid(t *testing.T) {
 // be the next consecutive number after the last number from the checkpointed container.
 func TestCheckpointRestore(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
-	for _, conf := range configs(noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir failed: %v", err)
-		}
-		if err := os.Chmod(dir, 0777); err != nil {
-			t.Fatalf("error chmoding file: %q, %v", dir, err)
-		}
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
+			if err := os.Chmod(dir, 0777); err != nil {
+				t.Fatalf("error chmoding file: %q, %v", dir, err)
+			}
 
-		outputPath := filepath.Join(dir, "output")
-		outputFile, err := createWriteableOutputFile(outputPath)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile.Close()
+			outputPath := filepath.Join(dir, "output")
+			outputFile, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile.Close()
 
-		script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath)
-		spec := testutil.NewSpecWithArgs("bash", "-c", script)
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath)
+			spec := testutil.NewSpecWithArgs("bash", "-c", script)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Set the image path, which is where the checkpoint image will be saved.
-		imagePath := filepath.Join(dir, "test-image-file")
+			// Set the image path, which is where the checkpoint image will be saved.
+			imagePath := filepath.Join(dir, "test-image-file")
 
-		// Create the image file and open for writing.
-		file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
-		if err != nil {
-			t.Fatalf("error opening new file at imagePath: %v", err)
-		}
-		defer file.Close()
+			// Create the image file and open for writing.
+			file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
+			if err != nil {
+				t.Fatalf("error opening new file at imagePath: %v", err)
+			}
+			defer file.Close()
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		// Checkpoint running container; save state into new file.
-		if err := cont.Checkpoint(file); err != nil {
-			t.Fatalf("error checkpointing container to empty file: %v", err)
-		}
-		defer os.RemoveAll(imagePath)
+			// Checkpoint running container; save state into new file.
+			if err := cont.Checkpoint(file); err != nil {
+				t.Fatalf("error checkpointing container to empty file: %v", err)
+			}
+			defer os.RemoveAll(imagePath)
 
-		lastNum, err := readOutputNum(outputPath, -1)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			lastNum, err := readOutputNum(outputPath, -1)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Delete and recreate file before restoring.
-		if err := os.Remove(outputPath); err != nil {
-			t.Fatalf("error removing file")
-		}
-		outputFile2, err := createWriteableOutputFile(outputPath)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile2.Close()
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file")
+			}
+			outputFile2, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile2.Close()
 
-		// Restore into a new container.
-		args2 := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont2, err := New(conf, args2)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont2.Destroy()
+			// Restore into a new container.
+			args2 := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont2, err := New(conf, args2)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont2.Destroy()
 
-		if err := cont2.Restore(spec, conf, imagePath); err != nil {
-			t.Fatalf("error restoring container: %v", err)
-		}
+			if err := cont2.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile2); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile2); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		firstNum, err := readOutputNum(outputPath, 0)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			firstNum, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Check that lastNum is one less than firstNum and that the container picks
-		// up from where it left off.
-		if lastNum+1 != firstNum {
-			t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum)
-		}
-		cont2.Destroy()
+			// Check that lastNum is one less than firstNum and that the container picks
+			// up from where it left off.
+			if lastNum+1 != firstNum {
+				t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum)
+			}
+			cont2.Destroy()
 
-		// Restore into another container!
-		// Delete and recreate file before restoring.
-		if err := os.Remove(outputPath); err != nil {
-			t.Fatalf("error removing file")
-		}
-		outputFile3, err := createWriteableOutputFile(outputPath)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile3.Close()
+			// Restore into another container!
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file")
+			}
+			outputFile3, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile3.Close()
 
-		// Restore into a new container.
-		args3 := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont3, err := New(conf, args3)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont3.Destroy()
+			// Restore into a new container.
+			args3 := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont3, err := New(conf, args3)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont3.Destroy()
 
-		if err := cont3.Restore(spec, conf, imagePath); err != nil {
-			t.Fatalf("error restoring container: %v", err)
-		}
+			if err := cont3.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile3); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile3); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		firstNum2, err := readOutputNum(outputPath, 0)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			firstNum2, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Check that lastNum is one less than firstNum and that the container picks
-		// up from where it left off.
-		if lastNum+1 != firstNum2 {
-			t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2)
-		}
-		cont3.Destroy()
+			// Check that lastNum is one less than firstNum and that the container picks
+			// up from where it left off.
+			if lastNum+1 != firstNum2 {
+				t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2)
+			}
+			cont3.Destroy()
+		})
 	}
 }
 
@@ -920,135 +936,134 @@ func TestCheckpointRestore(t *testing.T) {
 // with filesystem Unix Domain Socket use.
 func TestUnixDomainSockets(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
-	for _, conf := range configs(noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		// UDS path is limited to 108 chars for compatibility with older systems.
-		// Use '/tmp' (instead of testutil.TmpDir) to ensure the size limit is
-		// not exceeded. Assumes '/tmp' exists in the system.
-		dir, err := ioutil.TempDir("/tmp", "uds-test")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir failed: %v", err)
-		}
-		defer os.RemoveAll(dir)
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			// UDS path is limited to 108 chars for compatibility with older systems.
+			// Use '/tmp' (instead of testutil.TmpDir) to ensure the size limit is
+			// not exceeded. Assumes '/tmp' exists in the system.
+			dir, err := ioutil.TempDir("/tmp", "uds-test")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
 
-		outputPath := filepath.Join(dir, "uds_output")
-		outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile.Close()
+			outputPath := filepath.Join(dir, "uds_output")
+			outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile.Close()
 
-		app, err := testutil.FindFile("runsc/container/test_app/test_app")
-		if err != nil {
-			t.Fatal("error finding test_app:", err)
-		}
+			app, err := testutil.FindFile("test/cmd/test_app/test_app")
+			if err != nil {
+				t.Fatal("error finding test_app:", err)
+			}
 
-		socketPath := filepath.Join(dir, "uds_socket")
-		defer os.Remove(socketPath)
+			socketPath := filepath.Join(dir, "uds_socket")
+			defer os.Remove(socketPath)
 
-		spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath)
-		spec.Process.User = specs.User{
-			UID: uint32(os.Getuid()),
-			GID: uint32(os.Getgid()),
-		}
-		spec.Mounts = []specs.Mount{{
-			Type:        "bind",
-			Destination: dir,
-			Source:      dir,
-		}}
+			spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath)
+			spec.Process.User = specs.User{
+				UID: uint32(os.Getuid()),
+				GID: uint32(os.Getgid()),
+			}
+			spec.Mounts = []specs.Mount{{
+				Type:        "bind",
+				Destination: dir,
+				Source:      dir,
+			}}
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Set the image path, the location where the checkpoint image will be saved.
-		imagePath := filepath.Join(dir, "test-image-file")
+			// Set the image path, the location where the checkpoint image will be saved.
+			imagePath := filepath.Join(dir, "test-image-file")
 
-		// Create the image file and open for writing.
-		file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
-		if err != nil {
-			t.Fatalf("error opening new file at imagePath: %v", err)
-		}
-		defer file.Close()
-		defer os.RemoveAll(imagePath)
+			// Create the image file and open for writing.
+			file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
+			if err != nil {
+				t.Fatalf("error opening new file at imagePath: %v", err)
+			}
+			defer file.Close()
+			defer os.RemoveAll(imagePath)
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		// Checkpoint running container; save state into new file.
-		if err := cont.Checkpoint(file); err != nil {
-			t.Fatalf("error checkpointing container to empty file: %v", err)
-		}
+			// Checkpoint running container; save state into new file.
+			if err := cont.Checkpoint(file); err != nil {
+				t.Fatalf("error checkpointing container to empty file: %v", err)
+			}
 
-		// Read last number outputted before checkpoint.
-		lastNum, err := readOutputNum(outputPath, -1)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			// Read last number outputted before checkpoint.
+			lastNum, err := readOutputNum(outputPath, -1)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Delete and recreate file before restoring.
-		if err := os.Remove(outputPath); err != nil {
-			t.Fatalf("error removing file")
-		}
-		outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
-		if err != nil {
-			t.Fatalf("error creating output file: %v", err)
-		}
-		defer outputFile2.Close()
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file")
+			}
+			outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile2.Close()
 
-		// Restore into a new container.
-		argsRestore := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		contRestore, err := New(conf, argsRestore)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer contRestore.Destroy()
+			// Restore into a new container.
+			argsRestore := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			contRestore, err := New(conf, argsRestore)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer contRestore.Destroy()
 
-		if err := contRestore.Restore(spec, conf, imagePath); err != nil {
-			t.Fatalf("error restoring container: %v", err)
-		}
+			if err := contRestore.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
 
-		// Wait until application has ran.
-		if err := waitForFileNotEmpty(outputFile2); err != nil {
-			t.Fatalf("Failed to wait for output file: %v", err)
-		}
+			// Wait until application has ran.
+			if err := waitForFileNotEmpty(outputFile2); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
 
-		// Read first number outputted after restore.
-		firstNum, err := readOutputNum(outputPath, 0)
-		if err != nil {
-			t.Fatalf("error with outputFile: %v", err)
-		}
+			// Read first number outputted after restore.
+			firstNum, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
 
-		// Check that lastNum is one less than firstNum.
-		if lastNum+1 != firstNum {
-			t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum)
-		}
-		contRestore.Destroy()
+			// Check that lastNum is one less than firstNum.
+			if lastNum+1 != firstNum {
+				t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum)
+			}
+			contRestore.Destroy()
+		})
 	}
 }
 
@@ -1058,10 +1073,8 @@ func TestUnixDomainSockets(t *testing.T) {
 // recreated. Then it resumes the container, verify that the file gets created
 // again.
 func TestPauseResume(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
-		t.Run(fmt.Sprintf("conf: %+v", conf), func(t *testing.T) {
-			t.Logf("Running test with conf: %+v", conf)
-
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
 			if err != nil {
 				t.Fatalf("error creating temp dir: %v", err)
@@ -1072,16 +1085,15 @@ func TestPauseResume(t *testing.T) {
 			script := fmt.Sprintf("while [[ true ]]; do touch %q; sleep 0.1; done", running)
 			spec := testutil.NewSpecWithArgs("/bin/bash", "-c", script)
 
-			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
-			defer os.RemoveAll(rootDir)
-			defer os.RemoveAll(bundleDir)
+			defer cleanup()
 
 			// Create and start the container.
 			args := Args{
-				ID:        testutil.UniqueContainerID(),
+				ID:        testutil.RandomContainerID(),
 				Spec:      spec,
 				BundleDir: bundleDir,
 			}
@@ -1139,17 +1151,16 @@ func TestPauseResume(t *testing.T) {
 // occurs given the correct state.
 func TestPauseResumeStatus(t *testing.T) {
 	spec := testutil.NewSpecWithArgs("sleep", "20")
-	conf := testutil.TestConfig()
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -1205,359 +1216,365 @@ func TestCapabilities(t *testing.T) {
 	uid := auth.KUID(os.Getuid() + 1)
 	gid := auth.KGID(os.Getgid() + 1)
 
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		spec := testutil.NewSpecWithArgs("sleep", "100")
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+			rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer cont.Destroy()
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// expectedPL lists the expected process state of the container.
-		expectedPL := []*control.Process{
-			{
-				UID:     0,
-				PID:     1,
-				PPID:    0,
-				C:       0,
-				Cmd:     "sleep",
-				Threads: []kernel.ThreadID{1},
-			},
-			{
-				UID:     uid,
-				PID:     2,
-				PPID:    0,
-				C:       0,
-				Cmd:     "exe",
-				Threads: []kernel.ThreadID{2},
-			},
-		}
-		if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
-			t.Fatalf("Failed to wait for sleep to start, err: %v", err)
-		}
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+				{
+					UID:     uid,
+					PID:     2,
+					PPID:    0,
+					C:       0,
+					Cmd:     "exe",
+					Threads: []kernel.ThreadID{2},
+				},
+			}
+			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+				t.Fatalf("Failed to wait for sleep to start, err: %v", err)
+			}
 
-		// Create an executable that can't be run with the specified UID:GID.
-		// This shouldn't be callable within the container until we add the
-		// CAP_DAC_OVERRIDE capability to skip the access check.
-		exePath := filepath.Join(rootDir, "exe")
-		if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil {
-			t.Fatalf("couldn't create executable: %v", err)
-		}
-		defer os.Remove(exePath)
-
-		// Need to traverse the intermediate directory.
-		os.Chmod(rootDir, 0755)
-
-		execArgs := &control.ExecArgs{
-			Filename:         exePath,
-			Argv:             []string{exePath},
-			WorkingDirectory: "/",
-			KUID:             uid,
-			KGID:             gid,
-			Capabilities:     &auth.TaskCapabilities{},
-		}
+			// Create an executable that can't be run with the specified UID:GID.
+			// This shouldn't be callable within the container until we add the
+			// CAP_DAC_OVERRIDE capability to skip the access check.
+			exePath := filepath.Join(rootDir, "exe")
+			if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil {
+				t.Fatalf("couldn't create executable: %v", err)
+			}
+			defer os.Remove(exePath)
+
+			// Need to traverse the intermediate directory.
+			os.Chmod(rootDir, 0755)
+
+			execArgs := &control.ExecArgs{
+				Filename:         exePath,
+				Argv:             []string{exePath},
+				WorkingDirectory: "/",
+				KUID:             uid,
+				KGID:             gid,
+				Capabilities:     &auth.TaskCapabilities{},
+			}
 
-		// "exe" should fail because we don't have the necessary permissions.
-		if _, err := cont.executeSync(execArgs); err == nil {
-			t.Fatalf("container executed without error, but an error was expected")
-		}
+			// "exe" should fail because we don't have the necessary permissions.
+			if _, err := cont.executeSync(execArgs); err == nil {
+				t.Fatalf("container executed without error, but an error was expected")
+			}
 
-		// Now we run with the capability enabled and should succeed.
-		execArgs.Capabilities = &auth.TaskCapabilities{
-			EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
-		}
-		// "exe" should not fail this time.
-		if _, err := cont.executeSync(execArgs); err != nil {
-			t.Fatalf("container failed to exec %v: %v", args, err)
-		}
+			// Now we run with the capability enabled and should succeed.
+			execArgs.Capabilities = &auth.TaskCapabilities{
+				EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+			}
+			// "exe" should not fail this time.
+			if _, err := cont.executeSync(execArgs); err != nil {
+				t.Fatalf("container failed to exec %v: %v", args, err)
+			}
+		})
 	}
 }
 
 // TestRunNonRoot checks that sandbox can be configured when running as
 // non-privileged user.
 func TestRunNonRoot(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		spec := testutil.NewSpecWithArgs("/bin/true")
-
-		// Set a random user/group with no access to "blocked" dir.
-		spec.Process.User.UID = 343
-		spec.Process.User.GID = 2401
-		spec.Process.Capabilities = nil
+	for name, conf := range configsWithVFS2(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("/bin/true")
+
+			// Set a random user/group with no access to "blocked" dir.
+			spec.Process.User.UID = 343
+			spec.Process.User.GID = 2401
+			spec.Process.Capabilities = nil
+
+			// User running inside container can't list '$TMP/blocked' and would fail to
+			// mount it.
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			if err := os.Chmod(dir, 0700); err != nil {
+				t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
+			}
+			dir = path.Join(dir, "test")
+			if err := os.Mkdir(dir, 0755); err != nil {
+				t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
+			}
 
-		// User running inside container can't list '$TMP/blocked' and would fail to
-		// mount it.
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
-		if err := os.Chmod(dir, 0700); err != nil {
-			t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
-		}
-		dir = path.Join(dir, "test")
-		if err := os.Mkdir(dir, 0755); err != nil {
-			t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
-		}
+			src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
 
-		src, err := ioutil.TempDir(testutil.TmpDir(), "src")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      src,
+				Type:        "bind",
+			})
 
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: dir,
-			Source:      src,
-			Type:        "bind",
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("error running sandbox: %v", err)
+			}
 		})
-
-		if err := run(spec, conf); err != nil {
-			t.Fatalf("error running sandbox: %v", err)
-		}
 	}
 }
 
 // TestMountNewDir checks that runsc will create destination directory if it
 // doesn't exit.
 func TestMountNewDir(t *testing.T) {
-	for _, conf := range configs(overlay) {
-		t.Logf("Running test with conf: %+v", conf)
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			root, err := ioutil.TempDir(testutil.TmpDir(), "root")
+			if err != nil {
+				t.Fatal("ioutil.TempDir() failed:", err)
+			}
 
-		root, err := ioutil.TempDir(testutil.TmpDir(), "root")
-		if err != nil {
-			t.Fatal("ioutil.TempDir() failed:", err)
-		}
+			srcDir := path.Join(root, "src", "dir", "anotherdir")
+			if err := os.MkdirAll(srcDir, 0755); err != nil {
+				t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err)
+			}
 
-		srcDir := path.Join(root, "src", "dir", "anotherdir")
-		if err := os.MkdirAll(srcDir, 0755); err != nil {
-			t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err)
-		}
+			mountDir := path.Join(root, "dir", "anotherdir")
 
-		mountDir := path.Join(root, "dir", "anotherdir")
+			spec := testutil.NewSpecWithArgs("/bin/ls", mountDir)
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: mountDir,
+				Source:      srcDir,
+				Type:        "bind",
+			})
 
-		spec := testutil.NewSpecWithArgs("/bin/ls", mountDir)
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: mountDir,
-			Source:      srcDir,
-			Type:        "bind",
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("error running sandbox: %v", err)
+			}
 		})
-
-		if err := run(spec, conf); err != nil {
-			t.Fatalf("error running sandbox: %v", err)
-		}
 	}
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for _, conf := range configs(overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
-		spec.Root.Readonly = true
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
+			spec.Root.Readonly = true
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create, start and wait for the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create, start and wait for the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		ws, err := c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-			t.Fatalf("container failed, waitStatus: %v", ws)
-		}
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("container failed, waitStatus: %v", ws)
+			}
+		})
 	}
 }
 
 func TestUIDMap(t *testing.T) {
-	for _, conf := range configs(noOverlay...) {
-		t.Logf("Running test with conf: %+v", conf)
-		testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
-		if err != nil {
-			t.Fatal(err)
-		}
-		defer os.RemoveAll(testDir)
-		testFile := path.Join(testDir, "testfile")
-
-		spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile")
-		uid := os.Getuid()
-		gid := os.Getgid()
-		spec.Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{Type: specs.UserNamespace},
-				{Type: specs.PIDNamespace},
-				{Type: specs.MountNamespace},
-			},
-			UIDMappings: []specs.LinuxIDMapping{
-				{
-					ContainerID: 0,
-					HostID:      uint32(uid),
-					Size:        1,
+	for name, conf := range configsWithVFS2(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(testDir)
+			testFile := path.Join(testDir, "testfile")
+
+			spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile")
+			uid := os.Getuid()
+			gid := os.Getgid()
+			spec.Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{Type: specs.UserNamespace},
+					{Type: specs.PIDNamespace},
+					{Type: specs.MountNamespace},
 				},
-			},
-			GIDMappings: []specs.LinuxIDMapping{
-				{
-					ContainerID: 0,
-					HostID:      uint32(gid),
-					Size:        1,
+				UIDMappings: []specs.LinuxIDMapping{
+					{
+						ContainerID: 0,
+						HostID:      uint32(uid),
+						Size:        1,
+					},
 				},
-			},
-		}
+				GIDMappings: []specs.LinuxIDMapping{
+					{
+						ContainerID: 0,
+						HostID:      uint32(gid),
+						Size:        1,
+					},
+				},
+			}
 
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: "/tmp",
-			Source:      testDir,
-			Type:        "bind",
-		})
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: "/tmp",
+				Source:      testDir,
+				Type:        "bind",
+			})
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create, start and wait for the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create, start and wait for the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		ws, err := c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if !ws.Exited() || ws.ExitStatus() != 0 {
-			t.Fatalf("container failed, waitStatus: %v", ws)
-		}
-		st := syscall.Stat_t{}
-		if err := syscall.Stat(testFile, &st); err != nil {
-			t.Fatalf("error stat /testfile: %v", err)
-		}
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !ws.Exited() || ws.ExitStatus() != 0 {
+				t.Fatalf("container failed, waitStatus: %v", ws)
+			}
+			st := syscall.Stat_t{}
+			if err := syscall.Stat(testFile, &st); err != nil {
+				t.Fatalf("error stat /testfile: %v", err)
+			}
 
-		if st.Uid != uint32(uid) || st.Gid != uint32(gid) {
-			t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid)
-		}
+			if st.Uid != uint32(uid) || st.Gid != uint32(gid) {
+				t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid)
+			}
+		})
 	}
 }
 
 func TestReadonlyMount(t *testing.T) {
-	for _, conf := range configs(overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
-		spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: dir,
-			Source:      dir,
-			Type:        "bind",
-			Options:     []string{"ro"},
-		})
-		spec.Root.Readonly = false
-
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
+			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      dir,
+				Type:        "bind",
+				Options:     []string{"ro"},
+			})
+			spec.Root.Readonly = false
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create, start and wait for the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create, start and wait for the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		ws, err := c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-			t.Fatalf("container failed, waitStatus: %v", ws)
-		}
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("container failed, waitStatus: %v", ws)
+			}
+		})
 	}
 }
 
 // TestAbbreviatedIDs checks that runsc supports using abbreviated container
 // IDs in place of full IDs.
 func TestAbbreviatedIDs(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	doAbbreviatedIDsTest(t, false)
+}
+
+func TestAbbreviatedIDsVFS2(t *testing.T) {
+	doAbbreviatedIDsTest(t, true)
+}
+
+func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
+	conf.VFS2 = vfs2
 
 	cids := []string{
-		"foo-" + testutil.UniqueContainerID(),
-		"bar-" + testutil.UniqueContainerID(),
-		"baz-" + testutil.UniqueContainerID(),
+		"foo-" + testutil.RandomContainerID(),
+		"bar-" + testutil.RandomContainerID(),
+		"baz-" + testutil.RandomContainerID(),
 	}
 	for _, cid := range cids {
 		spec := testutil.NewSpecWithArgs("sleep", "100")
-		bundleDir, err := testutil.SetupBundleDir(spec)
+		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
 		}
-		defer os.RemoveAll(bundleDir)
+		defer cleanup()
 
 		// Create and start the container.
 		args := Args{
@@ -1600,18 +1617,27 @@ func TestAbbreviatedIDs(t *testing.T) {
 }
 
 func TestGoferExits(t *testing.T) {
+	doGoferExitTest(t, false)
+}
+
+func TestGoferExitsVFS2(t *testing.T) {
+	doGoferExitTest(t, true)
+}
+
+func doGoferExitTest(t *testing.T, vfs2 bool) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
-	conf := testutil.TestConfig()
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -1640,7 +1666,7 @@ func TestGoferExits(t *testing.T) {
 }
 
 func TestRootNotMount(t *testing.T) {
-	appSym, err := testutil.FindFile("runsc/container/test_app/test_app")
+	appSym, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -1670,27 +1696,26 @@ func TestRootNotMount(t *testing.T) {
 	spec.Root.Readonly = true
 	spec.Mounts = nil
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	if err := run(spec, conf); err != nil {
 		t.Fatalf("error running sandbox: %v", err)
 	}
 }
 
 func TestUserLog(t *testing.T) {
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
 
 	// sched_rr_get_interval = 148 - not implemented in gvisor.
 	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148")
-	conf := testutil.TestConfig()
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	dir, err := ioutil.TempDir(testutil.TmpDir(), "user_log_test")
 	if err != nil {
@@ -1700,7 +1725,7 @@ func TestUserLog(t *testing.T) {
 
 	// Create, start and wait for the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 		UserLog:   userLog,
@@ -1718,78 +1743,85 @@ func TestUserLog(t *testing.T) {
 	if err != nil {
 		t.Fatalf("error opening user log file %q: %v", userLog, err)
 	}
-	if want := "Unsupported syscall: sched_rr_get_interval"; !strings.Contains(string(out), want) {
+	if want := "Unsupported syscall sched_rr_get_interval("; !strings.Contains(string(out), want) {
 		t.Errorf("user log file doesn't contain %q, out: %s", want, string(out))
 	}
 }
 
 func TestWaitOnExitedSandbox(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		// Run a shell that sleeps for 1 second and then exits with a
-		// non-zero code.
-		const wantExit = 17
-		cmd := fmt.Sprintf("sleep 1; exit %d", wantExit)
-		spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd)
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			// Run a shell that sleeps for 1 second and then exits with a
+			// non-zero code.
+			const wantExit = 17
+			cmd := fmt.Sprintf("sleep 1; exit %d", wantExit)
+			spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		// Create and Start the container.
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		c, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		defer c.Destroy()
-		if err := c.Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			// Create and Start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		// Wait on the sandbox. This will make an RPC to the sandbox
-		// and get the actual exit status of the application.
-		ws, err := c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if got := ws.ExitStatus(); got != wantExit {
-			t.Errorf("got exit status %d, want %d", got, wantExit)
-		}
+			// Wait on the sandbox. This will make an RPC to the sandbox
+			// and get the actual exit status of the application.
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if got := ws.ExitStatus(); got != wantExit {
+				t.Errorf("got exit status %d, want %d", got, wantExit)
+			}
 
-		// Now the sandbox has exited, but the zombie sandbox process
-		// still exists. Calling Wait() now will return the sandbox
-		// exit status.
-		ws, err = c.Wait()
-		if err != nil {
-			t.Fatalf("error waiting on container: %v", err)
-		}
-		if got := ws.ExitStatus(); got != wantExit {
-			t.Errorf("got exit status %d, want %d", got, wantExit)
-		}
+			// Now the sandbox has exited, but the zombie sandbox process
+			// still exists. Calling Wait() now will return the sandbox
+			// exit status.
+			ws, err = c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if got := ws.ExitStatus(); got != wantExit {
+				t.Errorf("got exit status %d, want %d", got, wantExit)
+			}
+		})
 	}
 }
 
 func TestDestroyNotStarted(t *testing.T) {
+	doDestroyNotStartedTest(t, false)
+}
+
+func TestDestroyNotStartedVFS2(t *testing.T) {
+	doDestroyNotStartedTest(t, true)
+}
+
+func doDestroyNotStartedTest(t *testing.T, vfs2 bool) {
 	spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
-	conf := testutil.TestConfig()
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create the container and check that it can be destroyed.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -1804,19 +1836,27 @@ func TestDestroyNotStarted(t *testing.T) {
 
 // TestDestroyStarting attempts to force a race between start and destroy.
 func TestDestroyStarting(t *testing.T) {
+	doDestroyNotStartedTest(t, false)
+}
+
+func TestDestroyStartedVFS2(t *testing.T) {
+	doDestroyNotStartedTest(t, true)
+}
+
+func doDestroyStartingTest(t *testing.T, vfs2 bool) {
 	for i := 0; i < 10; i++ {
 		spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
-		conf := testutil.TestConfig()
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+		conf := testutil.TestConfig(t)
+		conf.VFS2 = vfs2
+		rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
 		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+		defer cleanup()
 
 		// Create the container and check that it can be destroyed.
 		args := Args{
-			ID:        testutil.UniqueContainerID(),
+			ID:        testutil.RandomContainerID(),
 			Spec:      spec,
 			BundleDir: bundleDir,
 		}
@@ -1851,23 +1891,23 @@ func TestDestroyStarting(t *testing.T) {
 }
 
 func TestCreateWorkingDir(t *testing.T) {
-	for _, conf := range configs(overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
-		dir := path.Join(tmpDir, "new/working/dir")
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			dir := path.Join(tmpDir, "new/working/dir")
 
-		// touch will fail if the directory doesn't exist.
-		spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-		spec.Process.Cwd = dir
-		spec.Root.Readonly = true
+			// touch will fail if the directory doesn't exist.
+			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+			spec.Process.Cwd = dir
+			spec.Root.Readonly = true
 
-		if err := run(spec, conf); err != nil {
-			t.Fatalf("Error running container: %v", err)
-		}
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("Error running container: %v", err)
+			}
+		})
 	}
 }
 
@@ -1924,16 +1964,15 @@ func TestMountPropagation(t *testing.T) {
 		},
 	}
 
-	conf := testutil.TestConfig()
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -1975,87 +2014,87 @@ func TestMountPropagation(t *testing.T) {
 }
 
 func TestMountSymlink(t *testing.T) {
-	for _, conf := range configs(overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
 
-		source := path.Join(dir, "source")
-		target := path.Join(dir, "target")
-		for _, path := range []string{source, target} {
-			if err := os.MkdirAll(path, 0777); err != nil {
-				t.Fatalf("os.MkdirAll(): %v", err)
+			source := path.Join(dir, "source")
+			target := path.Join(dir, "target")
+			for _, path := range []string{source, target} {
+				if err := os.MkdirAll(path, 0777); err != nil {
+					t.Fatalf("os.MkdirAll(): %v", err)
+				}
 			}
-		}
-		f, err := os.Create(path.Join(source, "file"))
-		if err != nil {
-			t.Fatalf("os.Create(): %v", err)
-		}
-		f.Close()
+			f, err := os.Create(path.Join(source, "file"))
+			if err != nil {
+				t.Fatalf("os.Create(): %v", err)
+			}
+			f.Close()
 
-		link := path.Join(dir, "link")
-		if err := os.Symlink(target, link); err != nil {
-			t.Fatalf("os.Symlink(%q, %q): %v", target, link, err)
-		}
+			link := path.Join(dir, "link")
+			if err := os.Symlink(target, link); err != nil {
+				t.Fatalf("os.Symlink(%q, %q): %v", target, link, err)
+			}
 
-		spec := testutil.NewSpecWithArgs("/bin/sleep", "1000")
+			spec := testutil.NewSpecWithArgs("/bin/sleep", "1000")
 
-		// Mount to a symlink to ensure the mount code will follow it and mount
-		// at the symlink target.
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: link,
-			Source:      source,
-		})
+			// Mount to a symlink to ensure the mount code will follow it and mount
+			// at the symlink target.
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Type:        "bind",
+				Destination: link,
+				Source:      source,
+			})
 
-		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-		if err != nil {
-			t.Fatalf("error setting up container: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		defer os.RemoveAll(bundleDir)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
 
-		args := Args{
-			ID:        testutil.UniqueContainerID(),
-			Spec:      spec,
-			BundleDir: bundleDir,
-		}
-		cont, err := New(conf, args)
-		if err != nil {
-			t.Fatalf("creating container: %v", err)
-		}
-		defer cont.Destroy()
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("creating container: %v", err)
+			}
+			defer cont.Destroy()
 
-		if err := cont.Start(conf); err != nil {
-			t.Fatalf("starting container: %v", err)
-		}
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("starting container: %v", err)
+			}
 
-		// Check that symlink was resolved and mount was created where the symlink
-		// is pointing to.
-		file := path.Join(target, "file")
-		execArgs := &control.ExecArgs{
-			Filename: "/usr/bin/test",
-			Argv:     []string{"test", "-f", file},
-		}
-		if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
-			t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
-		}
+			// Check that symlink was resolved and mount was created where the symlink
+			// is pointing to.
+			file := path.Join(target, "file")
+			execArgs := &control.ExecArgs{
+				Filename: "/usr/bin/test",
+				Argv:     []string{"test", "-f", file},
+			}
+			if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+				t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
+			}
+		})
 	}
 }
 
 // Check that --net-raw disables the CAP_NET_RAW capability.
 func TestNetRaw(t *testing.T) {
 	capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10)
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
 
 	for _, enableRaw := range []bool{true, false} {
-		conf := testutil.TestConfig()
+		conf := testutil.TestConfig(t)
 		conf.EnableRaw = enableRaw
 
 		test := "--enabled"
@@ -2070,49 +2109,12 @@ func TestNetRaw(t *testing.T) {
 	}
 }
 
-// TestOverlayfsStaleRead most basic test that '--overlayfs-stale-read' works.
-func TestOverlayfsStaleRead(t *testing.T) {
-	conf := testutil.TestConfig()
-	conf.OverlayfsStaleRead = true
-
-	in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in")
-	if err != nil {
-		t.Fatalf("ioutil.TempFile() failed: %v", err)
-	}
-	defer in.Close()
-	if _, err := in.WriteString("stale data"); err != nil {
-		t.Fatalf("in.Write() failed: %v", err)
-	}
-
-	out, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.out")
-	if err != nil {
-		t.Fatalf("ioutil.TempFile() failed: %v", err)
-	}
-	defer out.Close()
-
-	const want = "foobar"
-	cmd := fmt.Sprintf("cat %q && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name())
-	spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd)
-	if err := run(spec, conf); err != nil {
-		t.Fatalf("Error running container: %v", err)
-	}
-
-	gotBytes, err := ioutil.ReadAll(out)
-	if err != nil {
-		t.Fatalf("out.Read() failed: %v", err)
-	}
-	got := strings.TrimSpace(string(gotBytes))
-	if want != got {
-		t.Errorf("Wrong content in out file, got: %q. want: %q", got, want)
-	}
-}
-
 // TestTTYField checks TTY field returned by container.Processes().
 func TestTTYField(t *testing.T) {
 	stop := testutil.StartReaper()
 	defer stop()
 
-	testApp, err := testutil.FindFile("runsc/container/test_app/test_app")
+	testApp, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -2135,64 +2137,70 @@ func TestTTYField(t *testing.T) {
 	}
 
 	for _, test := range testCases {
-		t.Run(test.name, func(t *testing.T) {
-			conf := testutil.TestConfig()
-
-			// We will run /bin/sleep, possibly with an open TTY.
-			cmd := []string{"/bin/sleep", "10000"}
-			if test.useTTY {
-				// Run inside the "pty-runner".
-				cmd = append([]string{testApp, "pty-runner"}, cmd...)
+		for _, vfs2 := range []bool{false, true} {
+			name := test.name
+			if vfs2 {
+				name += "-vfs2"
 			}
+			t.Run(name, func(t *testing.T) {
+				conf := testutil.TestConfig(t)
+				conf.VFS2 = vfs2
+
+				// We will run /bin/sleep, possibly with an open TTY.
+				cmd := []string{"/bin/sleep", "10000"}
+				if test.useTTY {
+					// Run inside the "pty-runner".
+					cmd = append([]string{testApp, "pty-runner"}, cmd...)
+				}
 
-			spec := testutil.NewSpecWithArgs(cmd...)
-			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
-			if err != nil {
-				t.Fatalf("error setting up container: %v", err)
-			}
-			defer os.RemoveAll(rootDir)
-			defer os.RemoveAll(bundleDir)
-
-			// Create and start the container.
-			args := Args{
-				ID:        testutil.UniqueContainerID(),
-				Spec:      spec,
-				BundleDir: bundleDir,
-			}
-			c, err := New(conf, args)
-			if err != nil {
-				t.Fatalf("error creating container: %v", err)
-			}
-			defer c.Destroy()
-			if err := c.Start(conf); err != nil {
-				t.Fatalf("error starting container: %v", err)
-			}
+				spec := testutil.NewSpecWithArgs(cmd...)
+				_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+				if err != nil {
+					t.Fatalf("error setting up container: %v", err)
+				}
+				defer cleanup()
 
-			// Wait for sleep to be running, and check the TTY
-			// field.
-			var gotTTYField string
-			cb := func() error {
-				ps, err := c.Processes()
+				// Create and start the container.
+				args := Args{
+					ID:        testutil.RandomContainerID(),
+					Spec:      spec,
+					BundleDir: bundleDir,
+				}
+				c, err := New(conf, args)
 				if err != nil {
-					err = fmt.Errorf("error getting process data from container: %v", err)
-					return &backoff.PermanentError{Err: err}
+					t.Fatalf("error creating container: %v", err)
+				}
+				defer c.Destroy()
+				if err := c.Start(conf); err != nil {
+					t.Fatalf("error starting container: %v", err)
 				}
-				for _, p := range ps {
-					if strings.Contains(p.Cmd, "sleep") {
-						gotTTYField = p.TTY
-						return nil
+
+				// Wait for sleep to be running, and check the TTY
+				// field.
+				var gotTTYField string
+				cb := func() error {
+					ps, err := c.Processes()
+					if err != nil {
+						err = fmt.Errorf("error getting process data from container: %v", err)
+						return &backoff.PermanentError{Err: err}
+					}
+					for _, p := range ps {
+						if strings.Contains(p.Cmd, "sleep") {
+							gotTTYField = p.TTY
+							return nil
+						}
 					}
+					return fmt.Errorf("sleep not running")
+				}
+				if err := testutil.Poll(cb, 30*time.Second); err != nil {
+					t.Fatalf("error waiting for sleep process: %v", err)
 				}
-				return fmt.Errorf("sleep not running")
-			}
-			if err := testutil.Poll(cb, 30*time.Second); err != nil {
-				t.Fatalf("error waiting for sleep process: %v", err)
-			}
 
-			if gotTTYField != test.wantTTYField {
-				t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField)
-			}
-		})
+				if gotTTYField != test.wantTTYField {
+					t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField)
+				}
+			})
+		}
 	}
 }
 
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 2da93ec5b..207206dd2 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -27,18 +27,19 @@ import (
 	"time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 	var specs []*specs.Spec
 	var ids []string
-	rootID := testutil.UniqueContainerID()
+	rootID := testutil.RandomContainerID()
 
 	for i, cmd := range cmds {
 		spec := testutil.NewSpecWithArgs(cmd...)
@@ -52,7 +53,7 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 				specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer,
 				specutils.ContainerdSandboxIDAnnotation:     rootID,
 			}
-			ids = append(ids, testutil.UniqueContainerID())
+			ids = append(ids, testutil.RandomContainerID())
 		}
 		specs = append(specs, spec)
 	}
@@ -64,23 +65,16 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
 
+	cu := cleanup.Cleanup{}
+	defer cu.Clean()
+
 	var containers []*Container
-	var bundles []string
-	cleanup := func() {
-		for _, c := range containers {
-			c.Destroy()
-		}
-		for _, b := range bundles {
-			os.RemoveAll(b)
-		}
-	}
 	for i, spec := range specs {
-		bundleDir, err := testutil.SetupBundleDir(spec)
+		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 		if err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error setting up container: %v", err)
 		}
-		bundles = append(bundles, bundleDir)
+		cu.Add(cleanup)
 
 		args := Args{
 			ID:        ids[i],
@@ -89,17 +83,17 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		}
 		cont, err := New(conf, args)
 		if err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error creating container: %v", err)
 		}
+		cu.Add(func() { cont.Destroy() })
 		containers = append(containers, cont)
 
 		if err := cont.Start(conf); err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error starting container: %v", err)
 		}
 	}
-	return containers, cleanup, nil
+
+	return containers, cu.Release(), nil
 }
 
 type execDesc struct {
@@ -135,161 +129,161 @@ func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 // TestMultiContainerSanity checks that it is possible to run 2 dead-simple
 // containers in the same sandbox.
 func TestMultiContainerSanity(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
 
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		specs, ids := createSpecs(sleep, sleep)
-		containers, cleanup, err := startContainers(conf, specs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			specs, ids := createSpecs(sleep, sleep)
+			containers, cleanup, err := startContainers(conf, specs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Check via ps that multiple processes are running.
-		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
-		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
-		}
-		if err := waitForProcessList(containers[1], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Check via ps that multiple processes are running.
+			expectedPL := []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+			expectedPL = []*control.Process{
+				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+			}
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+		})
 	}
 }
 
 // TestMultiPIDNS checks that it is possible to run 2 dead-simple
 // containers in the same sandbox with different pidns.
 func TestMultiPIDNS(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		testSpecs, ids := createSpecs(sleep, sleep)
-		testSpecs[1].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{
-					Type: "pid",
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			testSpecs, ids := createSpecs(sleep, sleep)
+			testSpecs[1].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{
+						Type: "pid",
+					},
 				},
-			},
-		}
+			}
 
-		containers, cleanup, err := startContainers(conf, testSpecs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, testSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Check via ps that multiple processes are running.
-		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
-		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[1], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Check via ps that multiple processes are running.
+			expectedPL := []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+			expectedPL = []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+		})
 	}
 }
 
 // TestMultiPIDNSPath checks the pidns path.
 func TestMultiPIDNSPath(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		testSpecs, ids := createSpecs(sleep, sleep, sleep)
-		testSpecs[0].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{
-					Type: "pid",
-					Path: "/proc/1/ns/pid",
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			testSpecs, ids := createSpecs(sleep, sleep, sleep)
+			testSpecs[0].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{
+						Type: "pid",
+						Path: "/proc/1/ns/pid",
+					},
 				},
-			},
-		}
-		testSpecs[1].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{
-					Type: "pid",
-					Path: "/proc/1/ns/pid",
+			}
+			testSpecs[1].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{
+						Type: "pid",
+						Path: "/proc/1/ns/pid",
+					},
 				},
-			},
-		}
-		testSpecs[2].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{
-				{
-					Type: "pid",
-					Path: "/proc/2/ns/pid",
+			}
+			testSpecs[2].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{
+						Type: "pid",
+						Path: "/proc/2/ns/pid",
+					},
 				},
-			},
-		}
+			}
 
-		containers, cleanup, err := startContainers(conf, testSpecs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, testSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Check via ps that multiple processes are running.
-		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
-		if err := waitForProcessList(containers[2], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Check via ps that multiple processes are running.
+			expectedPL := []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+			if err := waitForProcessList(containers[2], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
 
-		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
-		}
-		if err := waitForProcessList(containers[1], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			expectedPL = []*control.Process{
+				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+			}
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+		})
 	}
 }
 
 func TestMultiContainerWait(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// The first container should run the entire duration of the test.
@@ -361,13 +355,13 @@ func TestMultiContainerWait(t *testing.T) {
 // TestExecWait ensures what we can wait containers and individual processes in the
 // sandbox that have already exited.
 func TestExecWait(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// The first container should run the entire duration of the test.
@@ -457,13 +451,13 @@ func TestMultiContainerMount(t *testing.T) {
 	})
 
 	// Setup the containers.
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	containers, cleanup, err := startContainers(conf, sps, ids)
@@ -484,176 +478,176 @@ func TestMultiContainerMount(t *testing.T) {
 // TestMultiContainerSignal checks that it is possible to signal individual
 // containers without killing the entire sandbox.
 func TestMultiContainerSignal(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
 
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		specs, ids := createSpecs(sleep, sleep)
-		containers, cleanup, err := startContainers(conf, specs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			specs, ids := createSpecs(sleep, sleep)
+			containers, cleanup, err := startContainers(conf, specs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Check via ps that container 1 process is running.
-		expectedPL := []*control.Process{
-			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
-		}
+			// Check via ps that container 1 process is running.
+			expectedPL := []*control.Process{
+				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+			}
 
-		if err := waitForProcessList(containers[1], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
 
-		// Kill process 2.
-		if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil {
-			t.Errorf("failed to kill process 2: %v", err)
-		}
+			// Kill process 2.
+			if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil {
+				t.Errorf("failed to kill process 2: %v", err)
+			}
 
-		// Make sure process 1 is still running.
-		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
-		}
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Make sure process 1 is still running.
+			expectedPL = []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
 
-		// goferPid is reset when container is destroyed.
-		goferPid := containers[1].GoferPid
+			// goferPid is reset when container is destroyed.
+			goferPid := containers[1].GoferPid
 
-		// Destroy container and ensure container's gofer process has exited.
-		if err := containers[1].Destroy(); err != nil {
-			t.Errorf("failed to destroy container: %v", err)
-		}
-		_, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) {
-			cpid, err := syscall.Wait4(goferPid, nil, 0, nil)
-			return uintptr(cpid), 0, err
-		})
-		if err != syscall.ECHILD {
-			t.Errorf("error waiting for gofer to exit: %v", err)
-		}
-		// Make sure process 1 is still running.
-		if err := waitForProcessList(containers[0], expectedPL); err != nil {
-			t.Errorf("failed to wait for sleep to start: %v", err)
-		}
+			// Destroy container and ensure container's gofer process has exited.
+			if err := containers[1].Destroy(); err != nil {
+				t.Errorf("failed to destroy container: %v", err)
+			}
+			_, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) {
+				cpid, err := syscall.Wait4(goferPid, nil, 0, nil)
+				return uintptr(cpid), 0, err
+			})
+			if err != syscall.ECHILD {
+				t.Errorf("error waiting for gofer to exit: %v", err)
+			}
+			// Make sure process 1 is still running.
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
 
-		// Now that process 2 is gone, ensure we get an error trying to
-		// signal it again.
-		if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil {
-			t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID)
-		}
+			// Now that process 2 is gone, ensure we get an error trying to
+			// signal it again.
+			if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil {
+				t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID)
+			}
 
-		// Kill process 1.
-		if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil {
-			t.Errorf("failed to kill process 1: %v", err)
-		}
+			// Kill process 1.
+			if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil {
+				t.Errorf("failed to kill process 1: %v", err)
+			}
 
-		// Ensure that container's gofer and sandbox process are no more.
-		err = blockUntilWaitable(containers[0].GoferPid)
-		if err != nil && err != syscall.ECHILD {
-			t.Errorf("error waiting for gofer to exit: %v", err)
-		}
+			// Ensure that container's gofer and sandbox process are no more.
+			err = blockUntilWaitable(containers[0].GoferPid)
+			if err != nil && err != syscall.ECHILD {
+				t.Errorf("error waiting for gofer to exit: %v", err)
+			}
 
-		err = blockUntilWaitable(containers[0].Sandbox.Pid)
-		if err != nil && err != syscall.ECHILD {
-			t.Errorf("error waiting for sandbox to exit: %v", err)
-		}
+			err = blockUntilWaitable(containers[0].Sandbox.Pid)
+			if err != nil && err != syscall.ECHILD {
+				t.Errorf("error waiting for sandbox to exit: %v", err)
+			}
 
-		// The sentry should be gone, so signaling should yield an error.
-		if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil {
-			t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID)
-		}
+			// The sentry should be gone, so signaling should yield an error.
+			if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil {
+				t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID)
+			}
 
-		if err := containers[0].Destroy(); err != nil {
-			t.Errorf("failed to destroy container: %v", err)
-		}
+			if err := containers[0].Destroy(); err != nil {
+				t.Errorf("failed to destroy container: %v", err)
+			}
+		})
 	}
 }
 
 // TestMultiContainerDestroy checks that container are properly cleaned-up when
 // they are destroyed.
 func TestMultiContainerDestroy(t *testing.T) {
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// First container will remain intact while the second container is killed.
-		podSpecs, ids := createSpecs(
-			[]string{"sleep", "100"},
-			[]string{app, "fork-bomb"})
-
-		// Run the fork bomb in a PID namespace to prevent processes to be
-		// re-parented to PID=1 in the root container.
-		podSpecs[1].Linux = &specs.Linux{
-			Namespaces: []specs.LinuxNamespace{{Type: "pid"}},
-		}
-		containers, cleanup, err := startContainers(conf, podSpecs, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// First container will remain intact while the second container is killed.
+			podSpecs, ids := createSpecs(
+				[]string{"sleep", "100"},
+				[]string{app, "fork-bomb"})
+
+			// Run the fork bomb in a PID namespace to prevent processes to be
+			// re-parented to PID=1 in the root container.
+			podSpecs[1].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{{Type: "pid"}},
+			}
+			containers, cleanup, err := startContainers(conf, podSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		// Exec more processes to ensure signal all works for exec'd processes too.
-		args := &control.ExecArgs{
-			Filename: app,
-			Argv:     []string{app, "fork-bomb"},
-		}
-		if _, err := containers[1].Execute(args); err != nil {
-			t.Fatalf("error exec'ing: %v", err)
-		}
+			// Exec more processes to ensure signal all works for exec'd processes too.
+			args := &control.ExecArgs{
+				Filename: app,
+				Argv:     []string{app, "fork-bomb"},
+			}
+			if _, err := containers[1].Execute(args); err != nil {
+				t.Fatalf("error exec'ing: %v", err)
+			}
 
-		// Let it brew...
-		time.Sleep(500 * time.Millisecond)
+			// Let it brew...
+			time.Sleep(500 * time.Millisecond)
 
-		if err := containers[1].Destroy(); err != nil {
-			t.Fatalf("error destroying container: %v", err)
-		}
+			if err := containers[1].Destroy(); err != nil {
+				t.Fatalf("error destroying container: %v", err)
+			}
 
-		// Check that destroy killed all processes belonging to the container and
-		// waited for them to exit before returning.
-		pss, err := containers[0].Sandbox.Processes("")
-		if err != nil {
-			t.Fatalf("error getting process data from sandbox: %v", err)
-		}
-		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
-		if r, err := procListsEqual(pss, expectedPL); !r {
-			t.Errorf("container got process list: %s, want: %s: error: %v",
-				procListToString(pss), procListToString(expectedPL), err)
-		}
+			// Check that destroy killed all processes belonging to the container and
+			// waited for them to exit before returning.
+			pss, err := containers[0].Sandbox.Processes("")
+			if err != nil {
+				t.Fatalf("error getting process data from sandbox: %v", err)
+			}
+			expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+			if r, err := procListsEqual(pss, expectedPL); !r {
+				t.Errorf("container got process list: %s, want: %s: error: %v",
+					procListToString(pss), procListToString(expectedPL), err)
+			}
 
-		// Check that cont.Destroy is safe to call multiple times.
-		if err := containers[1].Destroy(); err != nil {
-			t.Errorf("error destroying container: %v", err)
-		}
+			// Check that cont.Destroy is safe to call multiple times.
+			if err := containers[1].Destroy(); err != nil {
+				t.Errorf("error destroying container: %v", err)
+			}
+		})
 	}
 }
 
 func TestMultiContainerProcesses(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Note: use curly braces to keep 'sh' process around. Otherwise, shell
@@ -706,13 +700,13 @@ func TestMultiContainerProcesses(t *testing.T) {
 // TestMultiContainerKillAll checks that all process that belong to a container
 // are killed when SIGKILL is sent to *all* processes in that container.
 func TestMultiContainerKillAll(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	for _, tc := range []struct {
@@ -721,7 +715,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 		{killContainer: true},
 		{killContainer: false},
 	} {
-		app, err := testutil.FindFile("runsc/container/test_app/test_app")
+		app, err := testutil.FindFile("test/cmd/test_app/test_app")
 		if err != nil {
 			t.Fatal("error finding test_app:", err)
 		}
@@ -739,11 +733,11 @@ func TestMultiContainerKillAll(t *testing.T) {
 		// Wait until all processes are created.
 		rootProcCount := int(math.Pow(2, 3) - 1)
 		if err := waitForProcessCount(containers[0], rootProcCount); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waitting for processes: %v", err)
 		}
 		procCount := int(math.Pow(2, 5) - 1)
 		if err := waitForProcessCount(containers[1], procCount); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waiting for processes: %v", err)
 		}
 
 		// Exec more processes to ensure signal works for exec'd processes too.
@@ -757,7 +751,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 		// Wait for these new processes to start.
 		procCount += int(math.Pow(2, 3) - 1)
 		if err := waitForProcessCount(containers[1], procCount); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waiting for processes: %v", err)
 		}
 
 		if tc.killContainer {
@@ -790,11 +784,11 @@ func TestMultiContainerKillAll(t *testing.T) {
 
 		// Check that all processes are gone.
 		if err := waitForProcessCount(containers[1], 0); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waiting for processes: %v", err)
 		}
 		// Check that root container was not affected.
 		if err := waitForProcessCount(containers[0], rootProcCount); err != nil {
-			t.Fatal(err)
+			t.Fatalf("error waiting for processes: %v", err)
 		}
 	}
 }
@@ -804,18 +798,17 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 		[]string{"/bin/sleep", "100"},
 		[]string{"/bin/sleep", "100"})
 
-	conf := testutil.TestConfig()
-	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(rootBundleDir)
+	defer cleanup()
 
 	rootArgs := Args{
 		ID:        ids[0],
 		Spec:      specs[0],
-		BundleDir: rootBundleDir,
+		BundleDir: bundleDir,
 	}
 	root, err := New(conf, rootArgs)
 	if err != nil {
@@ -827,11 +820,11 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 	}
 
 	// Create and destroy sub-container.
-	bundleDir, err := testutil.SetupBundleDir(specs[1])
+	bundleDir, cleanupSub, err := testutil.SetupBundleDir(specs[1])
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(bundleDir)
+	defer cleanupSub()
 
 	args := Args{
 		ID:        ids[1],
@@ -858,18 +851,17 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	}
 	specs, ids := createSpecs(cmds...)
 
-	conf := testutil.TestConfig()
-	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
+	conf := testutil.TestConfig(t)
+	rootDir, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(rootBundleDir)
+	defer cleanup()
 
 	rootArgs := Args{
 		ID:        ids[0],
 		Spec:      specs[0],
-		BundleDir: rootBundleDir,
+		BundleDir: bundleDir,
 	}
 	root, err := New(conf, rootArgs)
 	if err != nil {
@@ -886,16 +878,16 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 			continue // skip root container
 		}
 
-		bundleDir, err := testutil.SetupBundleDir(specs[i])
+		bundleDir, cleanup, err := testutil.SetupBundleDir(specs[i])
 		if err != nil {
 			t.Fatalf("error setting up container: %v", err)
 		}
-		defer os.RemoveAll(bundleDir)
+		defer cleanup()
 
 		rootArgs := Args{
 			ID:        ids[i],
 			Spec:      specs[i],
-			BundleDir: rootBundleDir,
+			BundleDir: bundleDir,
 		}
 		cont, err := New(conf, rootArgs)
 		if err != nil {
@@ -937,13 +929,13 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 	script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename)
 	cmd := []string{"sh", "-c", script}
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Make sure overlay is enabled, and none of the root filesystems are
@@ -977,7 +969,7 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 // TestMultiContainerContainerDestroyStress tests that IO operations continue
 // to work after containers have been stopped and gofers killed.
 func TestMultiContainerContainerDestroyStress(t *testing.T) {
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -1006,13 +998,12 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 	childrenSpecs := allSpecs[1:]
 	childrenIDs := allIDs[1:]
 
-	conf := testutil.TestConfig()
-	rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf)
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(rootSpec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Start root container.
 	rootArgs := Args{
@@ -1038,11 +1029,11 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 
 		var children []*Container
 		for j, spec := range specs {
-			bundleDir, err := testutil.SetupBundleDir(spec)
+			bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
-			defer os.RemoveAll(bundleDir)
+			defer cleanup()
 
 			args := Args{
 				ID:        ids[j],
@@ -1080,308 +1071,308 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 // Test that pod shared mounts are properly mounted in 2 containers and that
 // changes from one container is reflected in the other.
 func TestMultiContainerSharedMount(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		podSpec, ids := createSpecs(sleep, sleep)
-		mnt0 := specs.Mount{
-			Destination: "/mydir/test",
-			Source:      "/some/dir",
-			Type:        "tmpfs",
-			Options:     nil,
-		}
-		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     nil,
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
-		mnt1 := mnt0
-		mnt1.Destination = "/mydir2/test2"
-		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
 
-		createSharedMount(mnt0, "test-mount", podSpec...)
+			createSharedMount(mnt0, "test-mount", podSpec...)
 
-		containers, cleanup, err := startContainers(conf, podSpec, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		file0 := path.Join(mnt0.Destination, "abc")
-		file1 := path.Join(mnt1.Destination, "abc")
-		execs := []execDesc{
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-				desc: "directory is mounted in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-				desc: "directory is mounted in container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/touch", file0},
-				desc: "create file in container0",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-f", file0},
-				desc: "file appears in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-f", file1},
-				desc: "file appears in container1",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/bin/rm", file1},
-				desc: "file removed from container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "!", "-f", file0},
-				desc: "file removed from container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "!", "-f", file1},
-				desc: "file removed from container1",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/bin/mkdir", file1},
-				desc: "create directory in container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-d", file0},
-				desc: "dir appears in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-d", file1},
-				desc: "dir appears in container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/bin/rmdir", file0},
-				desc: "create directory in container0",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "!", "-d", file0},
-				desc: "dir removed from container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "!", "-d", file1},
-				desc: "dir removed from container1",
-			},
-		}
-		if err := execMany(execs); err != nil {
-			t.Fatal(err.Error())
-		}
+			file0 := path.Join(mnt0.Destination, "abc")
+			file1 := path.Join(mnt1.Destination, "abc")
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+					desc: "directory is mounted in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+					desc: "directory is mounted in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", file0},
+					desc: "create file in container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file appears in container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/rm", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
+					desc: "file removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/mkdir", file1},
+					desc: "create directory in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", file0},
+					desc: "dir appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", file1},
+					desc: "dir appears in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/bin/rmdir", file0},
+					desc: "create directory in container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-d", file0},
+					desc: "dir removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-d", file1},
+					desc: "dir removed from container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
 	}
 }
 
 // Test that pod mounts are mounted as readonly when requested.
 func TestMultiContainerSharedMountReadonly(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		podSpec, ids := createSpecs(sleep, sleep)
-		mnt0 := specs.Mount{
-			Destination: "/mydir/test",
-			Source:      "/some/dir",
-			Type:        "tmpfs",
-			Options:     []string{"ro"},
-		}
-		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     []string{"ro"},
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
-		mnt1 := mnt0
-		mnt1.Destination = "/mydir2/test2"
-		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
 
-		createSharedMount(mnt0, "test-mount", podSpec...)
+			createSharedMount(mnt0, "test-mount", podSpec...)
 
-		containers, cleanup, err := startContainers(conf, podSpec, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		file0 := path.Join(mnt0.Destination, "abc")
-		file1 := path.Join(mnt1.Destination, "abc")
-		execs := []execDesc{
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-				desc: "directory is mounted in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-				desc: "directory is mounted in container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/touch", file0},
-				want: 1,
-				desc: "fails to write to container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/touch", file1},
-				want: 1,
-				desc: "fails to write to container1",
-			},
-		}
-		if err := execMany(execs); err != nil {
-			t.Fatal(err.Error())
-		}
+			file0 := path.Join(mnt0.Destination, "abc")
+			file1 := path.Join(mnt1.Destination, "abc")
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+					desc: "directory is mounted in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+					desc: "directory is mounted in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", file0},
+					want: 1,
+					desc: "fails to write to container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/touch", file1},
+					want: 1,
+					desc: "fails to write to container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
 	}
 }
 
 // Test that shared pod mounts continue to work after container is restarted.
 func TestMultiContainerSharedMountRestart(t *testing.T) {
-	for _, conf := range configs(all...) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			t.Fatalf("error creating root dir: %v", err)
-		}
-		defer os.RemoveAll(rootDir)
-		conf.RootDir = rootDir
-
-		// Setup the containers.
-		sleep := []string{"sleep", "100"}
-		podSpec, ids := createSpecs(sleep, sleep)
-		mnt0 := specs.Mount{
-			Destination: "/mydir/test",
-			Source:      "/some/dir",
-			Type:        "tmpfs",
-			Options:     nil,
-		}
-		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     nil,
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
-		mnt1 := mnt0
-		mnt1.Destination = "/mydir2/test2"
-		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
 
-		createSharedMount(mnt0, "test-mount", podSpec...)
+			createSharedMount(mnt0, "test-mount", podSpec...)
 
-		containers, cleanup, err := startContainers(conf, podSpec, ids)
-		if err != nil {
-			t.Fatalf("error starting containers: %v", err)
-		}
-		defer cleanup()
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-		file0 := path.Join(mnt0.Destination, "abc")
-		file1 := path.Join(mnt1.Destination, "abc")
-		execs := []execDesc{
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/touch", file0},
-				desc: "create file in container0",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-f", file0},
-				desc: "file appears in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-f", file1},
-				desc: "file appears in container1",
-			},
-		}
-		if err := execMany(execs); err != nil {
-			t.Fatal(err.Error())
-		}
+			file0 := path.Join(mnt0.Destination, "abc")
+			file1 := path.Join(mnt1.Destination, "abc")
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", file0},
+					desc: "create file in container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file appears in container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
 
-		containers[1].Destroy()
+			containers[1].Destroy()
 
-		bundleDir, err := testutil.SetupBundleDir(podSpec[1])
-		if err != nil {
-			t.Fatalf("error restarting container: %v", err)
-		}
-		defer os.RemoveAll(bundleDir)
+			bundleDir, cleanup, err := testutil.SetupBundleDir(podSpec[1])
+			if err != nil {
+				t.Fatalf("error restarting container: %v", err)
+			}
+			defer cleanup()
 
-		args := Args{
-			ID:        ids[1],
-			Spec:      podSpec[1],
-			BundleDir: bundleDir,
-		}
-		containers[1], err = New(conf, args)
-		if err != nil {
-			t.Fatalf("error creating container: %v", err)
-		}
-		if err := containers[1].Start(conf); err != nil {
-			t.Fatalf("error starting container: %v", err)
-		}
+			args := Args{
+				ID:        ids[1],
+				Spec:      podSpec[1],
+				BundleDir: bundleDir,
+			}
+			containers[1], err = New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			if err := containers[1].Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
 
-		execs = []execDesc{
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "-f", file0},
-				desc: "file is still in container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "-f", file1},
-				desc: "file is still in container1",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/bin/rm", file1},
-				desc: "file removed from container1",
-			},
-			{
-				c:    containers[0],
-				cmd:  []string{"/usr/bin/test", "!", "-f", file0},
-				desc: "file removed from container0",
-			},
-			{
-				c:    containers[1],
-				cmd:  []string{"/usr/bin/test", "!", "-f", file1},
-				desc: "file removed from container1",
-			},
-		}
-		if err := execMany(execs); err != nil {
-			t.Fatal(err.Error())
-		}
+			execs = []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file is still in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file is still in container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/rm", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
+					desc: "file removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
+					desc: "file removed from container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
 	}
 }
 
 // Test that unsupported pod mounts options are ignored when matching master and
 // slave mounts.
 func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Setup the containers.
@@ -1428,7 +1419,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 // Test that one container can send an FD to another container, even though
 // they have distinct MountNamespaces.
 func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
-	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
 	if err != nil {
 		t.Fatal("error finding test_app:", err)
 	}
@@ -1457,13 +1448,13 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 		Type:        "tmpfs",
 	}
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Create the specs.
@@ -1494,13 +1485,13 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 
 // Test that container is destroyed when Gofer is killed.
 func TestMultiContainerGoferKilled(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	sleep := []string{"sleep", "100"}
@@ -1581,13 +1572,13 @@ func TestMultiContainerLoadSandbox(t *testing.T) {
 	sleep := []string{"sleep", "100"}
 	specs, ids := createSpecs(sleep, sleep, sleep)
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	// Create containers for the sandbox.
@@ -1614,7 +1605,7 @@ func TestMultiContainerLoadSandbox(t *testing.T) {
 	}
 
 	// Create a valid but empty container directory.
-	randomCID := testutil.UniqueContainerID()
+	randomCID := testutil.RandomContainerID()
 	dir = filepath.Join(conf.RootDir, randomCID)
 	if err := os.MkdirAll(dir, 0755); err != nil {
 		t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
@@ -1681,13 +1672,13 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 		Type:        "bind",
 	})
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	pod, cleanup, err := startContainers(conf, podSpecs, ids)
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index dc4194134..bac177a88 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -24,16 +24,15 @@ import (
 
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 // TestSharedVolume checks that modifications to a volume mount are propagated
 // into and out of the sandbox.
 func TestSharedVolume(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.FileAccess = boot.FileAccessShared
-	t.Logf("Running test with conf: %+v", conf)
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -44,16 +43,15 @@ func TestSharedVolume(t *testing.T) {
 		t.Fatalf("TempDir failed: %v", err)
 	}
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
@@ -190,9 +188,8 @@ func checkFile(c *Container, filename string, want []byte) error {
 // TestSharedVolumeFile tests that changes to file content outside the sandbox
 // is reflected inside.
 func TestSharedVolumeFile(t *testing.T) {
-	conf := testutil.TestConfig()
+	conf := testutil.TestConfig(t)
 	conf.FileAccess = boot.FileAccessShared
-	t.Logf("Running test with conf: %+v", conf)
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -203,16 +200,15 @@ func TestSharedVolumeFile(t *testing.T) {
 		t.Fatalf("TempDir failed: %v", err)
 	}
 
-	rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
 	// Create and start the container.
 	args := Args{
-		ID:        testutil.UniqueContainerID(),
+		ID:        testutil.RandomContainerID(),
 		Spec:      spec,
 		BundleDir: bundleDir,
 	}
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
deleted file mode 100644
index 1ff5e8cc3..000000000
--- a/runsc/dockerutil/dockerutil.go
+++ /dev/null
@@ -1,476 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package dockerutil is a collection of utility functions, primarily for
-// testing.
-package dockerutil
-
-import (
-	"encoding/json"
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"log"
-	"os"
-	"os/exec"
-	"path"
-	"regexp"
-	"strconv"
-	"strings"
-	"syscall"
-	"time"
-
-	"github.com/kr/pty"
-	"gvisor.dev/gvisor/runsc/testutil"
-)
-
-var (
-	runtime = flag.String("runtime", "runsc", "specify which runtime to use")
-	config  = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths")
-)
-
-// EnsureSupportedDockerVersion checks if correct docker is installed.
-func EnsureSupportedDockerVersion() {
-	cmd := exec.Command("docker", "version")
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		log.Fatalf("Error running %q: %v", "docker version", err)
-	}
-	re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`)
-	matches := re.FindStringSubmatch(string(out))
-	if len(matches) != 3 {
-		log.Fatalf("Invalid docker output: %s", out)
-	}
-	major, _ := strconv.Atoi(matches[1])
-	minor, _ := strconv.Atoi(matches[2])
-	if major < 17 || (major == 17 && minor < 9) {
-		log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor)
-	}
-}
-
-// RuntimePath returns the binary path for the current runtime.
-func RuntimePath() (string, error) {
-	// Read the configuration data; the file must exist.
-	configBytes, err := ioutil.ReadFile(*config)
-	if err != nil {
-		return "", err
-	}
-
-	// Unmarshal the configuration.
-	c := make(map[string]interface{})
-	if err := json.Unmarshal(configBytes, &c); err != nil {
-		return "", err
-	}
-
-	// Decode the expected configuration.
-	r, ok := c["runtimes"]
-	if !ok {
-		return "", fmt.Errorf("no runtimes declared: %v", c)
-	}
-	rs, ok := r.(map[string]interface{})
-	if !ok {
-		// The runtimes are not a map.
-		return "", fmt.Errorf("unexpected format: %v", c)
-	}
-	r, ok = rs[*runtime]
-	if !ok {
-		// The expected runtime is not declared.
-		return "", fmt.Errorf("runtime %q not found: %v", *runtime, c)
-	}
-	rs, ok = r.(map[string]interface{})
-	if !ok {
-		// The runtime is not a map.
-		return "", fmt.Errorf("unexpected format: %v", c)
-	}
-	p, ok := rs["path"].(string)
-	if !ok {
-		// The runtime does not declare a path.
-		return "", fmt.Errorf("unexpected format: %v", c)
-	}
-	return p, nil
-}
-
-// MountMode describes if the mount should be ro or rw.
-type MountMode int
-
-const (
-	// ReadOnly is what the name says.
-	ReadOnly MountMode = iota
-	// ReadWrite is what the name says.
-	ReadWrite
-)
-
-// String returns the mount mode argument for this MountMode.
-func (m MountMode) String() string {
-	switch m {
-	case ReadOnly:
-		return "ro"
-	case ReadWrite:
-		return "rw"
-	}
-	panic(fmt.Sprintf("invalid mode: %d", m))
-}
-
-// MountArg formats the volume argument to mount in the container.
-func MountArg(source, target string, mode MountMode) string {
-	return fmt.Sprintf("-v=%s:%s:%v", source, target, mode)
-}
-
-// LinkArg formats the link argument.
-func LinkArg(source *Docker, target string) string {
-	return fmt.Sprintf("--link=%s:%s", source.Name, target)
-}
-
-// PrepareFiles creates temp directory to copy files there. The sandbox doesn't
-// have access to files in the test dir.
-func PrepareFiles(names ...string) (string, error) {
-	dir, err := ioutil.TempDir("", "image-test")
-	if err != nil {
-		return "", fmt.Errorf("ioutil.TempDir failed: %v", err)
-	}
-	if err := os.Chmod(dir, 0777); err != nil {
-		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
-	}
-	for _, name := range names {
-		src, err := testutil.FindFile(name)
-		if err != nil {
-			return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err)
-		}
-		dst := path.Join(dir, path.Base(name))
-		if err := testutil.Copy(src, dst); err != nil {
-			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
-		}
-	}
-	return dir, nil
-}
-
-// do executes docker command.
-func do(args ...string) (string, error) {
-	log.Printf("Running: docker %s\n", args)
-	cmd := exec.Command("docker", args...)
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		return "", fmt.Errorf("error executing docker %s: %v\nout: %s", args, err, out)
-	}
-	return string(out), nil
-}
-
-// doWithPty executes docker command with stdio attached to a pty.
-func doWithPty(args ...string) (*exec.Cmd, *os.File, error) {
-	log.Printf("Running with pty: docker %s\n", args)
-	cmd := exec.Command("docker", args...)
-	ptmx, err := pty.Start(cmd)
-	if err != nil {
-		return nil, nil, fmt.Errorf("error executing docker %s with a pty: %v", args, err)
-	}
-	return cmd, ptmx, nil
-}
-
-// Pull pulls a docker image. This is used in tests to isolate the
-// time to pull the image off the network from the time to actually
-// start the container, to avoid timeouts over slow networks.
-func Pull(image string) error {
-	_, err := do("pull", image)
-	return err
-}
-
-// Docker contains the name and the runtime of a docker container.
-type Docker struct {
-	Runtime string
-	Name    string
-}
-
-// MakeDocker sets up the struct for a Docker container.
-// Names of containers will be unique.
-func MakeDocker(namePrefix string) Docker {
-	return Docker{
-		Name:    testutil.RandomName(namePrefix),
-		Runtime: *runtime,
-	}
-}
-
-// logDockerID logs a container id, which is needed to find container runsc logs.
-func (d *Docker) logDockerID() {
-	id, err := d.ID()
-	if err != nil {
-		log.Printf("%v\n", err)
-	}
-	log.Printf("Name: %s ID: %v\n", d.Name, id)
-}
-
-// Create calls 'docker create' with the arguments provided.
-func (d *Docker) Create(args ...string) error {
-	a := []string{"create", "--runtime", d.Runtime, "--name", d.Name}
-	a = append(a, args...)
-	_, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return err
-}
-
-// Start calls 'docker start'.
-func (d *Docker) Start() error {
-	if _, err := do("start", d.Name); err != nil {
-		return fmt.Errorf("error starting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Stop calls 'docker stop'.
-func (d *Docker) Stop() error {
-	if _, err := do("stop", d.Name); err != nil {
-		return fmt.Errorf("error stopping container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Run calls 'docker run' with the arguments provided. The container starts
-// running in the background and the call returns immediately.
-func (d *Docker) Run(args ...string) error {
-	a := d.runArgs("-d")
-	a = append(a, args...)
-	_, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return err
-}
-
-// RunWithPty is like Run but with an attached pty.
-func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) {
-	a := d.runArgs("-it")
-	a = append(a, args...)
-	return doWithPty(a...)
-}
-
-// RunFg calls 'docker run' with the arguments provided in the foreground. It
-// blocks until the container exits and returns the output.
-func (d *Docker) RunFg(args ...string) (string, error) {
-	a := d.runArgs(args...)
-	out, err := do(a...)
-	if err == nil {
-		d.logDockerID()
-	}
-	return string(out), err
-}
-
-func (d *Docker) runArgs(args ...string) []string {
-	// Environment variable RUNSC_TEST_NAME is picked up by the runtime and added
-	// to the log name, so one can easily identify the corresponding logs for
-	// this test.
-	rv := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-e", "RUNSC_TEST_NAME=" + d.Name}
-	return append(rv, args...)
-}
-
-// Logs calls 'docker logs'.
-func (d *Docker) Logs() (string, error) {
-	return do("logs", d.Name)
-}
-
-// Exec calls 'docker exec' with the arguments provided.
-func (d *Docker) Exec(args ...string) (string, error) {
-	return d.ExecWithFlags(nil, args...)
-}
-
-// ExecWithFlags calls 'docker exec <flags> name <args>'.
-func (d *Docker) ExecWithFlags(flags []string, args ...string) (string, error) {
-	a := []string{"exec"}
-	a = append(a, flags...)
-	a = append(a, d.Name)
-	a = append(a, args...)
-	return do(a...)
-}
-
-// ExecAsUser calls 'docker exec' as the given user with the arguments
-// provided.
-func (d *Docker) ExecAsUser(user string, args ...string) (string, error) {
-	a := []string{"exec", "--user", user, d.Name}
-	a = append(a, args...)
-	return do(a...)
-}
-
-// ExecWithTerminal calls 'docker exec -it' with the arguments provided and
-// attaches a pty to stdio.
-func (d *Docker) ExecWithTerminal(args ...string) (*exec.Cmd, *os.File, error) {
-	a := []string{"exec", "-it", d.Name}
-	a = append(a, args...)
-	return doWithPty(a...)
-}
-
-// Pause calls 'docker pause'.
-func (d *Docker) Pause() error {
-	if _, err := do("pause", d.Name); err != nil {
-		return fmt.Errorf("error pausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Unpause calls 'docker pause'.
-func (d *Docker) Unpause() error {
-	if _, err := do("unpause", d.Name); err != nil {
-		return fmt.Errorf("error unpausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Checkpoint calls 'docker checkpoint'.
-func (d *Docker) Checkpoint(name string) error {
-	if _, err := do("checkpoint", "create", d.Name, name); err != nil {
-		return fmt.Errorf("error pausing container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Restore calls 'docker start --checkname [name]'.
-func (d *Docker) Restore(name string) error {
-	if _, err := do("start", "--checkpoint", name, d.Name); err != nil {
-		return fmt.Errorf("error starting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// Remove calls 'docker rm'.
-func (d *Docker) Remove() error {
-	if _, err := do("rm", d.Name); err != nil {
-		return fmt.Errorf("error deleting container %q: %v", d.Name, err)
-	}
-	return nil
-}
-
-// CleanUp kills and deletes the container (best effort).
-func (d *Docker) CleanUp() {
-	d.logDockerID()
-	if _, err := do("kill", d.Name); err != nil {
-		if strings.Contains(err.Error(), "is not running") {
-			// Nothing to kill. Don't log the error in this case.
-		} else {
-			log.Printf("error killing container %q: %v", d.Name, err)
-		}
-	}
-	if err := d.Remove(); err != nil {
-		log.Print(err)
-	}
-}
-
-// FindPort returns the host port that is mapped to 'sandboxPort'. This calls
-// docker to allocate a free port in the host and prevent conflicts.
-func (d *Docker) FindPort(sandboxPort int) (int, error) {
-	format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort)
-	out, err := do("inspect", "-f", format, d.Name)
-	if err != nil {
-		return -1, fmt.Errorf("error retrieving port: %v", err)
-	}
-	port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-	if err != nil {
-		return -1, fmt.Errorf("error parsing port %q: %v", out, err)
-	}
-	return port, nil
-}
-
-// FindIP returns the IP address of the container as a string.
-func (d *Docker) FindIP() (string, error) {
-	const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
-	out, err := do("inspect", "-f", format, d.Name)
-	if err != nil {
-		return "", fmt.Errorf("error retrieving IP: %v", err)
-	}
-	return strings.TrimSpace(out), nil
-}
-
-// SandboxPid returns the PID to the sandbox process.
-func (d *Docker) SandboxPid() (int, error) {
-	out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
-	if err != nil {
-		return -1, fmt.Errorf("error retrieving pid: %v", err)
-	}
-	pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-	if err != nil {
-		return -1, fmt.Errorf("error parsing pid %q: %v", out, err)
-	}
-	return pid, nil
-}
-
-// ID returns the container ID.
-func (d *Docker) ID() (string, error) {
-	out, err := do("inspect", "-f={{.Id}}", d.Name)
-	if err != nil {
-		return "", fmt.Errorf("error retrieving ID: %v", err)
-	}
-	return strings.TrimSpace(string(out)), nil
-}
-
-// Wait waits for container to exit, up to the given timeout. Returns error if
-// wait fails or timeout is hit. Returns the application return code otherwise.
-// Note that the application may have failed even if err == nil, always check
-// the exit code.
-func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) {
-	timeoutChan := time.After(timeout)
-	waitChan := make(chan (syscall.WaitStatus))
-	errChan := make(chan (error))
-
-	go func() {
-		out, err := do("wait", d.Name)
-		if err != nil {
-			errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err)
-		}
-		exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n"))
-		if err != nil {
-			errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err)
-		}
-		waitChan <- syscall.WaitStatus(uint32(exit))
-	}()
-
-	select {
-	case ws := <-waitChan:
-		return ws, nil
-	case err := <-errChan:
-		return syscall.WaitStatus(1), err
-	case <-timeoutChan:
-		return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name)
-	}
-}
-
-// WaitForOutput calls 'docker logs' to retrieve containers output and searches
-// for the given pattern.
-func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) {
-	matches, err := d.WaitForOutputSubmatch(pattern, timeout)
-	if err != nil {
-		return "", err
-	}
-	if len(matches) == 0 {
-		return "", nil
-	}
-	return matches[0], nil
-}
-
-// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and
-// searches for the given pattern. It returns any regexp submatches as well.
-func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) {
-	re := regexp.MustCompile(pattern)
-	var out string
-	for exp := time.Now().Add(timeout); time.Now().Before(exp); {
-		var err error
-		out, err = d.Logs()
-		if err != nil {
-			return nil, err
-		}
-		if matches := re.FindStringSubmatch(out); matches != nil {
-			// Success!
-			return matches, nil
-		}
-		time.Sleep(100 * time.Millisecond)
-	}
-	return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), out)
-}
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 64a406ae2..1036b0630 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -13,12 +13,12 @@ go_library(
     visibility = ["//runsc:__subpackages__"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/cleanup",
         "//pkg/fd",
         "//pkg/log",
         "//pkg/p9",
         "//pkg/sync",
         "//pkg/syserr",
-        "//runsc/specutils",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index cadd83273..edc239013 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -33,11 +33,11 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 const (
@@ -439,7 +439,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
 	if err != nil {
 		return nil, nil, p9.QID{}, 0, extractErrno(err)
 	}
-	cu := specutils.MakeCleanup(func() {
+	cu := cleanup.Make(func() {
 		child.Close()
 		// Best effort attempt to remove the file in case of failure.
 		if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
@@ -480,7 +480,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 	if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
-	cu := specutils.MakeCleanup(func() {
+	cu := cleanup.Make(func() {
 		// Best effort attempt to remove the dir in case of failure.
 		if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil {
 			log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err)
@@ -767,22 +767,18 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	return err
 }
 
-// TODO(b/127675828): support getxattr.
 func (*localFile) GetXattr(string, uint64) (string, error) {
 	return "", syscall.EOPNOTSUPP
 }
 
-// TODO(b/127675828): support setxattr.
 func (*localFile) SetXattr(string, string, uint32) error {
 	return syscall.EOPNOTSUPP
 }
 
-// TODO(b/148303075): support listxattr.
 func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
 	return nil, syscall.EOPNOTSUPP
 }
 
-// TODO(b/148303075): support removexattr.
 func (*localFile) RemoveXattr(string) error {
 	return syscall.EOPNOTSUPP
 }
@@ -868,7 +864,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
 	if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
-	cu := specutils.MakeCleanup(func() {
+	cu := cleanup.Make(func() {
 		// Best effort attempt to remove the symlink in case of failure.
 		if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
diff --git a/runsc/main.go b/runsc/main.go
index 762b0f801..920ed84a5 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -54,9 +54,11 @@ var (
 
 	// Debugging flags.
 	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+	panicLog        = flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
 	logPackets      = flag.Bool("log-packets", false, "enable network packet logging.")
 	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
 	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
+	panicLogFD      = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
 	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
 	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
 
@@ -70,10 +72,11 @@ var (
 	network            = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
 	hardwareGSO        = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
 	softwareGSO        = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.")
+	qDisc              = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
 	fileAccess         = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
 	fsGoferHostUDS     = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
 	overlay            = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
-	overlayfsStaleRead = flag.Bool("overlayfs-stale-read", false, "reopen cached FDs after a file is opened for write to workaround overlayfs limitation on kernels before 4.19.")
+	overlayfsStaleRead = flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
 	watchdogAction     = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
 	panicSignal        = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
 	profile            = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
@@ -82,6 +85,7 @@ var (
 	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
 	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
 	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+	vfs2Enabled        = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
 
 	// Test flags, not to be used outside tests, ever.
 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -116,8 +120,8 @@ func main() {
 	subcommands.Register(new(cmd.Resume), "")
 	subcommands.Register(new(cmd.Run), "")
 	subcommands.Register(new(cmd.Spec), "")
-	subcommands.Register(new(cmd.Start), "")
 	subcommands.Register(new(cmd.State), "")
+	subcommands.Register(new(cmd.Start), "")
 	subcommands.Register(new(cmd.Wait), "")
 
 	// Register internal commands with the internal group name. This causes
@@ -127,6 +131,7 @@ func main() {
 	subcommands.Register(new(cmd.Boot), internalGroup)
 	subcommands.Register(new(cmd.Debug), internalGroup)
 	subcommands.Register(new(cmd.Gofer), internalGroup)
+	subcommands.Register(new(cmd.Statefile), internalGroup)
 
 	// All subcommands must be registered before flag parsing.
 	flag.Parse()
@@ -194,6 +199,11 @@ func main() {
 		cmd.Fatalf("%v", err)
 	}
 
+	queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc)
+	if err != nil {
+		cmd.Fatalf("%s", err)
+	}
+
 	// Sets the reference leak check mode. Also set it in config below to
 	// propagate it to child processes.
 	refs.SetLeakMode(refsLeakMode)
@@ -205,6 +215,7 @@ func main() {
 		LogFilename:        *logFilename,
 		LogFormat:          *logFormat,
 		DebugLog:           *debugLog,
+		PanicLog:           *panicLog,
 		DebugLogFormat:     *debugLogFormat,
 		FileAccess:         fsAccess,
 		FSGoferHostUDS:     *fsGoferHostUDS,
@@ -226,7 +237,8 @@ func main() {
 		ReferenceLeakMode:  refsLeakMode,
 		OverlayfsStaleRead: *overlayfsStaleRead,
 		CPUNumFromQuota:    *cpuNumFromQuota,
-
+		VFS2:               *vfs2Enabled,
+		QDisc:              queueingDiscipline,
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
 		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
 	}
@@ -257,20 +269,6 @@ func main() {
 	if *debugLogFD > -1 {
 		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
 
-		// Quick sanity check to make sure no other commands get passed
-		// a log fd (they should use log dir instead).
-		if subcommand != "boot" && subcommand != "gofer" {
-			cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
-		}
-
-		// If we are the boot process, then we own our stdio FDs and can do what we
-		// want with them. Since Docker and Containerd both eat boot's stderr, we
-		// dup our stderr to the provided log FD so that panics will appear in the
-		// logs, rather than just disappear.
-		if err := syscall.Dup3(int(f.Fd()), int(os.Stderr.Fd()), 0); err != nil {
-			cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err)
-		}
-
 		e = newEmitter(*debugLogFormat, f)
 
 	} else if *debugLog != "" {
@@ -286,7 +284,25 @@ func main() {
 		e = newEmitter("text", ioutil.Discard)
 	}
 
-	if *alsoLogToStderr {
+	if *panicLogFD > -1 || *debugLogFD > -1 {
+		fd := *panicLogFD
+		if fd < 0 {
+			fd = *debugLogFD
+		}
+		// Quick sanity check to make sure no other commands get passed
+		// a log fd (they should use log dir instead).
+		if subcommand != "boot" && subcommand != "gofer" {
+			cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
+		}
+
+		// If we are the boot process, then we own our stdio FDs and can do what we
+		// want with them. Since Docker and Containerd both eat boot's stderr, we
+		// dup our stderr to the provided log FD so that panics will appear in the
+		// logs, rather than just disappear.
+		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
+			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
+		}
+	} else if *alsoLogToStderr {
 		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
 	}
 
@@ -303,6 +319,7 @@ func main() {
 	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
 	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
 	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
 	log.Infof("***************************")
 
 	if *testOnlyAllowRunAsCurrentUserWithoutChroot {
@@ -319,7 +336,7 @@ func main() {
 		log.Infof("Exiting with status: %v", ws)
 		if ws.Signaled() {
 			// No good way to return it, emulate what the shell does. Maybe raise
-			// signall to self?
+			// signal to self?
 			os.Exit(128 + int(ws.Signal()))
 		}
 		os.Exit(ws.ExitStatus())
@@ -332,11 +349,11 @@ func main() {
 func newEmitter(format string, logFile io.Writer) log.Emitter {
 	switch format {
 	case "text":
-		return &log.GoogleEmitter{log.Writer{Next: logFile}}
+		return log.GoogleEmitter{&log.Writer{Next: logFile}}
 	case "json":
-		return &log.JSONEmitter{log.Writer{Next: logFile}}
+		return log.JSONEmitter{&log.Writer{Next: logFile}}
 	case "json-k8s":
-		return &log.K8sJSONEmitter{log.Writer{Next: logFile}}
+		return log.K8sJSONEmitter{&log.Writer{Next: logFile}}
 	}
 	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
 	panic("unreachable")
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index c95d50294..035dcd3e3 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -13,6 +13,7 @@ go_library(
         "//runsc:__subpackages__",
     ],
     deps = [
+        "//pkg/cleanup",
         "//pkg/control/client",
         "//pkg/control/server",
         "//pkg/log",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 99e143696..209bfdb20 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -21,7 +21,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"strings"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -63,7 +62,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 		// Build the path to the net namespace of the sandbox process.
 		// This is what we will copy.
 		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
-		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels); err != nil {
+		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels, conf.QDisc); err != nil {
 			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
 		}
 	case boot.NetworkHost:
@@ -75,30 +74,8 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 }
 
 func createDefaultLoopbackInterface(conn *urpc.Client) error {
-	link := boot.LoopbackLink{
-		Name: "lo",
-		Addresses: []net.IP{
-			net.IP("\x7f\x00\x00\x01"),
-			net.IPv6loopback,
-		},
-		Routes: []boot.Route{
-			{
-				Destination: net.IPNet{
-
-					IP:   net.IPv4(0x7f, 0, 0, 0),
-					Mask: net.IPv4Mask(0xff, 0, 0, 0),
-				},
-			},
-			{
-				Destination: net.IPNet{
-					IP:   net.IPv6loopback,
-					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
-				},
-			},
-		},
-	}
 	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
-		LoopbackLinks: []boot.LoopbackLink{link},
+		LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink},
 	}, nil); err != nil {
 		return fmt.Errorf("creating loopback link and routes: %v", err)
 	}
@@ -138,7 +115,7 @@ func isRootNS() (bool, error) {
 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
 // net namespace with the given path, creates them in the sandbox, and removes
 // them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error {
 	// Join the network namespace that we will be copying.
 	restore, err := joinNetNS(nsPath)
 	if err != nil {
@@ -224,6 +201,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			MTU:         iface.MTU,
 			Routes:      routes,
 			NumChannels: numNetworkChannels,
+			QDisc:       qDisc,
 		}
 
 		// Get the link for the interface.
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index ec72bdbfd..6e1a2af25 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -18,16 +18,19 @@ package sandbox
 import (
 	"context"
 	"fmt"
+	"io"
 	"math"
 	"os"
 	"os/exec"
 	"strconv"
+	"strings"
 	"syscall"
 	"time"
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/syndtr/gocapability/capability"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/control/client"
 	"gvisor.dev/gvisor/pkg/control/server"
 	"gvisor.dev/gvisor/pkg/log"
@@ -117,7 +120,7 @@ func New(conf *boot.Config, args *Args) (*Sandbox, error) {
 	s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
 	// The Cleanup object cleans up partially created sandboxes when an error
 	// occurs. Any errors occurring during cleanup itself are ignored.
-	c := specutils.MakeCleanup(func() {
+	c := cleanup.Make(func() {
 		err := s.destroy()
 		log.Warningf("error destroying sandbox: %v", err)
 	})
@@ -142,7 +145,19 @@ func New(conf *boot.Config, args *Args) (*Sandbox, error) {
 	// Wait until the sandbox has booted.
 	b := make([]byte, 1)
 	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
-		return nil, fmt.Errorf("waiting for sandbox to start: %v", err)
+		err := fmt.Errorf("waiting for sandbox to start: %v", err)
+		// If the sandbox failed to start, it may be because the binary
+		// permissions were incorrect. Check the bits and return a more helpful
+		// error message.
+		//
+		// NOTE: The error message is checked because error types are lost over
+		// rpc calls.
+		if strings.Contains(err.Error(), io.EOF.Error()) {
+			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
+				return nil, fmt.Errorf("%v: %v", err, permsErr)
+			}
+		}
+		return nil, err
 	}
 
 	c.Release()
@@ -369,8 +384,24 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD))
 		nextFD++
 	}
+	if conf.PanicLog != "" {
+		test := ""
+		if len(conf.TestOnlyTestNameEnv) != 0 {
+			// Fetch test name if one is provided and the test only flag was set.
+			if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+				test = t
+			}
+		}
 
-	cmd.Args = append(cmd.Args, "--panic-signal="+strconv.Itoa(int(syscall.SIGTERM)))
+		panicLogFile, err := specutils.DebugLogFile(conf.PanicLog, "panic", test)
+		if err != nil {
+			return fmt.Errorf("opening debug log file in %q: %v", conf.PanicLog, err)
+		}
+		defer panicLogFile.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, panicLogFile)
+		cmd.Args = append(cmd.Args, "--panic-log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
 
 	// Add the "boot" command to the args.
 	//
@@ -416,9 +447,13 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
-	// If the platform needs a device FD we must pass it in.
-	if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil {
+	gPlatform, err := platform.Lookup(conf.Platform)
+	if err != nil {
 		return err
+	}
+
+	if deviceFile, err := gPlatform.OpenDevice(); err != nil {
+		return fmt.Errorf("opening device file for platform %q: %v", gPlatform, err)
 	} else if deviceFile != nil {
 		defer deviceFile.Close()
 		cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile)
@@ -426,6 +461,12 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
+	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
+	// isn't set.
+	if conf.Platform == "kvm" {
+		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
+	}
+
 	// The current process' stdio must be passed to the application via the
 	// --stdio-fds flag. The stdio of the sandbox process itself must not
 	// be connected to the same FDs, otherwise we risk leaking sandbox
@@ -503,7 +544,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		{Type: specs.UTSNamespace},
 	}
 
-	if conf.Platform == platforms.Ptrace {
+	if gPlatform.Requirements().RequiresCurrentPIDNS {
 		// TODO(b/75837838): Also set a new PID namespace so that we limit
 		// access to other host processes.
 		log.Infof("Sandbox will be started in the current PID namespace")
@@ -564,45 +605,32 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
 			cmd.Args = append(cmd.Args, "--setup-root")
 
+			const nobody = 65534
 			if conf.Rootless {
-				log.Infof("Rootless mode: sandbox will run as root inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
+				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
 				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
 					{
-						ContainerID: 0,
+						ContainerID: nobody,
 						HostID:      os.Getuid(),
 						Size:        1,
 					},
 				}
 				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
 					{
-						ContainerID: 0,
+						ContainerID: nobody,
 						HostID:      os.Getgid(),
 						Size:        1,
 					},
 				}
-				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
 
 			} else {
 				// Map nobody in the new namespace to nobody in the parent namespace.
 				//
 				// A sandbox process will construct an empty
-				// root for itself, so it has to have the CAP_SYS_ADMIN
-				// capability.
-				//
-				// FIXME(b/122554829): The current implementations of
-				// os/exec doesn't allow to set ambient capabilities if
-				// a process is started in a new user namespace. As a
-				// workaround, we start the sandbox process with the 0
-				// UID and then it constructs a chroot and sets UID to
-				// nobody.  https://github.com/golang/go/issues/2315
-				const nobody = 65534
+				// root for itself, so it has to have
+				// CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
 				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
 					{
-						ContainerID: 0,
-						HostID:      nobody - 1,
-						Size:        1,
-					},
-					{
 						ContainerID: nobody,
 						HostID:      nobody,
 						Size:        1,
@@ -615,11 +643,11 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 						Size:        1,
 					},
 				}
-
-				// Set credentials to run as user and group nobody.
-				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: nobody}
 			}
 
+			// Set credentials to run as user and group nobody.
+			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
+			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, uintptr(capability.CAP_SYS_ADMIN), uintptr(capability.CAP_SYS_CHROOT))
 		} else {
 			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
 		}
@@ -677,6 +705,13 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 
+	if args.Attached {
+		// Kill sandbox if parent process exits in attached mode.
+		cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL
+		// Tells boot that any process it creates must have pdeathsig set.
+		cmd.Args = append(cmd.Args, "--attached")
+	}
+
 	// Add container as the last argument.
 	cmd.Args = append(cmd.Args, s.ID)
 
@@ -685,15 +720,22 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		log.Debugf("Donating FD %d: %q", i+3, f.Name())
 	}
 
-	if args.Attached {
-		// Kill sandbox if parent process exits in attached mode.
-		cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL
-	}
-
 	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
 	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
 	if err := specutils.StartInNS(cmd, nss); err != nil {
-		return fmt.Errorf("Sandbox: %v", err)
+		err := fmt.Errorf("starting sandbox: %v", err)
+		// If the sandbox failed to start, it may be because the binary
+		// permissions were incorrect. Check the bits and return a more helpful
+		// error message.
+		//
+		// NOTE: The error message is checked because error types are lost over
+		// rpc calls.
+		if strings.Contains(err.Error(), syscall.EACCES.Error()) {
+			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
+				return fmt.Errorf("%v: %v", err, permsErr)
+			}
+		}
+		return err
 	}
 	s.child = true
 	s.Pid = cmd.Process.Pid
@@ -972,6 +1014,66 @@ func (s *Sandbox) StopCPUProfile() error {
 	return nil
 }
 
+// GoroutineProfile writes a goroutine profile to the given file.
+func (s *Sandbox) GoroutineProfile(f *os.File) error {
+	log.Debugf("Goroutine profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// BlockProfile writes a block profile to the given file.
+func (s *Sandbox) BlockProfile(f *os.File) error {
+	log.Debugf("Block profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.BlockProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q block profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// MutexProfile writes a mutex profile to the given file.
+func (s *Sandbox) MutexProfile(f *os.File) error {
+	log.Debugf("Mutex profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	opts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{f},
+		},
+	}
+	if err := conn.Call(boot.MutexProfile, &opts, nil); err != nil {
+		return fmt.Errorf("getting sandbox %q mutex profile: %v", s.ID, err)
+	}
+	return nil
+}
+
 // StartTrace start trace  writing to the given file.
 func (s *Sandbox) StartTrace(f *os.File) error {
 	log.Debugf("Trace start %q", s.ID)
@@ -1096,3 +1198,31 @@ func deviceFileForPlatform(name string) (*os.File, error) {
 	}
 	return f, nil
 }
+
+// checkBinaryPermissions verifies that the required binary bits are set on
+// the runsc executable.
+func checkBinaryPermissions(conf *boot.Config) error {
+	// All platforms need the other exe bit
+	neededBits := os.FileMode(0001)
+	if conf.Platform == platforms.Ptrace {
+		// Ptrace needs the other read bit
+		neededBits |= os.FileMode(0004)
+	}
+
+	exePath, err := os.Executable()
+	if err != nil {
+		return fmt.Errorf("getting exe path: %v", err)
+	}
+
+	// Check the permissions of the runsc binary and print an error if it
+	// doesn't match expectations.
+	info, err := os.Stat(exePath)
+	if err != nil {
+		return fmt.Errorf("stat file: %v", err)
+	}
+
+	if info.Mode().Perm()&neededBits != neededBits {
+		return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
+	}
+	return nil
+}
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index c7dd3051c..23001d67c 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"os/signal"
 	"path/filepath"
 	"runtime"
 	"syscall"
@@ -252,13 +253,27 @@ func MaybeRunAsRoot() error {
 		},
 		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
 		GidMappingsEnableSetgroups: false,
+
+		// Make sure child is killed when the parent terminates.
+		Pdeathsig: syscall.SIGKILL,
 	}
 
 	cmd.Env = os.Environ()
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
-	if err := cmd.Run(); err != nil {
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("re-executing self: %w", err)
+	}
+	ch := make(chan os.Signal, 1)
+	signal.Notify(ch)
+	go func() {
+		for {
+			// Forward all signals to child process.
+			cmd.Process.Signal(<-ch)
+		}
+	}()
+	if err := cmd.Wait(); err != nil {
 		if exit, ok := err.(*exec.ExitError); ok {
 			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
 				os.Exit(ws.ExitStatus())
@@ -266,7 +281,7 @@ func MaybeRunAsRoot() error {
 			log.Warningf("No wait status provided, exiting with -1: %v", err)
 			os.Exit(-1)
 		}
-		return fmt.Errorf("re-executing self: %v", err)
+		return err
 	}
 	// Child completed with success.
 	os.Exit(0)
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index d3c2e4e78..f1fa573c5 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -92,6 +92,12 @@ func ValidateSpec(spec *specs.Spec) error {
 		log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
 	}
 
+	// PR_SET_NO_NEW_PRIVS is assumed to always be set.
+	// See kernel.Task.updateCredsForExecLocked.
+	if !spec.Process.NoNewPrivileges {
+		log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
+	}
+
 	// TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
 	if spec.Linux != nil && spec.Linux.Seccomp != nil {
 		log.Warningf("Seccomp spec is being ignored")
@@ -438,36 +444,6 @@ func ContainsStr(strs []string, str string) bool {
 	return false
 }
 
-// Cleanup allows defers to be aborted when cleanup needs to happen
-// conditionally. Usage:
-// c := MakeCleanup(func() { f.Close() })
-// defer c.Clean() // any failure before release is called will close the file.
-// ...
-// c.Release() // on success, aborts closing the file and return it.
-// return f
-type Cleanup struct {
-	clean func()
-}
-
-// MakeCleanup creates a new Cleanup object.
-func MakeCleanup(f func()) Cleanup {
-	return Cleanup{clean: f}
-}
-
-// Clean calls the cleanup function.
-func (c *Cleanup) Clean() {
-	if c.clean != nil {
-		c.clean()
-		c.clean = nil
-	}
-}
-
-// Release releases the cleanup from its duties, i.e. cleanup function is not
-// called after this point.
-func (c *Cleanup) Release() {
-	c.clean = nil
-}
-
 // RetryEintr retries the function until an error different than EINTR is
 // returned.
 func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
@@ -528,3 +504,8 @@ func EnvVar(env []string, name string) (string, bool) {
 	}
 	return "", false
 }
+
+// FaqErrorMsg returns an error message pointing to the FAQ.
+func FaqErrorMsg(anchor, msg string) string {
+	return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor)
+}
diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh
new file mode 100755
index 000000000..e0f6df438
--- /dev/null
+++ b/scripts/benchmark.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# gcloud may be installed as a "snap". If it is, include it in PATH.
+declare -r snap="/snap/bin"
+if [[ -d "${snap}" ]]; then
+  export PATH="${PATH}:${snap}"
+fi
+
+# Make sure we can find gcloud and exit if not.
+which gcloud
+
+# Exporting for subprocesses as GCP APIs and tools check this environmental
+# variable for authentication.
+export GOOGLE_APPLICATION_CREDENTIALS="${KOKORO_KEYSTORE_DIR}/${GCLOUD_CREDENTIALS}"
+
+gcloud auth activate-service-account \
+   --key-file "${GOOGLE_APPLICATION_CREDENTIALS}"
+
+gcloud config set project ${PROJECT}
+gcloud config set compute/zone ${ZONE}
+
+bazel run //benchmarks:benchmarks -- \
+  --verbose \
+  run-gcp \
+  "(startup|absl)" \
+  --internal \
+  --runtime=runc \
+  --runtime=runsc \
+  --installers=head
diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh
deleted file mode 100755
index 6b9065b07..000000000
--- a/scripts/benchmarks.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#!/usr/bin/env bash
-
-if [ "$#" -lt "1" ]; then
-  echo "usage: $0 <--mock |--env=<filename>> ..."
-  echo "example: $0 --mock --runs=8"
-  exit 1
-fi
-
-source $(dirname $0)/common.sh
-
-readonly TIMESTAMP=`date "+%Y%m%d-%H%M%S"`
-readonly OUTDIR="$(mktemp --tmpdir -d run-${TIMESTAMP}-XXX)"
-readonly DEFAULT_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm"
-readonly ALL_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm"
-
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv"
-cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio.csv" && rm "${OUTDIR}/tmp_fio.csv"
-
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio-tmpfs.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv"
-cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio-tmpfs.csv" && rm "${OUTDIR}/tmp_fio.csv"
-
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} startup --count=50  >  "${OUTDIR}/startup.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} density > "${OUTDIR}/density.csv"
-
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.cpu --threads=1 --max_prime=50000 --options='--max-time=5' > "${OUTDIR}/sysbench-cpu.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.memory --threads=1 --options='--memory-block-size=1M --memory-total-size=500G'  > "${OUTDIR}/sysbench-memory.csv"
-run //benchmarks:perf -- run "$@" ${ALL_RUNTIMES} syscall > "${OUTDIR}/syscall.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'network.(upload|download)' --runs=20 > "${OUTDIR}/iperf.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} ml.tensorflow > "${OUTDIR}/tensorflow.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} media.ffmpeg > "${OUTDIR}/ffmpeg.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin100k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd100k.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin10240k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd10240k.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} redis > "${OUTDIR}/redis.csv"
-run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'http.(ruby|node)' > "${OUTDIR}/applications.csv"
-
-echo "${OUTPUT}" && exit 0
diff --git a/scripts/build.sh b/scripts/build.sh
deleted file mode 100755
index 4c042af6c..000000000
--- a/scripts/build.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Install required packages for make_repository.sh et al.
-sudo apt-get update && sudo apt-get install -y dpkg-sig coreutils apt-utils xz-utils
-
-# Build runsc.
-runsc=$(build -c opt //runsc)
-
-# Build packages.
-pkgs=$(build -c opt //runsc:runsc-debian)
-
-# Stop here if we have no artifacts directory.
-[[ -v KOKORO_ARTIFACTS_DIR ]] || exit 0
-
-# install_raw installs raw artifacts.
-install_raw() {
-  mkdir -p "$1"
-  cp -f "${runsc}" "$1"/runsc
-  sha512sum "$1"/runsc | awk '{print $1 "  runsc"}' > "$1"/runsc.sha512
-}
-
-# Build a repository, if the key is available.
-#
-# Note that make_repository.sh script will install packages into the provided
-# root, but will output to stdout a directory that can be copied arbitrarily
-# into "${KOKORO_ARTIFACTS_DIR}"/dists/XXX. We do things this way because we
-# will copy the same repository structure into multiple locations, below.
-if [[ -v KOKORO_REPO_KEY ]]; then
-  repo=$(tools/make_repository.sh \
-          "${KOKORO_KEYSTORE_DIR}/${KOKORO_REPO_KEY}" \
-          gvisor-bot@google.com \
-          main \
-          "${KOKORO_ARTIFACTS_DIR}" \
-          ${pkgs})
-fi
-
-# install_repo installs a repository.
-#
-# Note that packages are already installed, as noted above.
-install_repo() {
-  if [[ -v repo ]]; then
-    rm -rf "$1" && mkdir -p "$(dirname "$1")" && cp -a "${repo}" "$1"
-  fi
-}
-
-# If nightly, install only nightly artifacts.
-if [[ "${KOKORO_BUILD_NIGHTLY:-false}" == "true" ]]; then
-  # The "latest" directory and current date.
-  stamp="$(date -Idate)"
-  install_raw  "${KOKORO_ARTIFACTS_DIR}/nightly/latest"
-  install_raw  "${KOKORO_ARTIFACTS_DIR}/nightly/${stamp}"
-  install_repo "${KOKORO_ARTIFACTS_DIR}/dists/nightly"
-else
-  # Is it a tagged release? Build that.
-  tags="$(git tag --points-at HEAD)"
-  if ! [[ -z "${tags}" ]]; then
-    # Note that a given commit can match any number of tags. We have to iterate
-    # through all possible tags and produce associated artifacts.
-    for tag in ${tags}; do
-      name=$(echo "${tag}" | cut -d'-' -f2)
-      base=$(echo "${name}" | cut -d'.' -f1)
-      install_raw  "${KOKORO_ARTIFACTS_DIR}/release/${name}"
-      install_raw  "${KOKORO_ARTIFACTS_DIR}/release/latest"
-      install_repo "${KOKORO_ARTIFACTS_DIR}/dists/release"
-      install_repo "${KOKORO_ARTIFACTS_DIR}/dists/${base}"
-    done
-  else
-    # Otherwise, assume it is a raw master commit.
-    install_raw  "${KOKORO_ARTIFACTS_DIR}/master/latest"
-    install_repo "${KOKORO_ARTIFACTS_DIR}/dists/master"
-  fi
-fi
diff --git a/scripts/common_build.sh b/scripts/common_build.sh
index ae8b67383..4fe1067d2 100755
--- a/scripts/common_build.sh
+++ b/scripts/common_build.sh
@@ -70,7 +70,9 @@ function collect_logs() {
     for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
       junitparser merge `find $d -name test.xml` $d/test.xml
       cat $d/shard_*_of_*/test.log > $d/test.log
-      ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip
+      if ls -ld $d/shard_*_of_*/test.outputs 2>/dev/null; then
+        zip -r -1 "$d/outputs.zip" $d/shard_*_of_*/test.outputs
+      fi
     done
     find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
     # Move test logs to Kokoro directory. tar is used to conveniently perform
@@ -90,7 +92,13 @@ function collect_logs() {
           echo "    gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp"
           echo "    https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}"
         fi
-        tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" .
+        time tar \
+          --verbose \
+          --create \
+          --gzip \
+          --file="${KOKORO_ARTIFACTS_DIR}/${archive}" \
+          --directory "${RUNSC_LOGS_DIR}" \
+          .
       fi
     fi
   fi
diff --git a/scripts/dev.sh b/scripts/dev.sh
index 6238b4d0b..a9107f33e 100755
--- a/scripts/dev.sh
+++ b/scripts/dev.sh
@@ -66,6 +66,7 @@ if [[ ${REFRESH} -eq 0 ]]; then
 else
   mkdir -p "$(dirname ${RUNSC_BIN})"
   cp -f ${OUTPUT} "${RUNSC_BIN}"
+  chmod a+rx "${RUNSC_BIN}"
 
   echo
   echo "Runtime ${RUNTIME} refreshed."
diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh
index 72ba05260..dce0a4085 100755
--- a/scripts/docker_tests.sh
+++ b/scripts/docker_tests.sh
@@ -16,5 +16,10 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 install_runsc_for_test docker
 test_runsc //test/image:image_test //test/e2e:integration_test
+
+install_runsc_for_test docker --vfs2
+test_runsc //test/image:image_test --test_filter=.*TestHelloWorld
diff --git a/scripts/hostnet_tests.sh b/scripts/hostnet_tests.sh
index 41298293d..992db50dd 100755
--- a/scripts/hostnet_tests.sh
+++ b/scripts/hostnet_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Install the runtime and perform basic tests.
 install_runsc_for_test hostnet --network=host
 test_runsc --test_arg=-checkpoint=false //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
index 3069d8628..2a8c24907 100755
--- a/scripts/iptables_tests.sh
+++ b/scripts/iptables_tests.sh
@@ -16,12 +16,8 @@
 
 source $(dirname $0)/common.sh
 
-install_runsc_for_test iptables
+make load-iptables
 
-# Build the docker image for the test.
-run //test/iptables/runner-image --norun
-
-# TODO(gvisor.dev/issue/170): Also test this on runsc once iptables are better
-# supported
-test //test/iptables:iptables_test "--test_arg=--runtime=runc" \
-  "--test_arg=--image=bazel/test/iptables/runner:runner-image"
+install_runsc_for_test iptables --net-raw
+test //test/iptables:iptables_test "--test_arg=--runtime=runc"
+test //test/iptables:iptables_test "--test_arg=--runtime=${RUNTIME}"
diff --git a/scripts/kvm_tests.sh b/scripts/kvm_tests.sh
index 5662401df..619571c74 100755
--- a/scripts/kvm_tests.sh
+++ b/scripts/kvm_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Ensure that KVM is loaded, and we can use it.
 (lsmod | grep -E '^(kvm_intel|kvm_amd)') || sudo modprobe kvm
 sudo chmod a+rw /dev/kvm
diff --git a/scripts/make_tests.sh b/scripts/make_tests.sh
index 79426756d..dbf1bba77 100755
--- a/scripts/make_tests.sh
+++ b/scripts/make_tests.sh
@@ -16,10 +16,5 @@
 
 source $(dirname $0)/common.sh
 
-top_level=$(git rev-parse --show-toplevel 2>/dev/null)
-[[ $? -eq 0 ]] && cd "${top_level}" || exit 1
-
-make
 make runsc
-make BAZEL_OPTIONS="build //..." bazel
 make bazel-shutdown
diff --git a/scripts/overlay_tests.sh b/scripts/overlay_tests.sh
index 2a1f12c0b..448864953 100755
--- a/scripts/overlay_tests.sh
+++ b/scripts/overlay_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Install the runtime and perform basic tests.
 install_runsc_for_test overlay --overlay
 test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/packetdrill_tests.sh b/scripts/packetdrill_tests.sh
index fc6bef79c..f0fc444c8 100755
--- a/scripts/packetdrill_tests.sh
+++ b/scripts/packetdrill_tests.sh
@@ -16,5 +16,7 @@
 
 source $(dirname $0)/common.sh
 
+make load-packetdrill
+
 install_runsc_for_test runsc-d
 test_runsc $(bazel query "attr(tags, manual, tests(//test/packetdrill/...))")
diff --git a/scripts/packetimpact_tests.sh b/scripts/packetimpact_tests.sh
new file mode 100755
index 000000000..17fc43f27
--- /dev/null
+++ b/scripts/packetimpact_tests.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+make load-packetimpact
+
+install_runsc_for_test runsc-d
+test_runsc $(bazel query "attr(tags, packetimpact, tests(//test/packetimpact/...))")
diff --git a/scripts/release.sh b/scripts/release.sh
deleted file mode 100755
index 091abf87f..000000000
--- a/scripts/release.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Tag a release only if provided.
-if ! [[ -v KOKORO_RELEASE_COMMIT ]]; then
-  echo "No KOKORO_RELEASE_COMMIT provided." >&2
-  exit 1
-fi
-if ! [[ -v KOKORO_RELEASE_TAG ]]; then
-  echo "No KOKORO_RELEASE_TAG provided." >&2
-  exit 1
-fi
-
-# Unless an explicit releaser is provided, use the bot e-mail.
-declare -r KOKORO_RELEASE_AUTHOR=${KOKORO_RELEASE_AUTHOR:-gvisor-bot}
-declare -r EMAIL=${EMAIL:-${KOKORO_RELEASE_AUTHOR}@google.com}
-
-# Ensure we have an appropriate configuration for the tag.
-git config --get user.name || git config user.name "gVisor-bot"
-git config --get user.email || git config user.email "${EMAIL}"
-
-# Provide a credential if available.
-if [[ -v KOKORO_GITHUB_ACCESS_TOKEN ]]; then
-  git config --global credential.helper cache
-  git credential approve <<EOF
-protocol=https
-host=github.com
-username=$(cat "${KOKORO_KEYSTORE_DIR}/${KOKORO_GITHUB_ACCESS_TOKEN}")
-password=x-oauth-basic
-EOF
-fi
-
-# Run the release tool, which pushes to the origin repository.
-tools/tag_release.sh "${KOKORO_RELEASE_COMMIT}" "${KOKORO_RELEASE_TAG}"
diff --git a/scripts/root_tests.sh b/scripts/root_tests.sh
index 4e4fcc76b..d629bf2aa 100755
--- a/scripts/root_tests.sh
+++ b/scripts/root_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Reinstall the latest containerd shim.
 declare -r base="https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim"
 declare -r latest=$(mktemp --tmpdir gvisor-containerd-shim-latest.XXXXXX)
@@ -28,4 +30,3 @@ sudo mv ${shim_path} /usr/local/bin/gvisor-containerd-shim
 # Run the tests that require root.
 install_runsc_for_test root
 run_as_root //test/root:root_test --runtime=${RUNTIME}
-
diff --git a/kokoro/runtime_tests/runtime_tests.sh b/scripts/runtime_tests.sh
index 73a58f806..350a59f7c 100755
--- a/kokoro/runtime_tests/runtime_tests.sh
+++ b/scripts/runtime_tests.sh
@@ -14,14 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Run in the root of the repo.
-cd "$(dirname "$0")"
-cd "$(git rev-parse --show-toplevel)"
-
-source scripts/common.sh
+source $(dirname $0)/common.sh
 
+# Check that a runtime is provided.
 if [ ! -v RUNTIME_TEST_NAME ]; then
-  echo 'Must set $RUNTIME_TEST_NAME' >&2
+  echo "Must set $RUNTIME_TEST_NAME" >&2
   exit 1
 fi
 
diff --git a/scripts/swgso_tests.sh b/scripts/swgso_tests.sh
index 0de2df1d2..c67f2fe5c 100755
--- a/scripts/swgso_tests.sh
+++ b/scripts/swgso_tests.sh
@@ -16,6 +16,8 @@
 
 source $(dirname $0)/common.sh
 
+make load-all-images
+
 # Install the runtime and perform basic tests.
 install_runsc_for_test swgso --software-gso=true --gso=false
 test_runsc //test/image:image_test //test/e2e:integration_test
diff --git a/scripts/syscall_kvm_tests.sh b/scripts/syscall_kvm_tests.sh
index de85daa5a..0e5d86727 100755
--- a/scripts/syscall_kvm_tests.sh
+++ b/scripts/syscall_kvm_tests.sh
@@ -16,6 +16,5 @@
 
 source $(dirname $0)/common.sh
 
-# TODO(b/112165693): "test --test_tag_filters=runsc_kvm" can be used
-# when the "manual" tag will be removed for kvm tests.
-test `bazel query "attr(tags, runsc_kvm, tests(//test/syscalls/...))"`
+# Run all ptrace-variants of the system call tests.
+test --test_tag_filters=runsc_kvm //test/syscalls/...
diff --git a/runsc/container/test_app/BUILD b/test/cmd/test_app/BUILD
index 0defbd9fc..98ba5a3d9 100644
--- a/runsc/container/test_app/BUILD
+++ b/test/cmd/test_app/BUILD
@@ -12,9 +12,9 @@ go_binary(
     pure = True,
     visibility = ["//runsc/container:__pkg__"],
     deps = [
+        "//pkg/test/testutil",
         "//pkg/unet",
         "//runsc/flag",
-        "//runsc/testutil",
         "@com_github_google_subcommands//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
     ],
diff --git a/runsc/container/test_app/fds.go b/test/cmd/test_app/fds.go
index 2a146a2c3..a7658eefd 100644
--- a/runsc/container/test_app/fds.go
+++ b/test/cmd/test_app/fds.go
@@ -22,9 +22,9 @@ import (
 	"time"
 
 	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/flag"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 const fileContents = "foobarbaz"
diff --git a/runsc/container/test_app/test_app.go b/test/cmd/test_app/test_app.go
index 01c47c79f..3ba4f38f8 100644
--- a/runsc/container/test_app/test_app.go
+++ b/test/cmd/test_app/test_app.go
@@ -32,8 +32,8 @@ import (
 
 	"github.com/google/subcommands"
 	"github.com/kr/pty"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/flag"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func main() {
@@ -96,7 +96,7 @@ func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 
 	listener, err := net.Listen("unix", c.socketPath)
 	if err != nil {
-		log.Fatal("error listening on socket %q:", c.socketPath, err)
+		log.Fatalf("error listening on socket %q: %v", c.socketPath, err)
 	}
 
 	go server(listener, outputFile)
diff --git a/test/e2e/BUILD b/test/e2e/BUILD
index 76e04f878..44cce0e3b 100644
--- a/test/e2e/BUILD
+++ b/test/e2e/BUILD
@@ -20,9 +20,9 @@ go_test(
     deps = [
         "//pkg/abi/linux",
         "//pkg/bits",
-        "//runsc/dockerutil",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
         "//runsc/specutils",
-        "//runsc/testutil",
     ],
 )
 
diff --git a/test/e2e/exec_test.go b/test/e2e/exec_test.go
index 4074d2285..6a63b1232 100644
--- a/test/e2e/exec_test.go
+++ b/test/e2e/exec_test.go
@@ -23,6 +23,8 @@ package integration
 
 import (
 	"fmt"
+	"os"
+	"os/exec"
 	"strconv"
 	"strings"
 	"syscall"
@@ -31,23 +33,23 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bits"
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 // Test that exec uses the exact same capability set as the container.
 func TestExecCapabilities(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-capabilities-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	if err := d.Run("alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Check that capability.
 	matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
 	if err != nil {
 		t.Fatalf("WaitForOutputSubmatch() timeout: %v", err)
@@ -59,7 +61,7 @@ func TestExecCapabilities(t *testing.T) {
 	t.Log("Root capabilities:", want)
 
 	// Now check that exec'd process capabilities match the root.
-	got, err := d.Exec("grep", "CapEff:", "/proc/self/status")
+	got, err := d.Exec(dockerutil.RunOpts{}, "grep", "CapEff:", "/proc/self/status")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -72,16 +74,16 @@ func TestExecCapabilities(t *testing.T) {
 // Test that 'exec --privileged' adds all capabilities, except for CAP_NET_RAW
 // which is removed from the container when --net-raw=false.
 func TestExecPrivileged(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-privileged-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container with all capabilities dropped.
-	if err := d.Run("--cap-drop=all", "alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image:   "basic/alpine",
+		CapDrop: []string{"all"},
+	}, "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Check that all capabilities where dropped from container.
 	matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
@@ -100,9 +102,11 @@ func TestExecPrivileged(t *testing.T) {
 		t.Fatalf("Container should have no capabilities: %x", containerCaps)
 	}
 
-	// Check that 'exec --privileged' adds all capabilities, except
-	// for CAP_NET_RAW.
-	got, err := d.ExecWithFlags([]string{"--privileged"}, "grep", "CapEff:", "/proc/self/status")
+	// Check that 'exec --privileged' adds all capabilities, except for
+	// CAP_NET_RAW.
+	got, err := d.Exec(dockerutil.RunOpts{
+		Privileged: true,
+	}, "grep", "CapEff:", "/proc/self/status")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -114,97 +118,99 @@ func TestExecPrivileged(t *testing.T) {
 }
 
 func TestExecJobControl(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-job-control-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	if err := d.Run("alpine", "sleep", "1000"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Exec 'sh' with an attached pty.
-	cmd, ptmx, err := d.ExecWithTerminal("sh")
-	if err != nil {
+	if _, err := d.Exec(dockerutil.RunOpts{
+		Pty: func(cmd *exec.Cmd, ptmx *os.File) {
+			// Call "sleep 100 | cat" in the shell. We pipe to cat
+			// so that there will be two processes in the
+			// foreground process group.
+			if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// Give shell a few seconds to start executing the sleep.
+			time.Sleep(2 * time.Second)
+
+			// Send a ^C to the pty, which should kill sleep and
+			// cat, but not the shell.  \x03 is ASCII "end of
+			// text", which is the same as ^C.
+			if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// The shell should still be alive at this point. Sleep
+			// should have exited with code 2+128=130. We'll exit
+			// with 10 plus that number, so that we can be sure
+			// that the shell did not get signalled.
+			if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// Exec process should exit with code 10+130=140.
+			ps, err := cmd.Process.Wait()
+			if err != nil {
+				t.Fatalf("error waiting for exec process: %v", err)
+			}
+			ws := ps.Sys().(syscall.WaitStatus)
+			if !ws.Exited() {
+				t.Errorf("ws.Exited got false, want true")
+			}
+			if got, want := ws.ExitStatus(), 140; got != want {
+				t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
+			}
+		},
+	}, "sh"); err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
-	defer ptmx.Close()
-
-	// Call "sleep 100 | cat" in the shell.  We pipe to cat so that there
-	// will be two processes in the foreground process group.
-	if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Give shell a few seconds to start executing the sleep.
-	time.Sleep(2 * time.Second)
-
-	// Send a ^C to the pty, which should kill sleep and cat, but not the
-	// shell.  \x03 is ASCII "end of text", which is the same as ^C.
-	if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// The shell should still be alive at this point. Sleep should have
-	// exited with code 2+128=130. We'll exit with 10 plus that number, so
-	// that we can be sure that the shell did not get signalled.
-	if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Exec process should exit with code 10+130=140.
-	ps, err := cmd.Process.Wait()
-	if err != nil {
-		t.Fatalf("error waiting for exec process: %v", err)
-	}
-	ws := ps.Sys().(syscall.WaitStatus)
-	if !ws.Exited() {
-		t.Errorf("ws.Exited got false, want true")
-	}
-	if got, want := ws.ExitStatus(), 140; got != want {
-		t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
-	}
 }
 
 // Test that failure to exec returns proper error message.
 func TestExecError(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-error-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	if err := d.Run("alpine", "sleep", "1000"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
-	_, err := d.Exec("no_can_find")
+	// Attempt to exec a binary that doesn't exist.
+	out, err := d.Exec(dockerutil.RunOpts{}, "no_can_find")
 	if err == nil {
 		t.Fatalf("docker exec didn't fail")
 	}
-	if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) {
-		t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want)
+	if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(out, want) {
+		t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", out, want)
 	}
 }
 
 // Test that exec inherits environment from run.
 func TestExecEnv(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-env-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container with env FOO=BAR.
-	if err := d.Run("-e", "FOO=BAR", "alpine", "sleep", "1000"); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Env:   []string{"FOO=BAR"},
+	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Exec "echo $FOO".
-	got, err := d.Exec("/bin/sh", "-c", "echo $FOO")
+	got, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "echo $FOO")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -216,17 +222,19 @@ func TestExecEnv(t *testing.T) {
 // TestRunEnvHasHome tests that run always has HOME environment set.
 func TestRunEnvHasHome(t *testing.T) {
 	// Base alpine image does not have any environment variables set.
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("run-env-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Exec "echo $HOME". The 'bin' user's home dir is '/bin'.
-	got, err := d.RunFg("--user", "bin", "alpine", "/bin/sh", "-c", "echo $HOME")
+	got, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		User:  "bin",
+	}, "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
+
+	// Check that the directory matches.
 	if got, want := strings.TrimSpace(got), "/bin"; got != want {
 		t.Errorf("bad output from 'docker run'. Got %q; Want %q.", got, want)
 	}
@@ -235,28 +243,17 @@ func TestRunEnvHasHome(t *testing.T) {
 // Test that exec always has HOME environment set, even when not set in run.
 func TestExecEnvHasHome(t *testing.T) {
 	// Base alpine image does not have any environment variables set.
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("exec-env-home-test")
-
-	// We will check that HOME is set for root user, and also for a new
-	// non-root user we will create.
-	newUID := 1234
-	newHome := "/foo/bar"
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
-	// Create a new user with a home directory, and then sleep.
-	script := fmt.Sprintf(`
-	mkdir -p -m 777 %s && \
-	adduser foo -D -u %d -h %s && \
-	sleep 1000`, newHome, newUID, newHome)
-	if err := d.Run("alpine", "/bin/sh", "-c", script); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "1000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Exec "echo $HOME", and expect to see "/root".
-	got, err := d.Exec("/bin/sh", "-c", "echo $HOME")
+	got, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
@@ -264,8 +261,18 @@ func TestExecEnvHasHome(t *testing.T) {
 		t.Errorf("wanted exec output to contain %q, got %q", want, got)
 	}
 
-	// Execute the same as uid 123 and expect newHome.
-	got, err = d.ExecAsUser(strconv.Itoa(newUID), "/bin/sh", "-c", "echo $HOME")
+	// Create a new user with a home directory.
+	newUID := 1234
+	newHome := "/foo/bar"
+	cmd := fmt.Sprintf("mkdir -p -m 777 %q && adduser foo -D -u %d -h %q", newHome, newUID, newHome)
+	if _, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", cmd); err != nil {
+		t.Fatalf("docker exec failed: %v", err)
+	}
+
+	// Execute the same as the new user and expect newHome.
+	got, err = d.Exec(dockerutil.RunOpts{
+		User: strconv.Itoa(newUID),
+	}, "/bin/sh", "-c", "echo $HOME")
 	if err != nil {
 		t.Fatalf("docker exec failed: %v", err)
 	}
diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
index 28064e557..91c956e10 100644
--- a/test/e2e/integration_test.go
+++ b/test/e2e/integration_test.go
@@ -27,14 +27,15 @@ import (
 	"net"
 	"net/http"
 	"os"
+	"os/exec"
 	"strconv"
 	"strings"
 	"syscall"
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 // httpRequestSucceeds sends a request to a given url and checks that the status is OK.
@@ -53,65 +54,66 @@ func httpRequestSucceeds(client http.Client, server string, port int) error {
 
 // TestLifeCycle tests a basic Create/Start/Stop docker container life cycle.
 func TestLifeCycle(t *testing.T) {
-	if err := dockerutil.Pull("nginx"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("lifecycle-test")
-	if err := d.Create("-p", "80", "nginx"); err != nil {
-		t.Fatal("docker create failed:", err)
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Start the container.
+	if err := d.Create(dockerutil.RunOpts{
+		Image: "basic/nginx",
+		Ports: []int{80},
+	}); err != nil {
+		t.Fatalf("docker create failed: %v", err)
 	}
 	if err := d.Start(); err != nil {
-		d.CleanUp()
-		t.Fatal("docker start failed:", err)
+		t.Fatalf("docker start failed: %v", err)
 	}
 
-	// Test that container is working
+	// Test that container is working.
 	port, err := d.FindPort(80)
 	if err != nil {
-		t.Fatal("docker.FindPort(80) failed: ", err)
+		t.Fatalf("docker.FindPort(80) failed: %v", err)
 	}
 	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 	client := http.Client{Timeout: time.Duration(2 * time.Second)}
 	if err := httpRequestSucceeds(client, "localhost", port); err != nil {
-		t.Error("http request failed:", err)
+		t.Errorf("http request failed: %v", err)
 	}
 
 	if err := d.Stop(); err != nil {
-		d.CleanUp()
-		t.Fatal("docker stop failed:", err)
+		t.Fatalf("docker stop failed: %v", err)
 	}
 	if err := d.Remove(); err != nil {
-		t.Fatal("docker rm failed:", err)
+		t.Fatalf("docker rm failed: %v", err)
 	}
 }
 
 func TestPauseResume(t *testing.T) {
-	const img = "gcr.io/gvisor-presubmit/python-hello"
 	if !testutil.IsCheckpointSupported() {
-		t.Log("Checkpoint is not supported, skipping test.")
-		return
+		t.Skip("Checkpoint is not supported.")
 	}
 
-	if err := dockerutil.Pull(img); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("pause-resume-test")
-	if err := d.Run("-p", "8080", img); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Start the container.
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/python",
+		Ports: []int{8080}, // See Dockerfile.
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 8080 is mapped to.
 	port, err := d.FindPort(8080)
 	if err != nil {
-		t.Fatal("docker.FindPort(8080) failed:", err)
+		t.Fatalf("docker.FindPort(8080) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
 	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check that container is working.
@@ -121,7 +123,7 @@ func TestPauseResume(t *testing.T) {
 	}
 
 	if err := d.Pause(); err != nil {
-		t.Fatal("docker pause failed:", err)
+		t.Fatalf("docker pause failed: %v", err)
 	}
 
 	// Check if container is paused.
@@ -137,12 +139,12 @@ func TestPauseResume(t *testing.T) {
 	}
 
 	if err := d.Unpause(); err != nil {
-		t.Fatal("docker unpause failed:", err)
+		t.Fatalf("docker unpause failed: %v", err)
 	}
 
 	// Wait until it's up and running.
 	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check if container is working again.
@@ -152,45 +154,43 @@ func TestPauseResume(t *testing.T) {
 }
 
 func TestCheckpointRestore(t *testing.T) {
-	const img = "gcr.io/gvisor-presubmit/python-hello"
 	if !testutil.IsCheckpointSupported() {
-		t.Log("Pause/resume is not supported, skipping test.")
-		return
+		t.Skip("Pause/resume is not supported.")
 	}
 
-	if err := dockerutil.Pull(img); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("save-restore-test")
-	if err := d.Run("-p", "8080", img); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Start the container.
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/python",
+		Ports: []int{8080}, // See Dockerfile.
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Create a snapshot.
 	if err := d.Checkpoint("test"); err != nil {
-		t.Fatal("docker checkpoint failed:", err)
+		t.Fatalf("docker checkpoint failed: %v", err)
 	}
-
 	if _, err := d.Wait(30 * time.Second); err != nil {
-		t.Fatal(err)
+		t.Fatalf("wait failed: %v", err)
 	}
 
-	// TODO(b/143498576): Remove after github.com/moby/moby/issues/38963 is fixed.
-	time.Sleep(1 * time.Second)
-
-	if err := d.Restore("test"); err != nil {
-		t.Fatal("docker restore failed:", err)
+	// TODO(b/143498576): Remove Poll after github.com/moby/moby/issues/38963 is fixed.
+	if err := testutil.Poll(func() error { return d.Restore("test") }, 15*time.Second); err != nil {
+		t.Fatalf("docker restore failed: %v", err)
 	}
 
 	// Find where port 8080 is mapped to.
 	port, err := d.FindPort(8080)
 	if err != nil {
-		t.Fatal("docker.FindPort(8080) failed:", err)
+		t.Fatalf("docker.FindPort(8080) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
 	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatal("WaitForHTTP() timeout:", err)
+		t.Fatalf("WaitForHTTP() timeout: %v", err)
 	}
 
 	// Check if container is working again.
@@ -202,26 +202,28 @@ func TestCheckpointRestore(t *testing.T) {
 
 // Create client and server that talk to each other using the local IP.
 func TestConnectToSelf(t *testing.T) {
-	d := dockerutil.MakeDocker("connect-to-self-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Creates server that replies "server" and exists. Sleeps at the end because
 	// 'docker exec' gets killed if the init process exists before it can finish.
-	if err := d.Run("ubuntu:trusty", "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil {
-		t.Fatal("docker run failed:", err)
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/ubuntu",
+	}, "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Finds IP address for host.
-	ip, err := d.Exec("/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'")
+	ip, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'")
 	if err != nil {
-		t.Fatal("docker exec failed:", err)
+		t.Fatalf("docker exec failed: %v", err)
 	}
 	ip = strings.TrimRight(ip, "\n")
 
 	// Runs client that sends "client" to the server and exits.
-	reply, err := d.Exec("/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip))
+	reply, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip))
 	if err != nil {
-		t.Fatal("docker exec failed:", err)
+		t.Fatalf("docker exec failed: %v", err)
 	}
 
 	// Ensure both client and server got the message from each other.
@@ -229,21 +231,24 @@ func TestConnectToSelf(t *testing.T) {
 		t.Errorf("Error on server, want: %q, got: %q", want, reply)
 	}
 	if _, err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil {
-		t.Fatal("docker.WaitForOutput(client) timeout:", err)
+		t.Fatalf("docker.WaitForOutput(client) timeout: %v", err)
 	}
 }
 
 func TestMemLimit(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("cgroup-test")
-	cmd := "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'"
-	out, err := d.RunFg("--memory=500MB", "alpine", "sh", "-c", cmd)
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// N.B. Because the size of the memory file may grow in large chunks,
+	// there is a minimum threshold of 1GB for the MemTotal figure.
+	allocMemory := 1024 * 1024
+	out, err := d.Run(dockerutil.RunOpts{
+		Image:  "basic/alpine",
+		Memory: allocMemory, // In kB.
+	}, "sh", "-c", "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'")
 	if err != nil {
-		t.Fatal("docker run failed:", err)
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Remove warning message that swap isn't present.
 	if strings.HasPrefix(out, "WARNING") {
@@ -254,27 +259,30 @@ func TestMemLimit(t *testing.T) {
 		out = lines[1]
 	}
 
+	// Ensure the memory matches what we want.
 	got, err := strconv.ParseUint(strings.TrimSpace(out), 10, 64)
 	if err != nil {
 		t.Fatalf("failed to parse %q: %v", out, err)
 	}
-	if want := uint64(500 * 1024); got != want {
+	if want := uint64(allocMemory); got != want {
 		t.Errorf("MemTotal got: %d, want: %d", got, want)
 	}
 }
 
 func TestNumCPU(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("cgroup-test")
-	cmd := "cat /proc/cpuinfo | grep 'processor.*:' | wc -l"
-	out, err := d.RunFg("--cpuset-cpus=0", "alpine", "sh", "-c", cmd)
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Read how many cores are in the container.
+	out, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Extra: []string{"--cpuset-cpus=0"},
+	}, "sh", "-c", "cat /proc/cpuinfo | grep 'processor.*:' | wc -l")
 	if err != nil {
-		t.Fatal("docker run failed:", err)
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Ensure it matches what we want.
 	got, err := strconv.Atoi(strings.TrimSpace(out))
 	if err != nil {
 		t.Fatalf("failed to parse %q: %v", out, err)
@@ -286,39 +294,39 @@ func TestNumCPU(t *testing.T) {
 
 // TestJobControl tests that job control characters are handled properly.
 func TestJobControl(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("job-control-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container with an attached PTY.
-	_, ptmx, err := d.RunWithPty("alpine", "sh")
-	if err != nil {
+	if _, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Pty: func(_ *exec.Cmd, ptmx *os.File) {
+			// Call "sleep 100" in the shell.
+			if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// Give shell a few seconds to start executing the sleep.
+			time.Sleep(2 * time.Second)
+
+			// Send a ^C to the pty, which should kill sleep, but
+			// not the shell.  \x03 is ASCII "end of text", which
+			// is the same as ^C.
+			if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+
+			// The shell should still be alive at this point. Sleep
+			// should have exited with code 2+128=130. We'll exit
+			// with 10 plus that number, so that we can be sure
+			// that the shell did not get signalled.
+			if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
+				t.Fatalf("error writing to pty: %v", err)
+			}
+		},
+	}, "sh"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer ptmx.Close()
-	defer d.CleanUp()
-
-	// Call "sleep 100" in the shell.
-	if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// Give shell a few seconds to start executing the sleep.
-	time.Sleep(2 * time.Second)
-
-	// Send a ^C to the pty, which should kill sleep, but not the shell.
-	// \x03 is ASCII "end of text", which is the same as ^C.
-	if _, err := ptmx.Write([]byte{'\x03'}); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
-
-	// The shell should still be alive at this point. Sleep should have
-	// exited with code 2+128=130. We'll exit with 10 plus that number, so
-	// that we can be sure that the shell did not get signalled.
-	if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil {
-		t.Fatalf("error writing to pty: %v", err)
-	}
 
 	// Wait for the container to exit.
 	got, err := d.Wait(5 * time.Second)
@@ -331,17 +339,69 @@ func TestJobControl(t *testing.T) {
 	}
 }
 
-// TestTmpFile checks that files inside '/tmp' are not overridden. In addition,
-// it checks that working dir is created if it doesn't exit.
+// TestWorkingDirCreation checks that working dir is created if it doesn't exit.
+func TestWorkingDirCreation(t *testing.T) {
+	for _, tc := range []struct {
+		name       string
+		workingDir string
+	}{
+		{name: "root", workingDir: "/foo"},
+		{name: "tmp", workingDir: "/tmp/foo"},
+	} {
+		for _, readonly := range []bool{true, false} {
+			name := tc.name
+			if readonly {
+				name += "-readonly"
+			}
+			t.Run(name, func(t *testing.T) {
+				d := dockerutil.MakeDocker(t)
+				defer d.CleanUp()
+
+				opts := dockerutil.RunOpts{
+					Image:    "basic/alpine",
+					WorkDir:  tc.workingDir,
+					ReadOnly: readonly,
+				}
+				got, err := d.Run(opts, "sh", "-c", "echo ${PWD}")
+				if err != nil {
+					t.Fatalf("docker run failed: %v", err)
+				}
+				if want := tc.workingDir + "\n"; want != got {
+					t.Errorf("invalid working dir, want: %q, got: %q", want, got)
+				}
+			})
+		}
+	}
+}
+
+// TestTmpFile checks that files inside '/tmp' are not overridden.
 func TestTmpFile(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	opts := dockerutil.RunOpts{Image: "tmpfile"}
+	got, err := d.Run(opts, "cat", "/tmp/foo/file.txt")
+	if err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	d := dockerutil.MakeDocker("tmp-file-test")
-	if err := d.Run("-w=/tmp/foo/bar", "--read-only", "alpine", "touch", "/tmp/foo/bar/file"); err != nil {
-		t.Fatal("docker run failed:", err)
+	if want := "123\n"; want != got {
+		t.Errorf("invalid file content, want: %q, got: %q", want, got)
 	}
+}
+
+// TestHostOverlayfsCopyUp tests that the --overlayfs-stale-read option causes
+// runsc to hide the incoherence of FDs opened before and after overlayfs
+// copy-up on the host.
+func TestHostOverlayfsCopyUp(t *testing.T) {
+	d := dockerutil.MakeDocker(t)
 	defer d.CleanUp()
+
+	if _, err := d.Run(dockerutil.RunOpts{
+		Image:   "hostoverlaytest",
+		WorkDir: "/root",
+	}, "./test"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
 }
 
 func TestMain(m *testing.M) {
diff --git a/test/e2e/regression_test.go b/test/e2e/regression_test.go
index 2488be383..327a2174c 100644
--- a/test/e2e/regression_test.go
+++ b/test/e2e/regression_test.go
@@ -18,7 +18,7 @@ import (
 	"strings"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
 )
 
 // Test that UDS can be created using overlay when parent directory is in lower
@@ -27,19 +27,19 @@ import (
 // Prerequisite: the directory where the socket file is created must not have
 // been open for write before bind(2) is called.
 func TestBindOverlay(t *testing.T) {
-	if err := dockerutil.Pull("ubuntu:trusty"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("bind-overlay-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
-	cmd := "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p"
-	got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd)
+	// Run the container.
+	got, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/ubuntu",
+	}, "bash", "-c", "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p")
 	if err != nil {
-		t.Fatal("docker run failed:", err)
+		t.Fatalf("docker run failed: %v", err)
 	}
 
+	// Check the output contains what we want.
 	if want := "foobar-asdf"; !strings.Contains(got, want) {
 		t.Fatalf("docker run output is missing %q: %s", want, got)
 	}
-	defer d.CleanUp()
 }
diff --git a/test/image/BUILD b/test/image/BUILD
index 7392ac54e..e749e47d4 100644
--- a/test/image/BUILD
+++ b/test/image/BUILD
@@ -22,8 +22,8 @@ go_test(
     ],
     visibility = ["//:sandbox"],
     deps = [
-        "//runsc/dockerutil",
-        "//runsc/testutil",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
     ],
 )
 
diff --git a/test/image/image_test.go b/test/image/image_test.go
index 0a1e19d6f..2e3543109 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -28,24 +28,29 @@ import (
 	"log"
 	"net/http"
 	"os"
-	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 func TestHelloWorld(t *testing.T) {
-	d := dockerutil.MakeDocker("hello-test")
-	if err := d.Run("hello-world"); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	// Run the basic container.
+	out, err := d.Run(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "echo", "Hello world!")
+	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
-	if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil {
-		t.Fatalf("docker didn't say hello: %v", err)
+	// Check the output.
+	if !strings.Contains(out, "Hello world!") {
+		t.Fatalf("docker didn't say hello: got %s", out)
 	}
 }
 
@@ -102,27 +107,22 @@ func testHTTPServer(t *testing.T, port int) {
 }
 
 func TestHttpd(t *testing.T) {
-	if err := dockerutil.Pull("httpd"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("http-test")
-
-	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	mountArg := dockerutil.MountArg(dir, "/usr/local/apache2/htdocs", dockerutil.ReadOnly)
-	if err := d.Run("-p", "80", mountArg, "httpd"); err != nil {
+	d.CopyFiles("/usr/local/apache2/htdocs", "test/image/latin10k.txt")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/httpd",
+		Ports: []int{80},
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 80 is mapped to.
 	port, err := d.FindPort(80)
 	if err != nil {
-		t.Fatalf("docker.FindPort(80) failed: %v", err)
+		t.Fatalf("FindPort(80) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
@@ -134,27 +134,22 @@ func TestHttpd(t *testing.T) {
 }
 
 func TestNginx(t *testing.T) {
-	if err := dockerutil.Pull("nginx"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("net-test")
-
-	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start the container.
-	mountArg := dockerutil.MountArg(dir, "/usr/share/nginx/html", dockerutil.ReadOnly)
-	if err := d.Run("-p", "80", mountArg, "nginx"); err != nil {
+	d.CopyFiles("/usr/share/nginx/html", "test/image/latin10k.txt")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/nginx",
+		Ports: []int{80},
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 80 is mapped to.
 	port, err := d.FindPort(80)
 	if err != nil {
-		t.Fatalf("docker.FindPort(80) failed: %v", err)
+		t.Fatalf("FindPort(80) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
@@ -166,99 +161,58 @@ func TestNginx(t *testing.T) {
 }
 
 func TestMysql(t *testing.T) {
-	if err := dockerutil.Pull("mysql"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("mysql-test")
+	server := dockerutil.MakeDocker(t)
+	defer server.CleanUp()
 
 	// Start the container.
-	if err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil {
+	if err := server.Spawn(dockerutil.RunOpts{
+		Image: "basic/mysql",
+		Env:   []string{"MYSQL_ROOT_PASSWORD=foobar123"},
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Wait until it's up and running.
-	if _, err := d.WaitForOutput("port: 3306  MySQL Community Server", 3*time.Minute); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
+	if _, err := server.WaitForOutput("port: 3306  MySQL Community Server", 3*time.Minute); err != nil {
+		t.Fatalf("WaitForOutput() timeout: %v", err)
 	}
 
-	client := dockerutil.MakeDocker("mysql-client-test")
-	dir, err := dockerutil.PrepareFiles("test/image/mysql.sql")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
+	// Generate the client and copy in the SQL payload.
+	client := dockerutil.MakeDocker(t)
+	defer client.CleanUp()
 
-	// Tell mysql client to connect to the server and execute the file in verbose
-	// mode to verify the output.
-	args := []string{
-		dockerutil.LinkArg(&d, "mysql"),
-		dockerutil.MountArg(dir, "/sql", dockerutil.ReadWrite),
-		"mysql",
-		"mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql",
-	}
-	if err := client.Run(args...); err != nil {
+	// Tell mysql client to connect to the server and execute the file in
+	// verbose mode to verify the output.
+	client.CopyFiles("/sql", "test/image/mysql.sql")
+	client.Link("mysql", server)
+	if _, err := client.Run(dockerutil.RunOpts{
+		Image: "basic/mysql",
+	}, "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer client.CleanUp()
 
 	// Ensure file executed to the end and shutdown mysql.
-	if _, err := client.WaitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
-	}
-	if _, err := d.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil {
-		t.Fatalf("docker.WaitForOutput() timeout: %v", err)
+	if _, err := server.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil {
+		t.Fatalf("WaitForOutput() timeout: %v", err)
 	}
 }
 
-func TestPythonHello(t *testing.T) {
-	// TODO(b/136503277): Once we have more complete python runtime tests,
-	// we can drop this one.
-	const img = "gcr.io/gvisor-presubmit/python-hello"
-	if err := dockerutil.Pull(img); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("python-hello-test")
-	if err := d.Run("-p", "8080", img); err != nil {
-		t.Fatalf("docker run failed: %v", err)
-	}
+func TestTomcat(t *testing.T) {
+	d := dockerutil.MakeDocker(t)
 	defer d.CleanUp()
 
-	// Find where port 8080 is mapped to.
-	port, err := d.FindPort(8080)
-	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
-	}
-
-	// Wait until it's up and running.
-	if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil {
-		t.Fatalf("WaitForHTTP() timeout: %v", err)
-	}
-
-	// Ensure that content is being served.
-	url := fmt.Sprintf("http://localhost:%d", port)
-	resp, err := http.Get(url)
-	if err != nil {
-		t.Errorf("Error reaching http server: %v", err)
-	}
-	if want := http.StatusOK; resp.StatusCode != want {
-		t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
-	}
-}
-
-func TestTomcat(t *testing.T) {
-	if err := dockerutil.Pull("tomcat:8.0"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("tomcat-test")
-	if err := d.Run("-p", "8080", "tomcat:8.0"); err != nil {
+	// Start the server.
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/tomcat",
+		Ports: []int{8080},
+	}); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 8080 is mapped to.
 	port, err := d.FindPort(8080)
 	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+		t.Fatalf("FindPort(8080) failed: %v", err)
 	}
 
 	// Wait until it's up and running.
@@ -278,28 +232,22 @@ func TestTomcat(t *testing.T) {
 }
 
 func TestRuby(t *testing.T) {
-	if err := dockerutil.Pull("ruby"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("ruby-test")
-
-	dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh")
-	if err != nil {
-		t.Fatalf("PrepareFiles() failed: %v", err)
-	}
-	if err := os.Chmod(filepath.Join(dir, "ruby.sh"), 0333); err != nil {
-		t.Fatalf("os.Chmod(%q, 0333) failed: %v", dir, err)
-	}
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
-	if err := d.Run("-p", "8080", dockerutil.MountArg(dir, "/src", dockerutil.ReadOnly), "ruby", "/src/ruby.sh"); err != nil {
+	// Execute the ruby workload.
+	d.CopyFiles("/src", "test/image/ruby.rb", "test/image/ruby.sh")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/ruby",
+		Ports: []int{8080},
+	}, "/src/ruby.sh"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// Find where port 8080 is mapped to.
 	port, err := d.FindPort(8080)
 	if err != nil {
-		t.Fatalf("docker.FindPort(8080) failed: %v", err)
+		t.Fatalf("FindPort(8080) failed: %v", err)
 	}
 
 	// Wait until it's up and running, 'gem install' can take some time.
@@ -326,18 +274,17 @@ func TestRuby(t *testing.T) {
 }
 
 func TestStdio(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatalf("docker pull failed: %v", err)
-	}
-	d := dockerutil.MakeDocker("stdio-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	wantStdout := "hello stdout"
 	wantStderr := "bonjour stderr"
 	cmd := fmt.Sprintf("echo %q; echo %q 1>&2;", wantStdout, wantStderr)
-	if err := d.Run("alpine", "/bin/sh", "-c", cmd); err != nil {
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "/bin/sh", "-c", cmd); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	for _, want := range []string{wantStdout, wantStderr} {
 		if _, err := d.WaitForOutput(want, 5*time.Second); err != nil {
diff --git a/test/image/ruby.sh b/test/image/ruby.sh
index ebe8d5b0e..ebe8d5b0e 100644..100755
--- a/test/image/ruby.sh
+++ b/test/image/ruby.sh
diff --git a/test/iptables/BUILD b/test/iptables/BUILD
index 6bb3b82b5..3e29ca90d 100644
--- a/test/iptables/BUILD
+++ b/test/iptables/BUILD
@@ -14,7 +14,7 @@ go_library(
     ],
     visibility = ["//test/iptables:__subpackages__"],
     deps = [
-        "//runsc/testutil",
+        "//pkg/test/testutil",
     ],
 )
 
@@ -23,14 +23,14 @@ go_test(
     srcs = [
         "iptables_test.go",
     ],
+    data = ["//test/iptables/runner"],
     library = ":iptables",
     tags = [
         "local",
         "manual",
     ],
     deps = [
-        "//pkg/log",
-        "//runsc/dockerutil",
-        "//runsc/testutil",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
     ],
 )
diff --git a/test/iptables/README.md b/test/iptables/README.md
index cc8a2fcac..b9f44bd40 100644
--- a/test/iptables/README.md
+++ b/test/iptables/README.md
@@ -38,7 +38,7 @@ Build the testing Docker container. Re-run this when you modify the test code in
 this directory:
 
 ```bash
-$ bazel run //test/iptables/runner:runner-image -- --norun
+$ make load-iptables
 ```
 
 Run an individual test via:
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index e26d6a7d2..872021358 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -26,6 +26,7 @@ const (
 	acceptPort       = 2402
 	sendloopDuration = 2 * time.Second
 	network          = "udp4"
+	chainName        = "foochain"
 )
 
 func init() {
@@ -40,6 +41,16 @@ func init() {
 	RegisterTestCase(FilterInputDefaultPolicyAccept{})
 	RegisterTestCase(FilterInputDefaultPolicyDrop{})
 	RegisterTestCase(FilterInputReturnUnderflow{})
+	RegisterTestCase(FilterInputSerializeJump{})
+	RegisterTestCase(FilterInputJumpBasic{})
+	RegisterTestCase(FilterInputJumpReturn{})
+	RegisterTestCase(FilterInputJumpReturnDrop{})
+	RegisterTestCase(FilterInputJumpBuiltin{})
+	RegisterTestCase(FilterInputJumpTwice{})
+	RegisterTestCase(FilterInputDestination{})
+	RegisterTestCase(FilterInputInvertDestination{})
+	RegisterTestCase(FilterInputSource{})
+	RegisterTestCase(FilterInputInvertSource{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -99,7 +110,7 @@ func (FilterInputDropOnlyUDP) ContainerAction(ip net.IP) error {
 func (FilterInputDropOnlyUDP) LocalAction(ip net.IP) error {
 	// Try to establish a TCP connection with the container, which should
 	// succeed.
-	return connectTCP(ip, acceptPort, dropPort, sendloopDuration)
+	return connectTCP(ip, acceptPort, sendloopDuration)
 }
 
 // FilterInputDropUDPPort tests that we can drop UDP traffic by port.
@@ -185,8 +196,11 @@ func (FilterInputDropTCPDestPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPDestPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, dropPort, acceptPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
+	// Ensure we cannot connect to the container.
+	for start := time.Now(); time.Since(start) < sendloopDuration; {
+		if err := connectTCP(ip, dropPort, sendloopDuration-time.Since(start)); err == nil {
+			return fmt.Errorf("expected not to connect, but was able to connect on port %d", dropPort)
+		}
 	}
 
 	return nil
@@ -202,13 +216,14 @@ func (FilterInputDropTCPSrcPort) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputDropTCPSrcPort) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "INPUT", "-p", "tcp", "-m", "tcp", "--sport", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+	// Drop anything from an ephemeral port.
+	if err := filterTable("-A", "INPUT", "-p", "tcp", "-m", "tcp", "--sport", "1024:65535", "-j", "DROP"); err != nil {
 		return err
 	}
 
 	// Listen for TCP packets on accept port.
 	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
+		return fmt.Errorf("connection destined to port %d should not be accepted, but was", dropPort)
 	}
 
 	return nil
@@ -216,8 +231,11 @@ func (FilterInputDropTCPSrcPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterInputDropTCPSrcPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, acceptPort, dropPort, sendloopDuration); err == nil {
-		return fmt.Errorf("connection on port %d should not be acceptedi, but got accepted", dropPort)
+	// Ensure we cannot connect to the container.
+	for start := time.Now(); time.Since(start) < sendloopDuration; {
+		if err := connectTCP(ip, acceptPort, sendloopDuration-time.Since(start)); err == nil {
+			return fmt.Errorf("expected not to connect, but was able to connect on port %d", acceptPort)
+		}
 	}
 
 	return nil
@@ -267,13 +285,12 @@ func (FilterInputMultiUDPRules) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputMultiUDPRules) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
-		return err
-	}
-	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"); err != nil {
-		return err
+	rules := [][]string{
+		{"-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"},
+		{"-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", acceptPort), "-j", "ACCEPT"},
+		{"-L"},
 	}
-	return filterTable("-L")
+	return filterTableRules(rules)
 }
 
 // LocalAction implements TestCase.LocalAction.
@@ -314,14 +331,13 @@ func (FilterInputCreateUserChain) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterInputCreateUserChain) ContainerAction(ip net.IP) error {
-	// Create a chain.
-	const chainName = "foochain"
-	if err := filterTable("-N", chainName); err != nil {
-		return err
+	rules := [][]string{
+		// Create a chain.
+		{"-N", chainName},
+		// Add a simple rule to the chain.
+		{"-A", chainName, "-j", "DROP"},
 	}
-
-	// Add a simple rule to the chain.
-	return filterTable("-A", chainName, "-j", "DROP")
+	return filterTableRules(rules)
 }
 
 // LocalAction implements TestCase.LocalAction.
@@ -396,13 +412,12 @@ func (FilterInputReturnUnderflow) Name() string {
 func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
 	// Add a RETURN rule followed by an unconditional accept, and set the
 	// default policy to DROP.
-	if err := filterTable("-A", "INPUT", "-j", "RETURN"); err != nil {
-		return err
-	}
-	if err := filterTable("-A", "INPUT", "-j", "DROP"); err != nil {
-		return err
+	rules := [][]string{
+		{"-A", "INPUT", "-j", "RETURN"},
+		{"-A", "INPUT", "-j", "DROP"},
+		{"-P", "INPUT", "ACCEPT"},
 	}
-	if err := filterTable("-P", "INPUT", "ACCEPT"); err != nil {
+	if err := filterTableRules(rules); err != nil {
 		return err
 	}
 
@@ -415,3 +430,300 @@ func (FilterInputReturnUnderflow) ContainerAction(ip net.IP) error {
 func (FilterInputReturnUnderflow) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// FilterInputSerializeJump verifies that we can serialize jumps.
+type FilterInputSerializeJump struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputSerializeJump) Name() string {
+	return "FilterInputSerializeJump"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputSerializeJump) ContainerAction(ip net.IP) error {
+	// Write a JUMP rule, the serialize it with `-L`.
+	rules := [][]string{
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-L"},
+	}
+	return filterTableRules(rules)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputSerializeJump) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// FilterInputJumpBasic jumps to a chain and executes a rule there.
+type FilterInputJumpBasic struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpBasic) Name() string {
+	return "FilterInputJumpBasic"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpBasic) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpBasic) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputJumpReturn jumps, returns, and executes a rule.
+type FilterInputJumpReturn struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpReturn) Name() string {
+	return "FilterInputJumpReturn"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpReturn) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-N", chainName},
+		{"-P", "INPUT", "ACCEPT"},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", "RETURN"},
+		{"-A", chainName, "-j", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpReturn) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputJumpReturnDrop jumps to a chain, returns, and DROPs packets.
+type FilterInputJumpReturnDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpReturnDrop) Name() string {
+	return "FilterInputJumpReturnDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpReturnDrop) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-N", chainName},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", "INPUT", "-j", "DROP"},
+		{"-A", chainName, "-j", "RETURN"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpReturnDrop) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// FilterInputJumpBuiltin verifies that jumping to a top-levl chain is illegal.
+type FilterInputJumpBuiltin struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpBuiltin) Name() string {
+	return "FilterInputJumpBuiltin"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpBuiltin) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-j", "OUTPUT"); err == nil {
+		return fmt.Errorf("iptables should be unable to jump to a built-in chain")
+	}
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpBuiltin) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// FilterInputJumpTwice jumps twice, then returns twice and executes a rule.
+type FilterInputJumpTwice struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputJumpTwice) Name() string {
+	return "FilterInputJumpTwice"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputJumpTwice) ContainerAction(ip net.IP) error {
+	const chainName2 = chainName + "2"
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-N", chainName},
+		{"-N", chainName2},
+		{"-A", "INPUT", "-j", chainName},
+		{"-A", chainName, "-j", chainName2},
+		{"-A", "INPUT", "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// UDP packets should jump and return twice, eventually hitting the
+	// ACCEPT rule.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputJumpTwice) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputDestination verifies that we can filter packets via `-d
+// <ipaddr>`.
+type FilterInputDestination struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDestination) Name() string {
+	return "FilterInputDestination"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDestination) ContainerAction(ip net.IP) error {
+	addrs, err := localAddrs()
+	if err != nil {
+		return err
+	}
+
+	// Make INPUT's default action DROP, then ACCEPT all packets bound for
+	// this machine.
+	rules := [][]string{{"-P", "INPUT", "DROP"}}
+	for _, addr := range addrs {
+		rules = append(rules, []string{"-A", "INPUT", "-d", addr, "-j", "ACCEPT"})
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDestination) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputInvertDestination verifies that we can filter packets via `! -d
+// <ipaddr>`.
+type FilterInputInvertDestination struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputInvertDestination) Name() string {
+	return "FilterInputInvertDestination"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputInvertDestination) ContainerAction(ip net.IP) error {
+	// Make INPUT's default action DROP, then ACCEPT all packets not bound
+	// for 127.0.0.1.
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-A", "INPUT", "!", "-d", localIP, "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputInvertDestination) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputSource verifies that we can filter packets via `-s
+// <ipaddr>`.
+type FilterInputSource struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputSource) Name() string {
+	return "FilterInputSource"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputSource) ContainerAction(ip net.IP) error {
+	// Make INPUT's default action DROP, then ACCEPT all packets from this
+	// machine.
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-A", "INPUT", "-s", fmt.Sprintf("%v", ip), "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputSource) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputInvertSource verifies that we can filter packets via `! -s
+// <ipaddr>`.
+type FilterInputInvertSource struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputInvertSource) Name() string {
+	return "FilterInputInvertSource"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputInvertSource) ContainerAction(ip net.IP) error {
+	// Make INPUT's default action DROP, then ACCEPT all packets not bound
+	// for 127.0.0.1.
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-A", "INPUT", "!", "-s", localIP, "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputInvertSource) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/filter_output.go b/test/iptables/filter_output.go
index ee2c49f9a..ba0d6fc29 100644
--- a/test/iptables/filter_output.go
+++ b/test/iptables/filter_output.go
@@ -22,9 +22,28 @@ import (
 func init() {
 	RegisterTestCase(FilterOutputDropTCPDestPort{})
 	RegisterTestCase(FilterOutputDropTCPSrcPort{})
+	RegisterTestCase(FilterOutputDestination{})
+	RegisterTestCase(FilterOutputInvertDestination{})
+	RegisterTestCase(FilterOutputAcceptTCPOwner{})
+	RegisterTestCase(FilterOutputDropTCPOwner{})
+	RegisterTestCase(FilterOutputAcceptUDPOwner{})
+	RegisterTestCase(FilterOutputDropUDPOwner{})
+	RegisterTestCase(FilterOutputOwnerFail{})
+	RegisterTestCase(FilterOutputAcceptGIDOwner{})
+	RegisterTestCase(FilterOutputDropGIDOwner{})
+	RegisterTestCase(FilterOutputInvertGIDOwner{})
+	RegisterTestCase(FilterOutputInvertUIDOwner{})
+	RegisterTestCase(FilterOutputInvertUIDAndGIDOwner{})
+	RegisterTestCase(FilterOutputInterfaceAccept{})
+	RegisterTestCase(FilterOutputInterfaceDrop{})
+	RegisterTestCase(FilterOutputInterface{})
+	RegisterTestCase(FilterOutputInterfaceBeginsWith{})
+	RegisterTestCase(FilterOutputInterfaceInvertDrop{})
+	RegisterTestCase(FilterOutputInterfaceInvertAccept{})
 }
 
-// FilterOutputDropTCPDestPort tests that connections are not accepted on specified source ports.
+// FilterOutputDropTCPDestPort tests that connections are not accepted on
+// specified source ports.
 type FilterOutputDropTCPDestPort struct{}
 
 // Name implements TestCase.Name.
@@ -34,7 +53,7 @@ func (FilterOutputDropTCPDestPort) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (FilterOutputDropTCPDestPort) ContainerAction(ip net.IP) error {
-	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "tcp", "--dport", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "tcp", "--dport", "1024:65535", "-j", "DROP"); err != nil {
 		return err
 	}
 
@@ -48,14 +67,15 @@ func (FilterOutputDropTCPDestPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterOutputDropTCPDestPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, acceptPort, dropPort, sendloopDuration); err == nil {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
 		return fmt.Errorf("connection on port %d should not be accepted, but got accepted", dropPort)
 	}
 
 	return nil
 }
 
-// FilterOutputDropTCPSrcPort tests that connections are not accepted on specified source ports.
+// FilterOutputDropTCPSrcPort tests that connections are not accepted on
+// specified source ports.
 type FilterOutputDropTCPSrcPort struct{}
 
 // Name implements TestCase.Name.
@@ -79,9 +99,509 @@ func (FilterOutputDropTCPSrcPort) ContainerAction(ip net.IP) error {
 
 // LocalAction implements TestCase.LocalAction.
 func (FilterOutputDropTCPSrcPort) LocalAction(ip net.IP) error {
-	if err := connectTCP(ip, dropPort, acceptPort, sendloopDuration); err == nil {
+	if err := connectTCP(ip, dropPort, sendloopDuration); err == nil {
 		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", dropPort)
 	}
 
 	return nil
 }
+
+// FilterOutputAcceptTCPOwner tests that TCP connections from uid owner are accepted.
+type FilterOutputAcceptTCPOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputAcceptTCPOwner) Name() string {
+	return "FilterOutputAcceptTCPOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputAcceptTCPOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "owner", "--uid-owner", "root", "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	return listenTCP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputAcceptTCPOwner) LocalAction(ip net.IP) error {
+	return connectTCP(ip, acceptPort, sendloopDuration)
+}
+
+// FilterOutputDropTCPOwner tests that TCP connections from uid owner are dropped.
+type FilterOutputDropTCPOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDropTCPOwner) Name() string {
+	return "FilterOutputDropTCPOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDropTCPOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "owner", "--uid-owner", "root", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection on port %d should be dropped, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDropTCPOwner) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection destined to port %d should be dropped, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputAcceptUDPOwner tests that UDP packets from uid owner are accepted.
+type FilterOutputAcceptUDPOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputAcceptUDPOwner) Name() string {
+	return "FilterOutputAcceptUDPOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputAcceptUDPOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "--uid-owner", "root", "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// Send UDP packets on acceptPort.
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputAcceptUDPOwner) LocalAction(ip net.IP) error {
+	// Listen for UDP packets on acceptPort.
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// FilterOutputDropUDPOwner tests that UDP packets from uid owner are dropped.
+type FilterOutputDropUDPOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDropUDPOwner) Name() string {
+	return "FilterOutputDropUDPOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDropUDPOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "--uid-owner", "root", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Send UDP packets on dropPort.
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDropUDPOwner) LocalAction(ip net.IP) error {
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets should not be received")
+	}
+
+	return nil
+}
+
+// FilterOutputOwnerFail tests that without uid/gid option, owner rule
+// will fail.
+type FilterOutputOwnerFail struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputOwnerFail) Name() string {
+	return "FilterOutputOwnerFail"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputOwnerFail) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-m", "owner", "-j", "ACCEPT"); err == nil {
+		return fmt.Errorf("Invalid argument")
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputOwnerFail) LocalAction(ip net.IP) error {
+	// no-op.
+	return nil
+}
+
+// FilterOutputAcceptGIDOwner tests that TCP connections from gid owner are accepted.
+type FilterOutputAcceptGIDOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputAcceptGIDOwner) Name() string {
+	return "FilterOutputAcceptGIDOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputAcceptGIDOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "owner", "--gid-owner", "root", "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	return listenTCP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputAcceptGIDOwner) LocalAction(ip net.IP) error {
+	return connectTCP(ip, acceptPort, sendloopDuration)
+}
+
+// FilterOutputDropGIDOwner tests that TCP connections from gid owner are dropped.
+type FilterOutputDropGIDOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDropGIDOwner) Name() string {
+	return "FilterOutputDropGIDOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDropGIDOwner) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "-m", "owner", "--gid-owner", "root", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection on port %d should not be accepted, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDropGIDOwner) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputInvertGIDOwner tests that TCP connections from gid owner are dropped.
+type FilterOutputInvertGIDOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInvertGIDOwner) Name() string {
+	return "FilterOutputInvertGIDOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInvertGIDOwner) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-A", "OUTPUT", "-p", "tcp", "-m", "owner", "!", "--gid-owner", "root", "-j", "ACCEPT"},
+		{"-A", "OUTPUT", "-p", "tcp", "-j", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection on port %d should not be accepted, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInvertGIDOwner) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputInvertUIDOwner tests that TCP connections from gid owner are dropped.
+type FilterOutputInvertUIDOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInvertUIDOwner) Name() string {
+	return "FilterOutputInvertUIDOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInvertUIDOwner) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-A", "OUTPUT", "-p", "tcp", "-m", "owner", "!", "--uid-owner", "root", "-j", "DROP"},
+		{"-A", "OUTPUT", "-p", "tcp", "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	return listenTCP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInvertUIDOwner) LocalAction(ip net.IP) error {
+	return connectTCP(ip, acceptPort, sendloopDuration)
+}
+
+// FilterOutputInvertUIDAndGIDOwner tests that TCP connections from uid and gid
+// owner are dropped.
+type FilterOutputInvertUIDAndGIDOwner struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInvertUIDAndGIDOwner) Name() string {
+	return "FilterOutputInvertUIDAndGIDOwner"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInvertUIDAndGIDOwner) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-A", "OUTPUT", "-p", "tcp", "-m", "owner", "!", "--uid-owner", "root", "!", "--gid-owner", "root", "-j", "ACCEPT"},
+		{"-A", "OUTPUT", "-p", "tcp", "-j", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection on port %d should not be accepted, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInvertUIDAndGIDOwner) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputDestination tests that we can selectively allow packets to
+// certain destinations.
+type FilterOutputDestination struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputDestination) Name() string {
+	return "FilterOutputDestination"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputDestination) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-A", "OUTPUT", "-d", ip.String(), "-j", "ACCEPT"},
+		{"-P", "OUTPUT", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputDestination) LocalAction(ip net.IP) error {
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// FilterOutputInvertDestination tests that we can selectively allow packets
+// not headed for a particular destination.
+type FilterOutputInvertDestination struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInvertDestination) Name() string {
+	return "FilterOutputInvertDestination"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInvertDestination) ContainerAction(ip net.IP) error {
+	rules := [][]string{
+		{"-A", "OUTPUT", "!", "-d", localIP, "-j", "ACCEPT"},
+		{"-P", "OUTPUT", "DROP"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInvertDestination) LocalAction(ip net.IP) error {
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// FilterOutputInterfaceAccept tests that packets are sent via interface
+// matching the iptables rule.
+type FilterOutputInterfaceAccept struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInterfaceAccept) Name() string {
+	return "FilterOutputInterfaceAccept"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInterfaceAccept) ContainerAction(ip net.IP) error {
+	ifname, ok := getInterfaceName()
+	if !ok {
+		return fmt.Errorf("no interface is present, except loopback")
+	}
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-o", ifname, "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInterfaceAccept) LocalAction(ip net.IP) error {
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// FilterOutputInterfaceDrop tests that packets are not sent via interface
+// matching the iptables rule.
+type FilterOutputInterfaceDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInterfaceDrop) Name() string {
+	return "FilterOutputInterfaceDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInterfaceDrop) ContainerAction(ip net.IP) error {
+	ifname, ok := getInterfaceName()
+	if !ok {
+		return fmt.Errorf("no interface is present, except loopback")
+	}
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-o", ifname, "-j", "DROP"); err != nil {
+		return err
+	}
+
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInterfaceDrop) LocalAction(ip net.IP) error {
+	if err := listenUDP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets should not be received on port %v, but are received", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputInterface tests that packets are sent via interface which is
+// not matching the interface name in the iptables rule.
+type FilterOutputInterface struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInterface) Name() string {
+	return "FilterOutputInterface"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInterface) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-o", "lo", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInterface) LocalAction(ip net.IP) error {
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// FilterOutputInterfaceBeginsWith tests that packets are not sent via an
+// interface which begins with the given interface name.
+type FilterOutputInterfaceBeginsWith struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInterfaceBeginsWith) Name() string {
+	return "FilterOutputInterfaceBeginsWith"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInterfaceBeginsWith) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "udp", "-o", "e+", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInterfaceBeginsWith) LocalAction(ip net.IP) error {
+	if err := listenUDP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets should not be received on port %v, but are received", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputInterfaceInvertDrop tests that we selectively do not send
+// packets via interface not matching the interface name.
+type FilterOutputInterfaceInvertDrop struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInterfaceInvertDrop) Name() string {
+	return "FilterOutputInterfaceInvertDrop"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInterfaceInvertDrop) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "!", "-o", "lo", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	if err := listenTCP(acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection on port %d should not be accepted, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInterfaceInvertDrop) LocalAction(ip net.IP) error {
+	if err := connectTCP(ip, acceptPort, sendloopDuration); err == nil {
+		return fmt.Errorf("connection destined to port %d should not be accepted, but got accepted", acceptPort)
+	}
+
+	return nil
+}
+
+// FilterOutputInterfaceInvertAccept tests that we can selectively send packets
+// not matching the specific outgoing interface.
+type FilterOutputInterfaceInvertAccept struct{}
+
+// Name implements TestCase.Name.
+func (FilterOutputInterfaceInvertAccept) Name() string {
+	return "FilterOutputInterfaceInvertAccept"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterOutputInterfaceInvertAccept) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "OUTPUT", "-p", "tcp", "!", "-o", "lo", "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on accept port.
+	return listenTCP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterOutputInterfaceInvertAccept) LocalAction(ip net.IP) error {
+	return connectTCP(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables.go b/test/iptables/iptables.go
index 2e565d988..16cb4f4da 100644
--- a/test/iptables/iptables.go
+++ b/test/iptables/iptables.go
@@ -18,12 +18,19 @@ package iptables
 import (
 	"fmt"
 	"net"
+	"time"
 )
 
 // IPExchangePort is the port the container listens on to receive the IP
 // address of the local process.
 const IPExchangePort = 2349
 
+// TerminalStatement is the last statement in the test runner.
+const TerminalStatement = "Finished!"
+
+// TestTimeout is the timeout used for all tests.
+const TestTimeout = 10 * time.Minute
+
 // A TestCase contains one action to run in the container and one to run
 // locally. The actions run concurrently and each must succeed for the test
 // pass.
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 46a7c99b0..38319a3b2 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -15,28 +15,14 @@
 package iptables
 
 import (
-	"flag"
 	"fmt"
 	"net"
-	"os"
-	"path"
 	"testing"
-	"time"
 
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
-const timeout = 18 * time.Second
-
-var image = flag.String("image", "bazel/test/iptables/runner:runner-image", "image to run tests in")
-
-type result struct {
-	output string
-	err    error
-}
-
 // singleTest runs a TestCase. Each test follows a pattern:
 // - Create a container.
 // - Get the container's IP.
@@ -46,77 +32,45 @@ type result struct {
 //
 // Container output is logged to $TEST_UNDECLARED_OUTPUTS_DIR if it exists, or
 // to stderr.
-func singleTest(test TestCase) error {
+func singleTest(t *testing.T, test TestCase) {
 	if _, ok := Tests[test.Name()]; !ok {
-		return fmt.Errorf("no test found with name %q. Has it been registered?", test.Name())
+		t.Fatalf("no test found with name %q. Has it been registered?", test.Name())
 	}
 
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
 	// Create and start the container.
-	cont := dockerutil.MakeDocker("gvisor-iptables")
-	defer cont.CleanUp()
-	resultChan := make(chan *result)
-	go func() {
-		output, err := cont.RunFg("--cap-add=NET_ADMIN", *image, "-name", test.Name())
-		logContainer(output, err)
-		resultChan <- &result{output, err}
-	}()
+	d.CopyFiles("/runner", "test/iptables/runner/runner")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image:  "iptables",
+		CapAdd: []string{"NET_ADMIN"},
+	}, "/runner/runner", "-name", test.Name()); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
 
 	// Get the container IP.
-	ip, err := getIP(cont)
+	ip, err := d.FindIP()
 	if err != nil {
-		return fmt.Errorf("failed to get container IP: %v", err)
+		t.Fatalf("failed to get container IP: %v", err)
 	}
 
 	// Give the container our IP.
 	if err := sendIP(ip); err != nil {
-		return fmt.Errorf("failed to send IP to container: %v", err)
+		t.Fatalf("failed to send IP to container: %v", err)
 	}
 
 	// Run our side of the test.
-	errChan := make(chan error)
-	go func() {
-		errChan <- test.LocalAction(ip)
-	}()
-
-	// Wait for both the container and local tests to finish.
-	var res *result
-	to := time.After(timeout)
-	for localDone := false; res == nil || !localDone; {
-		select {
-		case res = <-resultChan:
-			log.Infof("Container finished.")
-		case err, localDone = <-errChan:
-			log.Infof("Local finished.")
-			if err != nil {
-				return fmt.Errorf("local test failed: %v", err)
-			}
-		case <-to:
-			return fmt.Errorf("timed out after %f seconds", timeout.Seconds())
-		}
+	if err := test.LocalAction(ip); err != nil {
+		t.Fatalf("LocalAction failed: %v", err)
 	}
 
-	return res.err
-}
-
-func getIP(cont dockerutil.Docker) (net.IP, error) {
-	// The container might not have started yet, so retry a few times.
-	var ipStr string
-	to := time.After(timeout)
-	for ipStr == "" {
-		ipStr, _ = cont.FindIP()
-		select {
-		case <-to:
-			return net.IP{}, fmt.Errorf("timed out getting IP after %f seconds", timeout.Seconds())
-		default:
-			time.Sleep(250 * time.Millisecond)
-		}
+	// Wait for the final statement. This structure has the side effect
+	// that all container logs will appear within the individual test
+	// context.
+	if _, err := d.WaitForOutput(TerminalStatement, TestTimeout); err != nil {
+		t.Fatalf("test failed: %v", err)
 	}
-	ip := net.ParseIP(ipStr)
-	if ip == nil {
-		return net.IP{}, fmt.Errorf("invalid IP: %q", ipStr)
-	}
-	log.Infof("Container has IP of %s", ipStr)
-	return ip, nil
 }
 
 func sendIP(ip net.IP) error {
@@ -132,7 +86,7 @@ func sendIP(ip net.IP) error {
 		conn = c
 		return err
 	}
-	if err := testutil.Poll(cb, timeout); err != nil {
+	if err := testutil.Poll(cb, TestTimeout); err != nil {
 		return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err)
 	}
 	if _, err := conn.Write([]byte{0}); err != nil {
@@ -141,111 +95,222 @@ func sendIP(ip net.IP) error {
 	return nil
 }
 
-func logContainer(output string, err error) {
-	msg := fmt.Sprintf("Container error: %v\nContainer output:\n%v", err, output)
-	if artifactsDir := os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); artifactsDir != "" {
-		fpath := path.Join(artifactsDir, "container.log")
-		if file, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE, 0644); err != nil {
-			log.Warningf("Failed to open log file %q: %v", fpath, err)
-		} else {
-			defer file.Close()
-			if _, err := file.Write([]byte(msg)); err == nil {
-				return
-			}
-			log.Warningf("Failed to write to log file %s: %v", fpath, err)
-		}
-	}
-
-	// We couldn't write to the output directory -- just log to stderr.
-	log.Infof(msg)
-}
-
 func TestFilterInputDropUDP(t *testing.T) {
-	if err := singleTest(FilterInputDropUDP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropUDP{})
 }
 
 func TestFilterInputDropUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropUDPPort{})
 }
 
 func TestFilterInputDropDifferentUDPPort(t *testing.T) {
-	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropDifferentUDPPort{})
 }
 
 func TestFilterInputDropAll(t *testing.T) {
-	if err := singleTest(FilterInputDropAll{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropAll{})
 }
 
 func TestFilterInputDropOnlyUDP(t *testing.T) {
-	if err := singleTest(FilterInputDropOnlyUDP{}); err != nil {
-		t.Fatal(err)
-	}
-}
-
-func TestNATRedirectUDPPort(t *testing.T) {
-	if err := singleTest(NATRedirectUDPPort{}); err != nil {
-		t.Fatal(err)
-	}
-}
-
-func TestNATDropUDP(t *testing.T) {
-	if err := singleTest(NATDropUDP{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropOnlyUDP{})
 }
 
 func TestFilterInputDropTCPDestPort(t *testing.T) {
-	if err := singleTest(FilterInputDropTCPDestPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropTCPDestPort{})
 }
 
 func TestFilterInputDropTCPSrcPort(t *testing.T) {
-	if err := singleTest(FilterInputDropTCPSrcPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDropTCPSrcPort{})
 }
 
 func TestFilterInputCreateUserChain(t *testing.T) {
-	if err := singleTest(FilterInputCreateUserChain{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputCreateUserChain{})
 }
 
 func TestFilterInputDefaultPolicyAccept(t *testing.T) {
-	if err := singleTest(FilterInputDefaultPolicyAccept{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDefaultPolicyAccept{})
 }
 
 func TestFilterInputDefaultPolicyDrop(t *testing.T) {
-	if err := singleTest(FilterInputDefaultPolicyDrop{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputDefaultPolicyDrop{})
 }
 
 func TestFilterInputReturnUnderflow(t *testing.T) {
-	if err := singleTest(FilterInputReturnUnderflow{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterInputReturnUnderflow{})
 }
 
 func TestFilterOutputDropTCPDestPort(t *testing.T) {
-	if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputDropTCPDestPort{})
 }
 
 func TestFilterOutputDropTCPSrcPort(t *testing.T) {
-	if err := singleTest(FilterOutputDropTCPSrcPort{}); err != nil {
-		t.Fatal(err)
-	}
+	singleTest(t, FilterOutputDropTCPSrcPort{})
+}
+
+func TestFilterOutputAcceptTCPOwner(t *testing.T) {
+	singleTest(t, FilterOutputAcceptTCPOwner{})
+}
+
+func TestFilterOutputDropTCPOwner(t *testing.T) {
+	singleTest(t, FilterOutputDropTCPOwner{})
+}
+
+func TestFilterOutputAcceptUDPOwner(t *testing.T) {
+	singleTest(t, FilterOutputAcceptUDPOwner{})
+}
+
+func TestFilterOutputDropUDPOwner(t *testing.T) {
+	singleTest(t, FilterOutputDropUDPOwner{})
+}
+
+func TestFilterOutputOwnerFail(t *testing.T) {
+	singleTest(t, FilterOutputOwnerFail{})
+}
+
+func TestFilterOutputAcceptGIDOwner(t *testing.T) {
+	singleTest(t, FilterOutputAcceptGIDOwner{})
+}
+
+func TestFilterOutputDropGIDOwner(t *testing.T) {
+	singleTest(t, FilterOutputDropGIDOwner{})
+}
+
+func TestFilterOutputInvertGIDOwner(t *testing.T) {
+	singleTest(t, FilterOutputInvertGIDOwner{})
+}
+
+func TestFilterOutputInvertUIDOwner(t *testing.T) {
+	singleTest(t, FilterOutputInvertUIDOwner{})
+}
+
+func TestFilterOutputInvertUIDAndGIDOwner(t *testing.T) {
+	singleTest(t, FilterOutputInvertUIDAndGIDOwner{})
+}
+
+func TestFilterOutputInterfaceAccept(t *testing.T) {
+	singleTest(t, FilterOutputInterfaceAccept{})
+}
+
+func TestFilterOutputInterfaceDrop(t *testing.T) {
+	singleTest(t, FilterOutputInterfaceDrop{})
+}
+
+func TestFilterOutputInterface(t *testing.T) {
+	singleTest(t, FilterOutputInterface{})
+}
+
+func TestFilterOutputInterfaceBeginsWith(t *testing.T) {
+	singleTest(t, FilterOutputInterfaceBeginsWith{})
+}
+
+func TestFilterOutputInterfaceInvertDrop(t *testing.T) {
+	singleTest(t, FilterOutputInterfaceInvertDrop{})
+}
+
+func TestFilterOutputInterfaceInvertAccept(t *testing.T) {
+	singleTest(t, FilterOutputInterfaceInvertAccept{})
+}
+
+func TestJumpSerialize(t *testing.T) {
+	singleTest(t, FilterInputSerializeJump{})
+}
+
+func TestJumpBasic(t *testing.T) {
+	singleTest(t, FilterInputJumpBasic{})
+}
+
+func TestJumpReturn(t *testing.T) {
+	singleTest(t, FilterInputJumpReturn{})
+}
+
+func TestJumpReturnDrop(t *testing.T) {
+	singleTest(t, FilterInputJumpReturnDrop{})
+}
+
+func TestJumpBuiltin(t *testing.T) {
+	singleTest(t, FilterInputJumpBuiltin{})
+}
+
+func TestJumpTwice(t *testing.T) {
+	singleTest(t, FilterInputJumpTwice{})
+}
+
+func TestInputDestination(t *testing.T) {
+	singleTest(t, FilterInputDestination{})
+}
+
+func TestInputInvertDestination(t *testing.T) {
+	singleTest(t, FilterInputInvertDestination{})
+}
+
+func TestOutputDestination(t *testing.T) {
+	singleTest(t, FilterOutputDestination{})
+}
+
+func TestOutputInvertDestination(t *testing.T) {
+	singleTest(t, FilterOutputInvertDestination{})
+}
+
+func TestNATPreRedirectUDPPort(t *testing.T) {
+	singleTest(t, NATPreRedirectUDPPort{})
+}
+
+func TestNATPreRedirectTCPPort(t *testing.T) {
+	singleTest(t, NATPreRedirectTCPPort{})
+}
+
+func TestNATOutRedirectUDPPort(t *testing.T) {
+	singleTest(t, NATOutRedirectUDPPort{})
+}
+
+func TestNATOutRedirectTCPPort(t *testing.T) {
+	singleTest(t, NATOutRedirectTCPPort{})
+}
+
+func TestNATDropUDP(t *testing.T) {
+	singleTest(t, NATDropUDP{})
+}
+
+func TestNATAcceptAll(t *testing.T) {
+	singleTest(t, NATAcceptAll{})
+}
+
+func TestNATOutRedirectIP(t *testing.T) {
+	singleTest(t, NATOutRedirectIP{})
+}
+
+func TestNATOutDontRedirectIP(t *testing.T) {
+	singleTest(t, NATOutDontRedirectIP{})
+}
+
+func TestNATOutRedirectInvert(t *testing.T) {
+	singleTest(t, NATOutRedirectInvert{})
+}
+
+func TestNATPreRedirectIP(t *testing.T) {
+	singleTest(t, NATPreRedirectIP{})
+}
+
+func TestNATPreDontRedirectIP(t *testing.T) {
+	singleTest(t, NATPreDontRedirectIP{})
+}
+
+func TestNATPreRedirectInvert(t *testing.T) {
+	singleTest(t, NATPreRedirectInvert{})
+}
+
+func TestNATRedirectRequiresProtocol(t *testing.T) {
+	singleTest(t, NATRedirectRequiresProtocol{})
+}
+
+func TestNATLoopbackSkipsPrerouting(t *testing.T) {
+	singleTest(t, NATLoopbackSkipsPrerouting{})
+}
+
+func TestInputSource(t *testing.T) {
+	singleTest(t, FilterInputSource{})
+}
+
+func TestInputInvertSource(t *testing.T) {
+	singleTest(t, FilterInputInvertSource{})
 }
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
index 043114c78..7146edbb9 100644
--- a/test/iptables/iptables_util.go
+++ b/test/iptables/iptables_util.go
@@ -20,14 +20,24 @@ import (
 	"os/exec"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 const iptablesBinary = "iptables"
+const localIP = "127.0.0.1"
 
 // filterTable calls `iptables -t filter` with the given args.
 func filterTable(args ...string) error {
-	args = append([]string{"-t", "filter"}, args...)
+	return tableCmd("filter", args)
+}
+
+// natTable calls `iptables -t nat` with the given args.
+func natTable(args ...string) error {
+	return tableCmd("nat", args)
+}
+
+func tableCmd(table string, args []string) error {
+	args = append([]string{"-t", table}, args...)
 	cmd := exec.Command(iptablesBinary, args...)
 	if out, err := cmd.CombinedOutput(); err != nil {
 		return fmt.Errorf("error running iptables with args %v\nerror: %v\noutput: %s", args, err, string(out))
@@ -35,6 +45,25 @@ func filterTable(args ...string) error {
 	return nil
 }
 
+// filterTableRules is like filterTable, but runs multiple iptables commands.
+func filterTableRules(argsList [][]string) error {
+	return tableRules("filter", argsList)
+}
+
+// natTableRules is like natTable, but runs multiple iptables commands.
+func natTableRules(argsList [][]string) error {
+	return tableRules("nat", argsList)
+}
+
+func tableRules(table string, argsList [][]string) error {
+	for _, args := range argsList {
+		if err := tableCmd(table, args); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 // listenUDP listens on a UDP port and returns the value of net.Conn.Read() for
 // the first read on that port.
 func listenUDP(port int, timeout time.Duration) error {
@@ -106,27 +135,52 @@ func listenTCP(port int, timeout time.Duration) error {
 	return nil
 }
 
-// connectTCP connects the TCP server over specified local port, server IP and remote/server port.
-func connectTCP(ip net.IP, remotePort, localPort int, timeout time.Duration) error {
+// connectTCP connects to the given IP and port from an ephemeral local address.
+func connectTCP(ip net.IP, port int, timeout time.Duration) error {
 	contAddr := net.TCPAddr{
 		IP:   ip,
-		Port: remotePort,
+		Port: port,
 	}
 	// The container may not be listening when we first connect, so retry
 	// upon error.
 	callback := func() error {
-		localAddr := net.TCPAddr{
-			Port: localPort,
-		}
-		conn, err := net.DialTCP("tcp4", &localAddr, &contAddr)
+		conn, err := net.DialTimeout("tcp", contAddr.String(), timeout)
 		if conn != nil {
 			conn.Close()
 		}
 		return err
 	}
 	if err := testutil.Poll(callback, timeout); err != nil {
-		return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err)
+		return fmt.Errorf("timed out waiting to connect IP on port %v, most recent error: %v", port, err)
 	}
 
 	return nil
 }
+
+// localAddrs returns a list of local network interface addresses.
+func localAddrs() ([]string, error) {
+	addrs, err := net.InterfaceAddrs()
+	if err != nil {
+		return nil, err
+	}
+	addrStrs := make([]string, 0, len(addrs))
+	for _, addr := range addrs {
+		addrStrs = append(addrStrs, addr.String())
+	}
+	return addrStrs, nil
+}
+
+// getInterfaceName returns the name of the interface other than loopback.
+func getInterfaceName() (string, bool) {
+	var ifname string
+	if interfaces, err := net.Interfaces(); err == nil {
+		for _, intf := range interfaces {
+			if intf.Name != "lo" {
+				ifname = intf.Name
+				break
+			}
+		}
+	}
+
+	return ifname, ifname != ""
+}
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index b5c6f927e..5e54a3963 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -15,8 +15,10 @@
 package iptables
 
 import (
+	"errors"
 	"fmt"
 	"net"
+	"time"
 )
 
 const (
@@ -24,36 +26,93 @@ const (
 )
 
 func init() {
-	RegisterTestCase(NATRedirectUDPPort{})
+	RegisterTestCase(NATPreRedirectUDPPort{})
+	RegisterTestCase(NATPreRedirectTCPPort{})
+	RegisterTestCase(NATOutRedirectUDPPort{})
+	RegisterTestCase(NATOutRedirectTCPPort{})
 	RegisterTestCase(NATDropUDP{})
+	RegisterTestCase(NATAcceptAll{})
+	RegisterTestCase(NATPreRedirectIP{})
+	RegisterTestCase(NATPreDontRedirectIP{})
+	RegisterTestCase(NATPreRedirectInvert{})
+	RegisterTestCase(NATOutRedirectIP{})
+	RegisterTestCase(NATOutDontRedirectIP{})
+	RegisterTestCase(NATOutRedirectInvert{})
+	RegisterTestCase(NATRedirectRequiresProtocol{})
+	RegisterTestCase(NATLoopbackSkipsPrerouting{})
 }
 
-// NATRedirectUDPPort tests that packets are redirected to different port.
-type NATRedirectUDPPort struct{}
+// NATPreRedirectUDPPort tests that packets are redirected to different port.
+type NATPreRedirectUDPPort struct{}
 
 // Name implements TestCase.Name.
-func (NATRedirectUDPPort) Name() string {
-	return "NATRedirectUDPPort"
+func (NATPreRedirectUDPPort) Name() string {
+	return "NATPreRedirectUDPPort"
 }
 
 // ContainerAction implements TestCase.ContainerAction.
-func (NATRedirectUDPPort) ContainerAction(ip net.IP) error {
-	if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
+func (NATPreRedirectUDPPort) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
 		return err
 	}
 
 	if err := listenUDP(redirectPort, sendloopDuration); err != nil {
 		return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", redirectPort, err)
 	}
+
 	return nil
 }
 
 // LocalAction implements TestCase.LocalAction.
-func (NATRedirectUDPPort) LocalAction(ip net.IP) error {
+func (NATPreRedirectUDPPort) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
 
-// NATDropUDP tests that packets are not received in ports other than redirect port.
+// NATPreRedirectTCPPort tests that connections are redirected on specified ports.
+type NATPreRedirectTCPPort struct{}
+
+// Name implements TestCase.Name.
+func (NATPreRedirectTCPPort) Name() string {
+	return "NATPreRedirectTCPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATPreRedirectTCPPort) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "tcp", "-m", "tcp", "--dport", fmt.Sprintf("%d", dropPort), "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort)); err != nil {
+		return err
+	}
+
+	// Listen for TCP packets on redirect port.
+	return listenTCP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATPreRedirectTCPPort) LocalAction(ip net.IP) error {
+	return connectTCP(ip, dropPort, sendloopDuration)
+}
+
+// NATOutRedirectUDPPort tests that packets are redirected to different port.
+type NATOutRedirectUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (NATOutRedirectUDPPort) Name() string {
+	return "NATOutRedirectUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATOutRedirectUDPPort) ContainerAction(ip net.IP) error {
+	dest := []byte{200, 0, 0, 1}
+	return loopbackTest(dest, "-A", "OUTPUT", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort))
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATOutRedirectUDPPort) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// NATDropUDP tests that packets are not received in ports other than redirect
+// port.
 type NATDropUDP struct{}
 
 // Name implements TestCase.Name.
@@ -63,7 +122,7 @@ func (NATDropUDP) Name() string {
 
 // ContainerAction implements TestCase.ContainerAction.
 func (NATDropUDP) ContainerAction(ip net.IP) error {
-	if err := filterTable("-t", "nat", "-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", redirectPort)); err != nil {
 		return err
 	}
 
@@ -78,3 +137,303 @@ func (NATDropUDP) ContainerAction(ip net.IP) error {
 func (NATDropUDP) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// NATAcceptAll tests that all UDP packets are accepted.
+type NATAcceptAll struct{}
+
+// Name implements TestCase.Name.
+func (NATAcceptAll) Name() string {
+	return "NATAcceptAll"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATAcceptAll) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-j", "ACCEPT"); err != nil {
+		return err
+	}
+
+	if err := listenUDP(acceptPort, sendloopDuration); err != nil {
+		return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", acceptPort, err)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATAcceptAll) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// NATOutRedirectIP uses iptables to select packets based on destination IP and
+// redirects them.
+type NATOutRedirectIP struct{}
+
+// Name implements TestCase.Name.
+func (NATOutRedirectIP) Name() string {
+	return "NATOutRedirectIP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATOutRedirectIP) ContainerAction(ip net.IP) error {
+	// Redirect OUTPUT packets to a listening localhost port.
+	dest := net.IP([]byte{200, 0, 0, 2})
+	return loopbackTest(dest, "-A", "OUTPUT", "-d", dest.String(), "-p", "udp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", acceptPort))
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATOutRedirectIP) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// NATOutDontRedirectIP tests that iptables matching with "-d" does not match
+// packets it shouldn't.
+type NATOutDontRedirectIP struct{}
+
+// Name implements TestCase.Name.
+func (NATOutDontRedirectIP) Name() string {
+	return "NATOutDontRedirectIP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATOutDontRedirectIP) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "OUTPUT", "-d", localIP, "-p", "udp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", dropPort)); err != nil {
+		return err
+	}
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATOutDontRedirectIP) LocalAction(ip net.IP) error {
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// NATOutRedirectInvert tests that iptables can match with "! -d".
+type NATOutRedirectInvert struct{}
+
+// Name implements TestCase.Name.
+func (NATOutRedirectInvert) Name() string {
+	return "NATOutRedirectInvert"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATOutRedirectInvert) ContainerAction(ip net.IP) error {
+	// Redirect OUTPUT packets to a listening localhost port.
+	dest := []byte{200, 0, 0, 3}
+	destStr := "200.0.0.2"
+	return loopbackTest(dest, "-A", "OUTPUT", "!", "-d", destStr, "-p", "udp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", acceptPort))
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATOutRedirectInvert) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// NATPreRedirectIP tests that we can use iptables to select packets based on
+// destination IP and redirect them.
+type NATPreRedirectIP struct{}
+
+// Name implements TestCase.Name.
+func (NATPreRedirectIP) Name() string {
+	return "NATPreRedirectIP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATPreRedirectIP) ContainerAction(ip net.IP) error {
+	addrs, err := localAddrs()
+	if err != nil {
+		return err
+	}
+
+	var rules [][]string
+	for _, addr := range addrs {
+		rules = append(rules, []string{"-A", "PREROUTING", "-p", "udp", "-d", addr, "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort)})
+	}
+	if err := natTableRules(rules); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATPreRedirectIP) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// NATPreDontRedirectIP tests that iptables matching with "-d" does not match
+// packets it shouldn't.
+type NATPreDontRedirectIP struct{}
+
+// Name implements TestCase.Name.
+func (NATPreDontRedirectIP) Name() string {
+	return "NATPreDontRedirectIP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATPreDontRedirectIP) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "-d", localIP, "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", dropPort)); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATPreDontRedirectIP) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// NATPreRedirectInvert tests that iptables can match with "! -d".
+type NATPreRedirectInvert struct{}
+
+// Name implements TestCase.Name.
+func (NATPreRedirectInvert) Name() string {
+	return "NATPreRedirectInvert"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATPreRedirectInvert) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-p", "udp", "!", "-d", localIP, "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort)); err != nil {
+		return err
+	}
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATPreRedirectInvert) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// NATRedirectRequiresProtocol tests that use of the --to-ports flag requires a
+// protocol to be specified with -p.
+type NATRedirectRequiresProtocol struct{}
+
+// Name implements TestCase.Name.
+func (NATRedirectRequiresProtocol) Name() string {
+	return "NATRedirectRequiresProtocol"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATRedirectRequiresProtocol) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "PREROUTING", "-d", localIP, "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort)); err == nil {
+		return errors.New("expected an error using REDIRECT --to-ports without a protocol")
+	}
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATRedirectRequiresProtocol) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// NATOutRedirectTCPPort tests that connections are redirected on specified ports.
+type NATOutRedirectTCPPort struct{}
+
+// Name implements TestCase.Name.
+func (NATOutRedirectTCPPort) Name() string {
+	return "NATOutRedirectTCPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATOutRedirectTCPPort) ContainerAction(ip net.IP) error {
+	if err := natTable("-A", "OUTPUT", "-p", "tcp", "-m", "tcp", "--dport", fmt.Sprintf("%d", dropPort), "-j", "REDIRECT", "--to-ports", fmt.Sprintf("%d", acceptPort)); err != nil {
+		return err
+	}
+
+	timeout := 20 * time.Second
+	dest := []byte{127, 0, 0, 1}
+	localAddr := net.TCPAddr{
+		IP:   dest,
+		Port: acceptPort,
+	}
+
+	// Starts listening on port.
+	lConn, err := net.ListenTCP("tcp", &localAddr)
+	if err != nil {
+		return err
+	}
+	defer lConn.Close()
+
+	// Accept connections on port.
+	lConn.SetDeadline(time.Now().Add(timeout))
+	err = connectTCP(ip, dropPort, timeout)
+	if err != nil {
+		return err
+	}
+
+	conn, err := lConn.AcceptTCP()
+	if err != nil {
+		return err
+	}
+	conn.Close()
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATOutRedirectTCPPort) LocalAction(ip net.IP) error {
+	return nil
+}
+
+// NATLoopbackSkipsPrerouting tests that packets sent via loopback aren't
+// affected by PREROUTING rules.
+type NATLoopbackSkipsPrerouting struct{}
+
+// Name implements TestCase.Name.
+func (NATLoopbackSkipsPrerouting) Name() string {
+	return "NATLoopbackSkipsPrerouting"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATLoopbackSkipsPrerouting) ContainerAction(ip net.IP) error {
+	// Redirect anything sent to localhost to an unused port.
+	dest := []byte{127, 0, 0, 1}
+	if err := natTable("-A", "PREROUTING", "-p", "tcp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", dropPort)); err != nil {
+		return err
+	}
+
+	// Establish a connection via localhost. If the PREROUTING rule did apply to
+	// loopback traffic, the connection would fail.
+	sendCh := make(chan error)
+	go func() {
+		sendCh <- connectTCP(dest, acceptPort, sendloopDuration)
+	}()
+
+	if err := listenTCP(acceptPort, sendloopDuration); err != nil {
+		return err
+	}
+	return <-sendCh
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATLoopbackSkipsPrerouting) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// loopbackTests runs an iptables rule and ensures that packets sent to
+// dest:dropPort are received by localhost:acceptPort.
+func loopbackTest(dest net.IP, args ...string) error {
+	if err := natTable(args...); err != nil {
+		return err
+	}
+	sendCh := make(chan error)
+	listenCh := make(chan error)
+	go func() {
+		sendCh <- sendUDPLoop(dest, dropPort, sendloopDuration)
+	}()
+	go func() {
+		listenCh <- listenUDP(acceptPort, sendloopDuration)
+	}()
+	select {
+	case err := <-listenCh:
+		if err != nil {
+			return err
+		}
+	case <-time.After(sendloopDuration):
+		return errors.New("timed out")
+	}
+	// sendCh will always take the full sendloop time.
+	return <-sendCh
+}
diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
index b9199387a..24504a1b9 100644
--- a/test/iptables/runner/BUILD
+++ b/test/iptables/runner/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "container_image", "go_binary", "go_image")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
@@ -6,18 +6,7 @@ go_binary(
     name = "runner",
     testonly = 1,
     srcs = ["main.go"],
-    deps = ["//test/iptables"],
-)
-
-container_image(
-    name = "iptables-base",
-    base = "@iptables-test//image",
-)
-
-go_image(
-    name = "runner-image",
-    testonly = 1,
-    srcs = ["main.go"],
-    base = ":iptables-base",
+    pure = True,
+    visibility = ["//test/iptables:__subpackages__"],
     deps = ["//test/iptables"],
 )
diff --git a/test/iptables/runner/Dockerfile b/test/iptables/runner/Dockerfile
deleted file mode 100644
index b77db44a1..000000000
--- a/test/iptables/runner/Dockerfile
+++ /dev/null
@@ -1,4 +0,0 @@
-# This Dockerfile builds the image hosted at
-# gcr.io/gvisor-presubmit/iptables-test.
-FROM ubuntu
-RUN apt update && apt install -y iptables
diff --git a/test/iptables/runner/main.go b/test/iptables/runner/main.go
index 3c794114e..6f77c0684 100644
--- a/test/iptables/runner/main.go
+++ b/test/iptables/runner/main.go
@@ -46,6 +46,9 @@ func main() {
 	if err := test.ContainerAction(ip); err != nil {
 		log.Fatalf("Failed running test %q: %v", *name, err)
 	}
+
+	// Emit the final line.
+	log.Printf("%s", iptables.TerminalStatement)
 }
 
 // getIP listens for a connection from the local process and returns the source
diff --git a/test/packetdrill/BUILD b/test/packetdrill/BUILD
index d113555b1..dfcd55f60 100644
--- a/test/packetdrill/BUILD
+++ b/test/packetdrill/BUILD
@@ -3,6 +3,36 @@ load("defs.bzl", "packetdrill_test")
 package(licenses = ["notice"])
 
 packetdrill_test(
-    name = "fin_wait2_timeout",
+    name = "packetdrill_sanity_test",
+    scripts = ["sanity_test.pkt"],
+)
+
+packetdrill_test(
+    name = "accept_ack_drop_test",
+    scripts = ["accept_ack_drop.pkt"],
+)
+
+packetdrill_test(
+    name = "fin_wait2_timeout_test",
     scripts = ["fin_wait2_timeout.pkt"],
 )
+
+packetdrill_test(
+    name = "listen_close_before_handshake_complete_test",
+    scripts = ["listen_close_before_handshake_complete.pkt"],
+)
+
+packetdrill_test(
+    name = "no_rst_to_rst_test",
+    scripts = ["no_rst_to_rst.pkt"],
+)
+
+packetdrill_test(
+    name = "tcp_defer_accept_test",
+    scripts = ["tcp_defer_accept.pkt"],
+)
+
+packetdrill_test(
+    name = "tcp_defer_accept_timeout_test",
+    scripts = ["tcp_defer_accept_timeout.pkt"],
+)
diff --git a/test/packetdrill/Dockerfile b/test/packetdrill/Dockerfile
deleted file mode 100644
index bd4451355..000000000
--- a/test/packetdrill/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM ubuntu:bionic
-
-RUN apt-get update
-RUN apt-get install -y net-tools git iptables iputils-ping netcat tcpdump jq tar
-RUN hash -r
-RUN git clone --branch packetdrill-v2.0 \
-        https://github.com/google/packetdrill.git
-RUN cd packetdrill/gtests/net/packetdrill && ./configure && \
-        apt-get install -y bison flex make && make
diff --git a/test/packetdrill/accept_ack_drop.pkt b/test/packetdrill/accept_ack_drop.pkt
new file mode 100644
index 000000000..76e638fd4
--- /dev/null
+++ b/test/packetdrill/accept_ack_drop.pkt
@@ -0,0 +1,27 @@
+// Test that the accept works if the final ACK is dropped and an ack with data
+// follows the dropped ack.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
+// Set backlog to 1 so that we can easily test.
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0.0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0.0 > S. 0:0(0) ack 1    <...>
+
++0.0 < . 1:5(4) ack 1 win 257
++0.0 > . 1:1(0) ack 5 <...>
+
+// This should cause connection to transition to connected state.
++0.000 accept(3, ..., ...) = 4
++0.000 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Now read the data and we should get 4 bytes.
++0.000 read(4,..., 4) = 4
++0.000 close(4) = 0
+
++0.0 > F. 1:1(0) ack 5 <...>
++0.0 < F. 5:5(0) ack 2 win 257
++0.01 > . 2:2(0) ack 6 <...>
+\ No newline at end of file
diff --git a/test/packetdrill/defs.bzl b/test/packetdrill/defs.bzl
index 8623ce7b1..f499c177b 100644
--- a/test/packetdrill/defs.bzl
+++ b/test/packetdrill/defs.bzl
@@ -66,7 +66,7 @@ def packetdrill_linux_test(name, **kwargs):
     if "tags" not in kwargs:
         kwargs["tags"] = _PACKETDRILL_TAGS
     _packetdrill_test(
-        name = name + "_linux_test",
+        name = name,
         flags = ["--dut_platform", "linux"],
         **kwargs
     )
@@ -75,13 +75,13 @@ def packetdrill_netstack_test(name, **kwargs):
     if "tags" not in kwargs:
         kwargs["tags"] = _PACKETDRILL_TAGS
     _packetdrill_test(
-        name = name + "_netstack_test",
+        name = name,
         # This is the default runtime unless
         # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value.
         flags = ["--dut_platform", "netstack", "--runtime", "runsc-d"],
         **kwargs
     )
 
-def packetdrill_test(**kwargs):
-    packetdrill_linux_test(**kwargs)
-    packetdrill_netstack_test(**kwargs)
+def packetdrill_test(name, **kwargs):
+    packetdrill_linux_test(name + "_linux_test", **kwargs)
+    packetdrill_netstack_test(name + "_netstack_test", **kwargs)
diff --git a/test/packetdrill/fin_wait2_timeout.pkt b/test/packetdrill/fin_wait2_timeout.pkt
index 613f0bec9..93ab08575 100644
--- a/test/packetdrill/fin_wait2_timeout.pkt
+++ b/test/packetdrill/fin_wait2_timeout.pkt
@@ -19,5 +19,5 @@
 +0 > F. 1:1(0) ack 1 <...>
 +0 < . 1:1(0) ack 2 win 257
 
-+1.1 < . 1:1(0) ack 2 win 257
++2 < . 1:1(0) ack 2 win 257
 +0 > R  2:2(0) win 0
diff --git a/test/packetdrill/listen_close_before_handshake_complete.pkt b/test/packetdrill/listen_close_before_handshake_complete.pkt
new file mode 100644
index 000000000..51c3f1a32
--- /dev/null
+++ b/test/packetdrill/listen_close_before_handshake_complete.pkt
@@ -0,0 +1,31 @@
+// Test that closing a listening socket closes any connections in SYN-RCVD
+// state and any packets bound for these connections generate a RESET.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
+// Set backlog to 1 so that we can easily test.
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
+
++0.100 close(3) = 0
++0.1 < P. 1:1(0) ack 1 win 257
+
+// Linux generates a reset with no ack number/bit set. This is contradictory to
+// what is specified in Rule 1 under Reset Generation in
+// https://tools.ietf.org/html/rfc793#section-3.4.
+//   "1. If the connection does not exist (CLOSED) then a reset is sent
+//    in response to any incoming segment except another reset.  In
+//    particular, SYNs addressed to a non-existent connection are rejected
+//    by this means.
+//
+//    If the incoming segment has an ACK field, the reset takes its
+//    sequence number from the ACK field of the segment, otherwise the
+//    reset has sequence number zero and the ACK field is set to the sum
+//    of the sequence number and segment length of the incoming segment.
+//    The connection remains in the CLOSED state."
+
++0.0  > R 1:1(0) win 0
+\ No newline at end of file
diff --git a/test/packetdrill/no_rst_to_rst.pkt b/test/packetdrill/no_rst_to_rst.pkt
new file mode 100644
index 000000000..612747827
--- /dev/null
+++ b/test/packetdrill/no_rst_to_rst.pkt
@@ -0,0 +1,36 @@
+// Test a RST is not generated in response to a RST and a RST is correctly
+// generated when an accepted endpoint is RST due to an incoming RST.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0 > S. 0:0(0) ack 1 <...>
++0 < P. 1:1(0) ack 1 win 257
+
++0.100 accept(3, ..., ...) = 4
+
++0.200 < R 1:1(0) win 0
+
++0.300 read(4,..., 4) = -1 ECONNRESET (Connection Reset by Peer)
+
++0.00 < . 1:1(0) ack 1 win 257
+
+// Linux generates a reset with no ack number/bit set. This is contradictory to
+// what is specified in Rule 1 under Reset Generation in
+// https://tools.ietf.org/html/rfc793#section-3.4.
+//   "1. If the connection does not exist (CLOSED) then a reset is sent
+//    in response to any incoming segment except another reset.  In
+//    particular, SYNs addressed to a non-existent connection are rejected
+//    by this means.
+//
+//    If the incoming segment has an ACK field, the reset takes its
+//    sequence number from the ACK field of the segment, otherwise the
+//    reset has sequence number zero and the ACK field is set to the sum
+//    of the sequence number and segment length of the incoming segment.
+//    The connection remains in the CLOSED state."
+
++0.00 > R 1:1(0) win 0
+\ No newline at end of file
diff --git a/test/packetdrill/packetdrill_test.sh b/test/packetdrill/packetdrill_test.sh
index 0b22dfd5c..922547d65 100755
--- a/test/packetdrill/packetdrill_test.sh
+++ b/test/packetdrill/packetdrill_test.sh
@@ -85,23 +85,26 @@ if [[ ! -x "${INIT_SCRIPT-}" ]]; then
   exit 2
 fi
 
+function new_net_prefix() {
+  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
+}
+
 # Variables specific to the control network and interface start with CTRL_.
 # Variables specific to the test network and interface start with TEST_.
 # Variables specific to the DUT start with DUT_.
 # Variables specific to the test runner start with TEST_RUNNER_.
 declare -r PACKETDRILL="/packetdrill/gtests/net/packetdrill/packetdrill"
 # Use random numbers so that test networks don't collide.
-declare -r CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
-declare -r TEST_NET="test_net-${RANDOM}${RANDOM}"
+declare CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)"
+declare CTRL_NET_PREFIX=$(new_net_prefix)
+declare TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)"
+declare TEST_NET_PREFIX=$(new_net_prefix)
 declare -r tolerance_usecs=100000
 # On both DUT and test runner, testing packets are on the eth2 interface.
 declare -r TEST_DEVICE="eth2"
 # Number of bits in the *_NET_PREFIX variables.
 declare -r NET_MASK="24"
-function new_net_prefix() {
-  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
-  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
-}
 # Last bits of the DUT's IP address.
 declare -r DUT_NET_SUFFIX=".10"
 # Control port.
@@ -137,23 +140,21 @@ function finish {
 trap finish EXIT
 
 # Subnet for control packets between test runner and DUT.
-declare CTRL_NET_PREFIX=$(new_net_prefix)
 while ! docker network create \
   "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do
   sleep 0.1
-  declare CTRL_NET_PREFIX=$(new_net_prefix)
+  CTRL_NET_PREFIX=$(new_net_prefix)
+  CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)"
 done
 
 # Subnet for the packets that are part of the test.
-declare TEST_NET_PREFIX=$(new_net_prefix)
 while ! docker network create \
   "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do
   sleep 0.1
-  declare TEST_NET_PREFIX=$(new_net_prefix)
+  TEST_NET_PREFIX=$(new_net_prefix)
+  TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)"
 done
 
-docker pull "${IMAGE_TAG}"
-
 # Create the DUT container and connect to network.
 DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \
   --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
diff --git a/test/packetdrill/reset_for_ack_when_no_syn_cookies_in_use.pkt b/test/packetdrill/reset_for_ack_when_no_syn_cookies_in_use.pkt
new file mode 100644
index 000000000..a86b90ce6
--- /dev/null
+++ b/test/packetdrill/reset_for_ack_when_no_syn_cookies_in_use.pkt
@@ -0,0 +1,9 @@
+// Test that a listening socket generates a RST when it receives an
+// ACK and syn cookies are not in use.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 bind(3, ..., ...) = 0
+
++0 listen(3, 1) = 0
++0.1 < . 1:1(0) ack 1 win 32792
++0 > R 1:1(0) ack 0 win 0
+\ No newline at end of file
diff --git a/test/packetdrill/sanity_test.pkt b/test/packetdrill/sanity_test.pkt
new file mode 100644
index 000000000..b3b58c366
--- /dev/null
+++ b/test/packetdrill/sanity_test.pkt
@@ -0,0 +1,7 @@
+// Basic sanity test. One system call.
+//
+// All of the plumbing has to be working however, and the packetdrill wire
+// client needs to be able to connect to the wire server and send the script,
+// probe local interfaces, run through the test w/ timings, etc.
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
diff --git a/test/packetdrill/tcp_defer_accept.pkt b/test/packetdrill/tcp_defer_accept.pkt
new file mode 100644
index 000000000..a17f946db
--- /dev/null
+++ b/test/packetdrill/tcp_defer_accept.pkt
@@ -0,0 +1,48 @@
+// Test that a bare ACK does not complete a connection when TCP_DEFER_ACCEPT
+// timeout is not hit but an ACK w/ data does complete and deliver the
+// connection to the accept queue.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_TCP, TCP_DEFER_ACCEPT, [5], 4) = 0
++0.000 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 bind(3, ..., ...) = 0
+
+// Set backlog to 1 so that we can easily test.
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0.0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0.0 > S. 0:0(0) ack 1    <...>
+
+// Send a bare ACK this should not complete the connection as we
+// set the TCP_DEFER_ACCEPT above.
++0.0 < . 1:1(0) ack 1 win 257
+
+// The bare ACK should be dropped and no connection should be delivered
+// to the accept queue.
++0.100 accept(3, ..., ...) = -1 EWOULDBLOCK (operation would block)
+
+// Send another bare ACK and it should still fail we set TCP_DEFER_ACCEPT
+// to 5 seconds above.
++2.5 < . 1:1(0) ack 1 win 257
++0.100 accept(3, ..., ...) = -1 EWOULDBLOCK (operation would block)
+
+// set accept socket back to blocking.
++0.000 fcntl(3, F_SETFL, O_RDWR) = 0
+
+// Now send an ACK w/ data. This should complete the connection
+// and deliver the socket to the accept queue.
++0.1 < . 1:5(4) ack 1 win 257
++0.0 > . 1:1(0) ack 5 <...>
+
+// This should cause connection to transition to connected state.
++0.000 accept(3, ..., ...) = 4
++0.000 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Now read the data and we should get 4 bytes.
++0.000 read(4,..., 4) = 4
++0.000 close(4) = 0
+
++0.0 > F. 1:1(0) ack 5 <...>
++0.0 < F. 5:5(0) ack 2 win 257
++0.01 > . 2:2(0) ack 6 <...>
+\ No newline at end of file
diff --git a/test/packetdrill/tcp_defer_accept_timeout.pkt b/test/packetdrill/tcp_defer_accept_timeout.pkt
new file mode 100644
index 000000000..201fdeb14
--- /dev/null
+++ b/test/packetdrill/tcp_defer_accept_timeout.pkt
@@ -0,0 +1,48 @@
+// Test that a bare ACK is accepted after TCP_DEFER_ACCEPT timeout
+// is hit and a connection is delivered.
+
+0  socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_TCP, TCP_DEFER_ACCEPT, [3], 4) = 0
++0.000 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 bind(3, ..., ...) = 0
+
+// Set backlog to 1 so that we can easily test.
++0 listen(3, 1) = 0
+
+// Establish a connection without timestamps.
++0.0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
++0.0 > S. 0:0(0) ack 1    <...>
+
+// Send a bare ACK this should not complete the connection as we
+// set the TCP_DEFER_ACCEPT above.
++0.0 < . 1:1(0) ack 1 win 257
+
+// The bare ACK should be dropped and no connection should be delivered
+// to the accept queue.
++0.100 accept(3, ..., ...) = -1 EWOULDBLOCK (operation would block)
+
+// Send another bare ACK and it should still fail we set TCP_DEFER_ACCEPT
+// to 5 seconds above.
++2.5 < . 1:1(0) ack 1 win 257
++0.100 accept(3, ..., ...) = -1 EWOULDBLOCK (operation would block)
+
+// set accept socket back to blocking.
++0.000 fcntl(3, F_SETFL, O_RDWR) = 0
+
+// We should see one more retransmit of the SYN-ACK as a last ditch
+// attempt when TCP_DEFER_ACCEPT timeout is hit to trigger another
+// ACK or a packet with data.
++.35~+2.35 > S. 0:0(0) ack 1 <...>
+
+// Now send another bare ACK after TCP_DEFER_ACCEPT time has been passed.
++0.0 < . 1:1(0) ack 1 win 257
+
+// The ACK above should cause connection to transition to connected state.
++0.000 accept(3, ..., ...) = 4
++0.000 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
++0.000 close(4) = 0
+
++0.0 > F. 1:1(0) ack 1 <...>
++0.0 < F. 1:1(0) ack 2 win 257
++0.01 > . 2:2(0) ack 2 <...>
diff --git a/test/packetimpact/README.md b/test/packetimpact/README.md
new file mode 100644
index 000000000..f46c67a0c
--- /dev/null
+++ b/test/packetimpact/README.md
@@ -0,0 +1,702 @@
+# Packetimpact
+
+## What is packetimpact?
+
+Packetimpact is a tool for platform-independent network testing. It is heavily
+inspired by [packetdrill](https://github.com/google/packetdrill). It creates two
+docker containers connected by a network. One is for the test bench, which
+operates the test. The other is for the device-under-test (DUT), which is the
+software being tested. The test bench communicates over the network with the DUT
+to check correctness of the network.
+
+### Goals
+
+Packetimpact aims to provide:
+
+*   A **multi-platform** solution that can test both Linux and gVisor.
+*   **Conciseness** on par with packetdrill scripts.
+*   **Control-flow** like for loops, conditionals, and variables.
+*   **Flexibilty** to specify every byte in a packet or use multiple sockets.
+
+## How to run packetimpact tests?
+
+Build the test container image by running the following at the root of the
+repository:
+
+```bash
+$ make load-packetimpact
+```
+
+Run a test, e.g. `fin_wait2_timeout`, against Linux:
+
+```bash
+$ bazel test //test/packetimpact/tests:fin_wait2_timeout_linux_test
+```
+
+Run the same test, but against gVisor:
+
+```bash
+$ bazel test //test/packetimpact/tests:fin_wait2_timeout_netstack_test
+```
+
+## When to use packetimpact?
+
+There are a few ways to write networking tests for gVisor currently:
+
+*   [Go unit tests](https://github.com/google/gvisor/tree/master/pkg/tcpip)
+*   [syscall tests](https://github.com/google/gvisor/tree/master/test/syscalls/linux)
+*   [packetdrill tests](https://github.com/google/gvisor/tree/master/test/packetdrill)
+*   packetimpact tests
+
+The right choice depends on the needs of the test.
+
+Feature        | Go unit test | syscall test | packetdrill | packetimpact
+-------------- | ------------ | ------------ | ----------- | ------------
+Multi-platform | no           | **YES**      | **YES**     | **YES**
+Concise        | no           | somewhat     | somewhat    | **VERY**
+Control-flow   | **YES**      | **YES**      | no          | **YES**
+Flexible       | **VERY**     | no           | somewhat    | **VERY**
+
+### Go unit tests
+
+If the test depends on the internals of gVisor and doesn't need to run on Linux
+or other platforms for comparison purposes, a Go unit test can be appropriate.
+They can observe internals of gVisor networking. The downside is that they are
+**not concise** and **not multi-platform**. If you require insight on gVisor
+internals, this is the right choice.
+
+### Syscall tests
+
+Syscall tests are **multi-platform** but cannot examine the internals of gVisor
+networking. They are **concise**. They can use **control-flow** structures like
+conditionals, for loops, and variables. However, they are limited to only what
+the POSIX interface provides so they are **not flexible**. For example, you
+would have difficulty writing a syscall test that intentionally sends a bad IP
+checksum. Or if you did write that test with raw sockets, it would be very
+**verbose** to write a test that intentionally send wrong checksums, wrong
+protocols, wrong sequence numbers, etc.
+
+### Packetdrill tests
+
+Packetdrill tests are **multi-platform** and can run against both Linux and
+gVisor. They are **concise** and use a special packetdrill scripting language.
+They are **more flexible** than a syscall test in that they can send packets
+that a syscall test would have difficulty sending, like a packet with a
+calcuated ACK number. But they are also somewhat limimted in flexibiilty in that
+they can't do tests with multiple sockets. They have **no control-flow** ability
+like variables or conditionals. For example, it isn't possible to send a packet
+that depends on the window size of a previous packet because the packetdrill
+language can't express that. Nor could you branch based on whether or not the
+other side supports window scaling, for example.
+
+### Packetimpact tests
+
+Packetimpact tests are similar to Packetdrill tests except that they are written
+in Go instead of the packetdrill scripting language. That gives them all the
+**control-flow** abilities of Go (loops, functions, variables, etc). They are
+**multi-platform** in the same way as packetdrill tests but even more
+**flexible** because Go is more expressive than the scripting language of
+packetdrill. However, Go is **not as concise** as the packetdrill language. Many
+design decisions below are made to mitigate that.
+
+## How it works
+
+```
+     Testbench                           Device-Under-Test (DUT)
+    +-------------------+               +------------------------+
+    |                   |   TEST NET    |                        |
+    | rawsockets.go <-->| <===========> | <---+                  |
+    |           ^       |               |     |                  |
+    |           |       |               |     |                  |
+    |           v       |               |     |                  |
+    |     unittest      |               |     |                  |
+    |           ^       |               |     |                  |
+    |           |       |               |     |                  |
+    |           v       |               |     v                  |
+    |         dut.go <========gRPC========> posix server         |
+    |                   |  CONTROL NET  |                        |
+    +-------------------+               +------------------------+
+```
+
+Two docker containers are created by a "runner" script, one for the testbench
+and the other for the device under test (DUT). The script connects the two
+containers with a control network and test network. It also does some other
+tasks like waiting until the DUT is ready before starting the test and disabling
+Linux networking that would interfere with the test bench.
+
+### DUT
+
+The DUT container runs a program called the "posix_server". The posix_server is
+written in c++ for maximum portability. It is compiled on the host. The script
+that starts the containers copies it into the DUT's container and runs it. It's
+job is to receive directions from the test bench on what actions to take. For
+this, the posix_server does three steps in a loop:
+
+1.  Listen for a request from the test bench.
+2.  Execute a command.
+3.  Send the response back to the test bench.
+
+The requests and responses are
+[protobufs](https://developers.google.com/protocol-buffers) and the
+communication is done with [gRPC](https://grpc.io/). The commands run are
+[POSIX socket commands](https://en.wikipedia.org/wiki/Berkeley_sockets#Socket_API_functions),
+with the inputs and outputs converted into protobuf requests and responses. All
+communication is on the control network, so that the test network is unaffected
+by extra packets.
+
+For example, this is the request and response pair to call
+[`socket()`](http://man7.org/linux/man-pages/man2/socket.2.html):
+
+```protocol-buffer
+message SocketRequest {
+  int32 domain = 1;
+  int32 type = 2;
+  int32 protocol = 3;
+}
+
+message SocketResponse {
+  int32 fd = 1;
+  int32 errno_ = 2;
+}
+```
+
+##### Alternatives considered
+
+*   We could have use JSON for communication instead. It would have been a
+    lighter-touch than protobuf but protobuf handles all the data type and has
+    strict typing to prevent a class of errors. The test bench could be written
+    in other languages, too.
+*   Instead of mimicking the POSIX interfaces, arguments could have had a more
+    natural form, like the `bind()` getting a string IP address instead of bytes
+    in a `sockaddr_t`. However, conforming to the existing structures keeps more
+    of the complexity in Go and keeps the posix_server simpler and thus more
+    likely to compile everywhere.
+
+### Test Bench
+
+The test bench does most of the work in a test. It is a Go program that compiles
+on the host and is copied by the script into test bench's container. It is a
+regular [go unit test](https://golang.org/pkg/testing/) that imports the test
+bench framework. The test bench framwork is based on three basic utilities:
+
+*   Commanding the DUT to run POSIX commands and return responses.
+*   Sending raw packets to the DUT on the test network.
+*   Listening for raw packets from the DUT on the test network.
+
+#### DUT commands
+
+To keep the interface to the DUT consistent and easy-to-use, each POSIX command
+supported by the posix_server is wrapped in functions with signatures similar to
+the ones in the [Go unix package](https://godoc.org/golang.org/x/sys/unix). This
+way all the details of endianess and (un)marshalling of go structs such as
+[unix.Timeval](https://godoc.org/golang.org/x/sys/unix#Timeval) is handled in
+one place. This also makes it straight-forward to convert tests that use `unix.`
+or `syscall.` calls to `dut.` calls.
+
+For example, creating a connection to the DUT and commanding it to make a socket
+looks like this:
+
+```go
+dut := testbench.NewDut(t)
+fd, err := dut.SocketWithErrno(unix.AF_INET, unix.SOCK_STREAM, unix.IPPROTO_IP)
+if fd < 0 {
+  t.Fatalf(...)
+}
+```
+
+Because the usual case is to fail the test when the DUT fails to create a
+socket, there is a concise version of each of the `...WithErrno` functions that
+does that:
+
+```go
+dut := testbench.NewDut(t)
+fd := dut.Socket(unix.AF_INET, unix.SOCK_STREAM, unix.IPPROTO_IP)
+```
+
+The DUT and other structs in the code store a `*testing.T` so that they can
+provide versions of functions that call `t.Fatalf(...)`. This helps keep tests
+concise.
+
+##### Alternatives considered
+
+*   Instead of mimicking the `unix.` go interface, we could have invented a more
+    natural one, like using `float64` instead of `Timeval`. However, using the
+    same function signatures that `unix.` has makes it easier to convert code to
+    `dut.`. Also, using an existing interface ensures that we don't invent an
+    interface that isn't extensible. For example, if we invented a function for
+    `bind()` that didn't support IPv6 and later we had to add a second `bind6()`
+    function.
+
+#### Sending/Receiving Raw Packets
+
+The framework wraps POSIX sockets for sending and receiving raw frames. Both
+send and receive are synchronous commands.
+[SO_RCVTIMEO](http://man7.org/linux/man-pages/man7/socket.7.html) is used to set
+a timeout on the receive commands. For ease of use, these are wrapped in an
+`Injector` and a `Sniffer`. They have functions:
+
+```go
+func (s *Sniffer) Recv(timeout time.Duration) []byte {...}
+func (i *Injector) Send(b []byte) {...}
+```
+
+##### Alternatives considered
+
+*   [gopacket](https://github.com/google/gopacket) pcap has raw socket support
+    but requires cgo. cgo is not guaranteed to be portable from the host to the
+    container and in practice, the container doesn't recognize binaries built on
+    the host if they use cgo.
+*   Both gVisor and gopacket have the ability to read and write pcap files
+    without cgo but that is insufficient here because we can't just replay pcap
+    files, we need a more dynamic solution.
+*   The sniffer and injector can't share a socket because they need to be bound
+    differently.
+*   Sniffing could have been done asynchronously with channels, obviating the
+    need for `SO_RCVTIMEO`. But that would introduce asynchronous complication.
+    `SO_RCVTIMEO` is well supported on the test bench.
+
+#### `Layer` struct
+
+A large part of packetimpact tests is creating packets to send and comparing
+received packets against expectations. To keep tests concise, it is useful to be
+able to specify just the important parts of packets that need to be set. For
+example, sending a packet with default values except for TCP Flags. And for
+packets received, it's useful to be able to compare just the necessary parts of
+received packets and ignore the rest.
+
+To aid in both of those, Go structs with optional fields are created for each
+encapsulation type, such as IPv4, TCP, and Ethernet. This is inspired by
+[scapy](https://scapy.readthedocs.io/en/latest/). For example, here is the
+struct for Ethernet:
+
+```go
+type Ether struct {
+  LayerBase
+  SrcAddr *tcpip.LinkAddress
+  DstAddr *tcpip.LinkAddress
+  Type    *tcpip.NetworkProtocolNumber
+}
+```
+
+Each struct has the same fields as those in the
+[gVisor headers](https://github.com/google/gvisor/tree/master/pkg/tcpip/header)
+but with a pointer for each field that may be `nil`.
+
+##### Alternatives considered
+
+*   Just use []byte like gVisor headers do. The drawback is that it makes the
+    tests more verbose.
+    *   For example, there would be no way to call `Send(myBytes)` concisely and
+        indicate if the checksum should be calculated automatically versus
+        overridden. The only way would be to add lines to the test to calculate
+        it before each Send, which is wordy. Or make multiple versions of Send:
+        one that checksums IP, one that doesn't, one that checksums TCP, one
+        that does both, etc. That would be many combinations.
+    *   Filtering inputs would become verbose. Either:
+    *   large conditionals that need to be repeated many places:
+        `h[FlagOffset] == SYN && h[LengthOffset:LengthOffset+2] == ...` or
+    *   Many functions, one per field, like: `filterByFlag(myBytes, SYN)`,
+        `filterByLength(myBytes, 20)`, `filterByNextProto(myBytes, 0x8000)`,
+        etc.
+    *   Using pointers allows us to combine `Layer`s with reflection. So the
+        default `Layers` can be overridden by a `Layers` with just the TCP
+        conection's src/dst which can be overridden by one with just a test
+        specific TCP window size.
+    *   It's a proven way to separate the details of a packet from the byte
+        format as shown by scapy's success.
+*   Use packetgo. It's more general than parsing packets with gVisor. However:
+    *   packetgo doesn't have optional fields so many of the above problems
+        still apply.
+    *   It would be yet another dependency.
+    *   It's not as well known to engineers that are already writing gVisor
+        code.
+    *   It might be a good candidate for replacing the parsing of packets into
+        `Layer`s if all that parsing turns out to be more work than parsing by
+        packetgo and converting *that* to `Layer`. packetgo has easier to use
+        getters for the layers. This could be done later in a way that doesn't
+        break tests.
+
+#### `Layer` methods
+
+The `Layer` structs provide a way to partially specify an encapsulation. They
+also need methods for using those partially specified encapsulation, for example
+to marshal them to bytes or compare them. For those, each encapsulation
+implements the `Layer` interface:
+
+```go
+// Layer is the interface that all encapsulations must implement.
+//
+// A Layer is an encapsulation in a packet, such as TCP, IPv4, IPv6, etc. A
+// Layer contains all the fields of the encapsulation. Each field is a pointer
+// and may be nil.
+type Layer interface {
+    // toBytes converts the Layer into bytes. In places where the Layer's field
+    // isn't nil, the value that is pointed to is used. When the field is nil, a
+    // reasonable default for the Layer is used. For example, "64" for IPv4 TTL
+    // and a calculated checksum for TCP or IP. Some layers require information
+    // from the previous or next layers in order to compute a default, such as
+    // TCP's checksum or Ethernet's type, so each Layer has a doubly-linked list
+    // to the layer's neighbors.
+    toBytes() ([]byte, error)
+
+    // match checks if the current Layer matches the provided Layer. If either
+    // Layer has a nil in a given field, that field is considered matching.
+    // Otherwise, the values pointed to by the fields must match.
+    match(Layer) bool
+
+    // length in bytes of the current encapsulation
+    length() int
+
+    // next gets a pointer to the encapsulated Layer.
+    next() Layer
+
+    // prev gets a pointer to the Layer encapsulating this one.
+    prev() Layer
+
+    // setNext sets the pointer to the encapsulated Layer.
+    setNext(Layer)
+
+    // setPrev sets the pointer to the Layer encapsulating this one.
+    setPrev(Layer)
+}
+```
+
+The `next` and `prev` make up a link listed so that each layer can get at the
+information in the layer around it. This is necessary for some protocols, like
+TCP that needs the layer before and payload after to compute the checksum. Any
+sequence of `Layer` structs is valid so long as the parser and `toBytes`
+functions can map from type to protool number and vice-versa. When the mapping
+fails, an error is emitted explaining what functionality is missing. The
+solution is either to fix the ordering or implement the missing protocol.
+
+For each `Layer` there is also a parsing function. For example, this one is for
+Ethernet:
+
+```
+func ParseEther(b []byte) (Layers, error)
+```
+
+The parsing function converts bytes received on the wire into a `Layer`
+(actually `Layers`, see below) which has no `nil`s in it. By using
+`match(Layer)` to compare against another `Layer` that *does* have `nil`s in it,
+the received bytes can be partially compared. The `nil`s behave as
+"don't-cares".
+
+##### Alternatives considered
+
+*   Matching against `[]byte` instead of converting to `Layer` first.
+    *   The downside is that it precludes the use of a `cmp.Equal` one-liner to
+        do comparisons.
+    *   It creates confusion in the code to deal with both representations at
+        different times. For example, is the checksum calculated on `[]byte` or
+        `Layer` when sending? What about when checking received packets?
+
+#### `Layers`
+
+```
+type Layers []Layer
+
+func (ls *Layers) match(other Layers) bool {...}
+func (ls *Layers) toBytes() ([]byte, error) {...}
+```
+
+`Layers` is an array of `Layer`. It represents a stack of encapsulations, such
+as `Layers{Ether{},IPv4{},TCP{},Payload{}}`. It also has `toBytes()` and
+`match(Layers)`, like `Layer`. The parse functions above actually return
+`Layers` and not `Layer` because they know about the headers below and
+sequentially call each parser on the remaining, encapsulated bytes.
+
+All this leads to the ability to write concise packet processing. For example:
+
+```go
+etherType := 0x8000
+flags = uint8(header.TCPFlagSyn|header.TCPFlagAck)
+toMatch := Layers{Ether{Type: &etherType}, IPv4{}, TCP{Flags: &flags}}
+for {
+  recvBytes := sniffer.Recv(time.Second)
+  if recvBytes == nil {
+    println("Got no packet for 1 second")
+  }
+  gotPacket, err := ParseEther(recvBytes)
+  if err == nil && toMatch.match(gotPacket) {
+    println("Got a TCP/IPv4/Eth packet with SYNACK")
+  }
+}
+```
+
+##### Alternatives considered
+
+*   Don't use previous and next pointers.
+    *   Each layer may need to be able to interrogate the layers around it, like
+        for computing the next protocol number or total length. So *some*
+        mechanism is needed for a `Layer` to see neighboring layers.
+    *   We could pass the entire array `Layers` to the `toBytes()` function.
+        Passing an array to a method that includes in the array the function
+        receiver itself seems wrong.
+
+#### `layerState`
+
+`Layers` represents the different headers of a packet but a connection includes
+more state. For example, a TCP connection needs to keep track of the next
+expected sequence number and also the next sequence number to send. This is
+stored in a `layerState` struct. This is the `layerState` for TCP:
+
+```go
+// tcpState maintains state about a TCP connection.
+type tcpState struct {
+    out, in                   TCP
+    localSeqNum, remoteSeqNum *seqnum.Value
+    synAck                    *TCP
+    portPickerFD              int
+    finSent                   bool
+}
+```
+
+The next sequence numbers for each side of the connection are stored. `out` and
+`in` have defaults for the TCP header, such as the expected source and
+destination ports for outgoing packets and incoming packets.
+
+##### `layerState` interface
+
+```go
+// layerState stores the state of a layer of a connection.
+type layerState interface {
+    // outgoing returns an outgoing layer to be sent in a frame.
+    outgoing() Layer
+
+    // incoming creates an expected Layer for comparing against a received Layer.
+    // Because the expectation can depend on values in the received Layer, it is
+    // an input to incoming. For example, the ACK number needs to be checked in a
+    // TCP packet but only if the ACK flag is set in the received packet.
+    incoming(received Layer) Layer
+
+    // sent updates the layerState based on the Layer that was sent. The input is
+    // a Layer with all prev and next pointers populated so that the entire frame
+    // as it was sent is available.
+    sent(sent Layer) error
+
+    // received updates the layerState based on a Layer that is receieved. The
+    // input is a Layer with all prev and next pointers populated so that the
+    // entire frame as it was receieved is available.
+    received(received Layer) error
+
+    // close frees associated resources held by the LayerState.
+    close() error
+}
+```
+
+`outgoing` generates the default Layer for an outgoing packet. For TCP, this
+would be a `TCP` with the source and destination ports populated. Because they
+are static, they are stored inside the `out` member of `tcpState`. However, the
+sequence numbers change frequently so the outgoing sequence number is stored in
+the `localSeqNum` and put into the output of outgoing for each call.
+
+`incoming` does the same functions for packets that arrive but instead of
+generating a packet to send, it generates an expect packet for filtering packets
+that arrive. For example, if a `TCP` header arrives with the wrong ports, it can
+be ignored as belonging to a different connection. `incoming` needs the received
+header itself as an input because the filter may depend on the input. For
+example, the expected sequence number depends on the flags in the TCP header.
+
+`sent` and `received` are run for each header that is actually sent or received
+and used to update the internal state. `incoming` and `outgoing` should *not* be
+used for these purpose. For example, `incoming` is called on every packet that
+arrives but only packets that match ought to actually update the state.
+`outgoing` is called to created outgoing packets and those packets are always
+sent, so unlike `incoming`/`received`, there is one `outgoing` call for each
+`sent` call.
+
+`close` cleans up after the layerState. For example, TCP and UDP need to keep a
+port reserved and then release it.
+
+#### Connections
+
+Using `layerState` above, we can create connections.
+
+```go
+// Connection holds a collection of layer states for maintaining a connection
+// along with sockets for sniffer and injecting packets.
+type Connection struct {
+    layerStates []layerState
+    injector    Injector
+    sniffer     Sniffer
+    t           *testing.T
+}
+```
+
+The connection stores an array of `layerState` in the order that the headers
+should be present in the frame to send. For example, Ether then IPv4 then TCP.
+The injector and sniffer are for writing and reading frames. A `*testing.T` is
+stored so that internal errors can be reported directly without code in the unit
+test.
+
+The `Connection` has some useful functions:
+
+```go
+// Close frees associated resources held by the Connection.
+func (conn *Connection) Close() {...}
+// CreateFrame builds a frame for the connection with layer overriding defaults
+// of the innermost layer and additionalLayers added after it.
+func (conn *Connection) CreateFrame(layer Layer, additionalLayers ...Layer) Layers {...}
+// SendFrame sends a frame on the wire and updates the state of all layers.
+func (conn *Connection) SendFrame(frame Layers) {...}
+// Send a packet with reasonable defaults. Potentially override the final layer
+// in the connection with the provided layer and add additionLayers.
+func (conn *Connection) Send(layer Layer, additionalLayers ...Layer) {...}
+// Expect a frame with the final layerStates layer matching the provided Layer
+// within the timeout specified. If it doesn't arrive in time, it returns nil.
+func (conn *Connection) Expect(layer Layer, timeout time.Duration) (Layer, error) {...}
+// ExpectFrame expects a frame that matches the provided Layers within the
+// timeout specified. If it doesn't arrive in time, it returns nil.
+func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layers, error) {...}
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *Connection) Drain() {...}
+```
+
+`CreateFrame` uses the `[]layerState` to create a frame to send. The first
+argument is for overriding defaults in the last header of the frame, because
+this is the most common need. For a TCPIPv4 connection, this would be the TCP
+header. Optional additionalLayers can be specified to add to the frame being
+created, such as a `Payload` for `TCP`.
+
+`SendFrame` sends the frame to the DUT. It is combined with `CreateFrame` to
+make `Send`. For unittests with basic sending needs, `Send` can be used. If more
+control is needed over the frame, it can be made with `CreateFrame`, modified in
+the unit test, and then sent with `SendFrame`.
+
+On the receiving side, there is `Expect` and `ExpectFrame`. Like with the
+sending side, there are two forms of each function, one for just the last header
+and one for the whole frame. The expect functions use the `[]layerState` to
+create a template for the expected incoming frame. That frame is then overridden
+by the values in the first argument. Finally, a loop starts sniffing packets on
+the wire for frames. If a matching frame is found before the timeout, it is
+returned without error. If not, nil is returned and the error contains text of
+all the received frames that didn't match. Exactly one of the outputs will be
+non-nil, even if no frames are received at all.
+
+`Drain` sniffs and discards all the frames that have yet to be received. A
+common way to write a test is:
+
+```go
+conn.Drain() // Discard all outstanding frames.
+conn.Send(...) // Send a frame with overrides.
+// Now expect a frame with a certain header and fail if it doesn't arrive.
+if _, err := conn.Expect(...); err != nil { t.Fatal(...) }
+```
+
+Or for a test where we want to check that no frame arrives:
+
+```go
+if gotOne, _ := conn.Expect(...); gotOne != nil { t.Fatal(...) }
+```
+
+#### Specializing `Connection`
+
+Because there are some common combinations of `layerState` into `Connection`,
+they are defined:
+
+```go
+// TCPIPv4 maintains the state for all the layers in a TCP/IPv4 connection.
+type TCPIPv4 Connection
+// UDPIPv4 maintains the state for all the layers in a UDP/IPv4 connection.
+type UDPIPv4 Connection
+```
+
+Each has a `NewXxx` function to create a new connection with reasonable
+defaults. They also have functions that call the underlying `Connection`
+functions but with specialization and tighter type-checking. For example:
+
+```go
+func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
+    (*Connection)(conn).Send(&tcp, additionalLayers...)
+}
+func (conn *TCPIPv4) Drain() {
+    conn.sniffer.Drain()
+}
+```
+
+They may also have some accessors to get or set the internal state of the
+connection:
+
+```go
+func (conn *TCPIPv4) state() *tcpState {
+    state, ok := conn.layerStates[len(conn.layerStates)-1].(*tcpState)
+    if !ok {
+        conn.t.Fatalf("expected final state of %v to be tcpState", conn.layerStates)
+    }
+    return state
+}
+func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value {
+    return conn.state().remoteSeqNum
+}
+func (conn *TCPIPv4) LocalSeqNum() *seqnum.Value {
+    return conn.state().localSeqNum
+}
+```
+
+Unittests will in practice use these functions and not the functions on
+`Connection`. For example, `NewTCPIPv4()` and then call `Send` on that rather
+than cast is to a `Connection` and call `Send` on that cast result.
+
+##### Alternatives considered
+
+*   Instead of storing `outgoing` and `incoming`, store values.
+    *   There would be many more things to store instead, like `localMac`,
+        `remoteMac`, `localIP`, `remoteIP`, `localPort`, and `remotePort`.
+    *   Construction of a packet would be many lines to copy each of these
+        values into a `[]byte`. And there would be slight variations needed for
+        each encapsulation stack, like TCPIPv6 and ARP.
+    *   Filtering incoming packets would be a long sequence:
+    *   Compare the MACs, then
+    *   Parse the next header, then
+    *   Compare the IPs, then
+    *   Parse the next header, then
+    *   Compare the TCP ports. Instead it's all just one call to
+        `cmp.Equal(...)`, for all sequences.
+    *   A TCPIPv6 connection could share most of the code. Only the type of the
+        IP addresses are different. The types of `outgoing` and `incoming` would
+        be remain `Layers`.
+    *   An ARP connection could share all the Ethernet parts. The IP `Layer`
+        could be factored out of `outgoing`. After that, the IPv4 and IPv6
+        connections could implement one interface and a single TCP struct could
+        have either network protocol through composition.
+
+## Putting it all together
+
+Here's what te start of a packetimpact unit test looks like. This test creates a
+TCP connection with the DUT. There are added comments for explanation in this
+document but a real test might not include them in order to stay even more
+concise.
+
+```go
+func TestMyTcpTest(t *testing.T) {
+  // Prepare a DUT for communication.
+  dut := testbench.NewDUT(t)
+
+  // This does:
+  //   dut.Socket()
+  //   dut.Bind()
+  //   dut.Getsockname() to learn the new port number
+  //   dut.Listen()
+  listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+  defer dut.Close(listenFD) // Tell the DUT to close the socket at the end of the test.
+
+  // Monitor a new TCP connection with sniffer, injector, sequence number tracking,
+  // and reasonable outgoing and incoming packet field default IPs, MACs, and port numbers.
+  conn := testbench.NewTCPIPv4(t, dut, remotePort)
+
+  // Perform a 3-way handshake: send SYN, expect SYNACK, send ACK.
+  conn.Handshake()
+
+  // Tell the DUT to accept the new connection.
+  acceptFD := dut.Accept(acceptFd)
+}
+```
+
+## Other notes
+
+*   The time between receiving a SYN-ACK and replying with an ACK in `Handshake`
+    is about 3ms. This is much slower than the native unix response, which is
+    about 0.3ms. Packetdrill gets closer to 0.3ms. For tests where timing is
+    crucial, packetdrill is faster and more precise.
diff --git a/test/packetimpact/dut/BUILD b/test/packetimpact/dut/BUILD
new file mode 100644
index 000000000..3ce63c2c6
--- /dev/null
+++ b/test/packetimpact/dut/BUILD
@@ -0,0 +1,18 @@
+load("//tools:defs.bzl", "cc_binary", "grpcpp")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+cc_binary(
+    name = "posix_server",
+    srcs = ["posix_server.cc"],
+    linkstatic = 1,
+    static = True,  # This is needed for running in a docker container.
+    deps = [
+        grpcpp,
+        "//test/packetimpact/proto:posix_server_cc_grpc_proto",
+        "//test/packetimpact/proto:posix_server_cc_proto",
+    ],
+)
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
new file mode 100644
index 000000000..a1a5c3612
--- /dev/null
+++ b/test/packetimpact/dut/posix_server.cc
@@ -0,0 +1,365 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at //
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <unordered_map>
+
+#include "include/grpcpp/security/server_credentials.h"
+#include "include/grpcpp/server_builder.h"
+#include "test/packetimpact/proto/posix_server.grpc.pb.h"
+#include "test/packetimpact/proto/posix_server.pb.h"
+
+// Converts a sockaddr_storage to a Sockaddr message.
+::grpc::Status sockaddr_to_proto(const sockaddr_storage &addr,
+                                 socklen_t addrlen,
+                                 posix_server::Sockaddr *sockaddr_proto) {
+  switch (addr.ss_family) {
+    case AF_INET: {
+      auto addr_in = reinterpret_cast<const sockaddr_in *>(&addr);
+      auto response_in = sockaddr_proto->mutable_in();
+      response_in->set_family(addr_in->sin_family);
+      response_in->set_port(ntohs(addr_in->sin_port));
+      response_in->mutable_addr()->assign(
+          reinterpret_cast<const char *>(&addr_in->sin_addr.s_addr), 4);
+      return ::grpc::Status::OK;
+    }
+    case AF_INET6: {
+      auto addr_in6 = reinterpret_cast<const sockaddr_in6 *>(&addr);
+      auto response_in6 = sockaddr_proto->mutable_in6();
+      response_in6->set_family(addr_in6->sin6_family);
+      response_in6->set_port(ntohs(addr_in6->sin6_port));
+      response_in6->set_flowinfo(ntohl(addr_in6->sin6_flowinfo));
+      response_in6->mutable_addr()->assign(
+          reinterpret_cast<const char *>(&addr_in6->sin6_addr.s6_addr), 16);
+      response_in6->set_scope_id(ntohl(addr_in6->sin6_scope_id));
+      return ::grpc::Status::OK;
+    }
+  }
+  return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Unknown Sockaddr");
+}
+
+::grpc::Status proto_to_sockaddr(const posix_server::Sockaddr &sockaddr_proto,
+                                 sockaddr_storage *addr, socklen_t *addr_len) {
+  switch (sockaddr_proto.sockaddr_case()) {
+    case posix_server::Sockaddr::SockaddrCase::kIn: {
+      auto proto_in = sockaddr_proto.in();
+      if (proto_in.addr().size() != 4) {
+        return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                              "IPv4 address must be 4 bytes");
+      }
+      auto addr_in = reinterpret_cast<sockaddr_in *>(addr);
+      addr_in->sin_family = proto_in.family();
+      addr_in->sin_port = htons(proto_in.port());
+      proto_in.addr().copy(reinterpret_cast<char *>(&addr_in->sin_addr.s_addr),
+                           4);
+      *addr_len = sizeof(*addr_in);
+      break;
+    }
+    case posix_server::Sockaddr::SockaddrCase::kIn6: {
+      auto proto_in6 = sockaddr_proto.in6();
+      if (proto_in6.addr().size() != 16) {
+        return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                              "IPv6 address must be 16 bytes");
+      }
+      auto addr_in6 = reinterpret_cast<sockaddr_in6 *>(addr);
+      addr_in6->sin6_family = proto_in6.family();
+      addr_in6->sin6_port = htons(proto_in6.port());
+      addr_in6->sin6_flowinfo = htonl(proto_in6.flowinfo());
+      proto_in6.addr().copy(
+          reinterpret_cast<char *>(&addr_in6->sin6_addr.s6_addr), 16);
+      addr_in6->sin6_scope_id = htonl(proto_in6.scope_id());
+      *addr_len = sizeof(*addr_in6);
+      break;
+    }
+    case posix_server::Sockaddr::SockaddrCase::SOCKADDR_NOT_SET:
+    default:
+      return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                            "Unknown Sockaddr");
+  }
+  return ::grpc::Status::OK;
+}
+
+class PosixImpl final : public posix_server::Posix::Service {
+  ::grpc::Status Accept(grpc_impl::ServerContext *context,
+                        const ::posix_server::AcceptRequest *request,
+                        ::posix_server::AcceptResponse *response) override {
+    sockaddr_storage addr;
+    socklen_t addrlen = sizeof(addr);
+    response->set_fd(accept(request->sockfd(),
+                            reinterpret_cast<sockaddr *>(&addr), &addrlen));
+    response->set_errno_(errno);
+    return sockaddr_to_proto(addr, addrlen, response->mutable_addr());
+  }
+
+  ::grpc::Status Bind(grpc_impl::ServerContext *context,
+                      const ::posix_server::BindRequest *request,
+                      ::posix_server::BindResponse *response) override {
+    if (!request->has_addr()) {
+      return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                            "Missing address");
+    }
+
+    sockaddr_storage addr;
+    socklen_t addr_len;
+    auto err = proto_to_sockaddr(request->addr(), &addr, &addr_len);
+    if (!err.ok()) {
+      return err;
+    }
+
+    response->set_ret(
+        bind(request->sockfd(), reinterpret_cast<sockaddr *>(&addr), addr_len));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Close(grpc_impl::ServerContext *context,
+                       const ::posix_server::CloseRequest *request,
+                       ::posix_server::CloseResponse *response) override {
+    response->set_ret(close(request->fd()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Connect(grpc_impl::ServerContext *context,
+                         const ::posix_server::ConnectRequest *request,
+                         ::posix_server::ConnectResponse *response) override {
+    if (!request->has_addr()) {
+      return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                            "Missing address");
+    }
+    sockaddr_storage addr;
+    socklen_t addr_len;
+    auto err = proto_to_sockaddr(request->addr(), &addr, &addr_len);
+    if (!err.ok()) {
+      return err;
+    }
+
+    response->set_ret(connect(request->sockfd(),
+                              reinterpret_cast<sockaddr *>(&addr), addr_len));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Fcntl(grpc_impl::ServerContext *context,
+                       const ::posix_server::FcntlRequest *request,
+                       ::posix_server::FcntlResponse *response) override {
+    response->set_ret(::fcntl(request->fd(), request->cmd(), request->arg()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status GetSockName(
+      grpc_impl::ServerContext *context,
+      const ::posix_server::GetSockNameRequest *request,
+      ::posix_server::GetSockNameResponse *response) override {
+    sockaddr_storage addr;
+    socklen_t addrlen = sizeof(addr);
+    response->set_ret(getsockname(
+        request->sockfd(), reinterpret_cast<sockaddr *>(&addr), &addrlen));
+    response->set_errno_(errno);
+    return sockaddr_to_proto(addr, addrlen, response->mutable_addr());
+  }
+
+  ::grpc::Status GetSockOpt(
+      grpc_impl::ServerContext *context,
+      const ::posix_server::GetSockOptRequest *request,
+      ::posix_server::GetSockOptResponse *response) override {
+    switch (request->type()) {
+      case ::posix_server::GetSockOptRequest::BYTES: {
+        socklen_t optlen = request->optlen();
+        std::vector<char> buf(optlen);
+        response->set_ret(::getsockopt(request->sockfd(), request->level(),
+                                       request->optname(), buf.data(),
+                                       &optlen));
+        if (optlen >= 0) {
+          response->mutable_optval()->set_bytesval(buf.data(), optlen);
+        }
+        break;
+      }
+      case ::posix_server::GetSockOptRequest::INT: {
+        int intval = 0;
+        socklen_t optlen = sizeof(intval);
+        response->set_ret(::getsockopt(request->sockfd(), request->level(),
+                                       request->optname(), &intval, &optlen));
+        response->mutable_optval()->set_intval(intval);
+        break;
+      }
+      case ::posix_server::GetSockOptRequest::TIME: {
+        timeval tv;
+        socklen_t optlen = sizeof(tv);
+        response->set_ret(::getsockopt(request->sockfd(), request->level(),
+                                       request->optname(), &tv, &optlen));
+        response->mutable_optval()->mutable_timeval()->set_seconds(tv.tv_sec);
+        response->mutable_optval()->mutable_timeval()->set_microseconds(
+            tv.tv_usec);
+        break;
+      }
+      default:
+        return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                              "Unknown SockOpt Type");
+    }
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Listen(grpc_impl::ServerContext *context,
+                        const ::posix_server::ListenRequest *request,
+                        ::posix_server::ListenResponse *response) override {
+    response->set_ret(listen(request->sockfd(), request->backlog()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Send(::grpc::ServerContext *context,
+                      const ::posix_server::SendRequest *request,
+                      ::posix_server::SendResponse *response) override {
+    response->set_ret(::send(request->sockfd(), request->buf().data(),
+                             request->buf().size(), request->flags()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status SendTo(::grpc::ServerContext *context,
+                        const ::posix_server::SendToRequest *request,
+                        ::posix_server::SendToResponse *response) override {
+    if (!request->has_dest_addr()) {
+      return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                            "Missing address");
+    }
+    sockaddr_storage addr;
+    socklen_t addr_len;
+    auto err = proto_to_sockaddr(request->dest_addr(), &addr, &addr_len);
+    if (!err.ok()) {
+      return err;
+    }
+
+    response->set_ret(::sendto(request->sockfd(), request->buf().data(),
+                               request->buf().size(), request->flags(),
+                               reinterpret_cast<sockaddr *>(&addr), addr_len));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status SetSockOpt(
+      grpc_impl::ServerContext *context,
+      const ::posix_server::SetSockOptRequest *request,
+      ::posix_server::SetSockOptResponse *response) override {
+    switch (request->optval().val_case()) {
+      case ::posix_server::SockOptVal::kBytesval:
+        response->set_ret(setsockopt(request->sockfd(), request->level(),
+                                     request->optname(),
+                                     request->optval().bytesval().c_str(),
+                                     request->optval().bytesval().size()));
+        break;
+      case ::posix_server::SockOptVal::kIntval: {
+        int opt = request->optval().intval();
+        response->set_ret(::setsockopt(request->sockfd(), request->level(),
+                                       request->optname(), &opt, sizeof(opt)));
+        break;
+      }
+      case ::posix_server::SockOptVal::kTimeval: {
+        timeval tv = {.tv_sec = static_cast<__time_t>(
+                          request->optval().timeval().seconds()),
+                      .tv_usec = static_cast<__suseconds_t>(
+                          request->optval().timeval().microseconds())};
+        response->set_ret(setsockopt(request->sockfd(), request->level(),
+                                     request->optname(), &tv, sizeof(tv)));
+        break;
+      }
+      default:
+        return ::grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
+                              "Unknown SockOpt Type");
+    }
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Socket(grpc_impl::ServerContext *context,
+                        const ::posix_server::SocketRequest *request,
+                        ::posix_server::SocketResponse *response) override {
+    response->set_fd(
+        socket(request->domain(), request->type(), request->protocol()));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Recv(::grpc::ServerContext *context,
+                      const ::posix_server::RecvRequest *request,
+                      ::posix_server::RecvResponse *response) override {
+    std::vector<char> buf(request->len());
+    response->set_ret(
+        recv(request->sockfd(), buf.data(), buf.size(), request->flags()));
+    if (response->ret() >= 0) {
+      response->set_buf(buf.data(), response->ret());
+    }
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+};
+
+// Parse command line options. Returns a pointer to the first argument beyond
+// the options.
+void parse_command_line_options(int argc, char *argv[], std::string *ip,
+                                int *port) {
+  static struct option options[] = {{"ip", required_argument, NULL, 1},
+                                    {"port", required_argument, NULL, 2},
+                                    {0, 0, 0, 0}};
+
+  // Parse the arguments.
+  int c;
+  while ((c = getopt_long(argc, argv, "", options, NULL)) > 0) {
+    if (c == 1) {
+      *ip = optarg;
+    } else if (c == 2) {
+      *port = std::stoi(std::string(optarg));
+    }
+  }
+}
+
+void run_server(const std::string &ip, int port) {
+  PosixImpl posix_service;
+  grpc::ServerBuilder builder;
+  std::string server_address = ip + ":" + std::to_string(port);
+  // Set the authentication mechanism.
+  std::shared_ptr<grpc::ServerCredentials> creds =
+      grpc::InsecureServerCredentials();
+  builder.AddListeningPort(server_address, creds);
+  builder.RegisterService(&posix_service);
+
+  std::unique_ptr<grpc::Server> server(builder.BuildAndStart());
+  std::cerr << "Server listening on " << server_address << std::endl;
+  server->Wait();
+  std::cerr << "posix_server is finished." << std::endl;
+}
+
+int main(int argc, char *argv[]) {
+  std::cerr << "posix_server is starting." << std::endl;
+  std::string ip;
+  int port;
+  parse_command_line_options(argc, argv, &ip, &port);
+
+  std::cerr << "Got IP " << ip << " and port " << port << "." << std::endl;
+  run_server(ip, port);
+}
diff --git a/test/packetimpact/netdevs/BUILD b/test/packetimpact/netdevs/BUILD
new file mode 100644
index 000000000..422bb9b0c
--- /dev/null
+++ b/test/packetimpact/netdevs/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "netdevs",
+    srcs = ["netdevs.go"],
+    visibility = ["//test/packetimpact:__subpackages__"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+    ],
+)
diff --git a/test/packetimpact/netdevs/netdevs.go b/test/packetimpact/netdevs/netdevs.go
new file mode 100644
index 000000000..d2c9cfeaf
--- /dev/null
+++ b/test/packetimpact/netdevs/netdevs.go
@@ -0,0 +1,104 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package netdevs contains utilities for working with network devices.
+package netdevs
+
+import (
+	"fmt"
+	"net"
+	"regexp"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// A DeviceInfo represents a network device.
+type DeviceInfo struct {
+	MAC      net.HardwareAddr
+	IPv4Addr net.IP
+	IPv4Net  *net.IPNet
+	IPv6Addr net.IP
+	IPv6Net  *net.IPNet
+}
+
+var (
+	deviceLine = regexp.MustCompile(`^\s*\d+: (\w+)`)
+	linkLine   = regexp.MustCompile(`^\s*link/\w+ ([0-9a-fA-F:]+)`)
+	inetLine   = regexp.MustCompile(`^\s*inet ([0-9./]+)`)
+	inet6Line  = regexp.MustCompile(`^\s*inet6 ([0-9a-fA-Z:/]+)`)
+)
+
+// ParseDevices parses the output from `ip addr show` into a map from device
+// name to information about the device.
+func ParseDevices(cmdOutput string) (map[string]DeviceInfo, error) {
+	var currentDevice string
+	var currentInfo DeviceInfo
+	deviceInfos := make(map[string]DeviceInfo)
+	for _, line := range strings.Split(cmdOutput, "\n") {
+		if m := deviceLine.FindStringSubmatch(line); m != nil {
+			if currentDevice != "" {
+				deviceInfos[currentDevice] = currentInfo
+			}
+			currentInfo = DeviceInfo{}
+			currentDevice = m[1]
+		} else if m := linkLine.FindStringSubmatch(line); m != nil {
+			mac, err := net.ParseMAC(m[1])
+			if err != nil {
+				return nil, err
+			}
+			currentInfo.MAC = mac
+		} else if m := inetLine.FindStringSubmatch(line); m != nil {
+			ipv4Addr, ipv4Net, err := net.ParseCIDR(m[1])
+			if err != nil {
+				return nil, err
+			}
+			currentInfo.IPv4Addr = ipv4Addr
+			currentInfo.IPv4Net = ipv4Net
+		} else if m := inet6Line.FindStringSubmatch(line); m != nil {
+			ipv6Addr, ipv6Net, err := net.ParseCIDR(m[1])
+			if err != nil {
+				return nil, err
+			}
+			currentInfo.IPv6Addr = ipv6Addr
+			currentInfo.IPv6Net = ipv6Net
+		}
+	}
+	if currentDevice != "" {
+		deviceInfos[currentDevice] = currentInfo
+	}
+	return deviceInfos, nil
+}
+
+// MACToIP converts the MAC address to an IPv6 link local address as described
+// in RFC 4291 page 20: https://tools.ietf.org/html/rfc4291#page-20
+func MACToIP(mac net.HardwareAddr) net.IP {
+	addr := make([]byte, header.IPv6AddressSize)
+	addr[0] = 0xfe
+	addr[1] = 0x80
+	header.EthernetAdddressToModifiedEUI64IntoBuf(tcpip.LinkAddress(mac), addr[8:])
+	return net.IP(addr)
+}
+
+// FindDeviceByIP finds a DeviceInfo and device name from an IP address in the
+// output of ParseDevices.
+func FindDeviceByIP(ip net.IP, devices map[string]DeviceInfo) (string, DeviceInfo, error) {
+	for dev, info := range devices {
+		if info.IPv4Addr.Equal(ip) {
+			return dev, info, nil
+		}
+	}
+	return "", DeviceInfo{}, fmt.Errorf("can't find %s on any interface", ip)
+}
diff --git a/test/packetimpact/proto/BUILD b/test/packetimpact/proto/BUILD
new file mode 100644
index 000000000..4a4370f42
--- /dev/null
+++ b/test/packetimpact/proto/BUILD
@@ -0,0 +1,12 @@
+load("//tools:defs.bzl", "proto_library")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+proto_library(
+    name = "posix_server",
+    srcs = ["posix_server.proto"],
+    has_services = 1,
+)
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
new file mode 100644
index 000000000..77da0fb3a
--- /dev/null
+++ b/test/packetimpact/proto/posix_server.proto
@@ -0,0 +1,230 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package posix_server;
+
+message SockaddrIn {
+  int32 family = 1;
+  uint32 port = 2;
+  bytes addr = 3;
+}
+
+message SockaddrIn6 {
+  uint32 family = 1;
+  uint32 port = 2;
+  uint32 flowinfo = 3;
+  bytes addr = 4;
+  uint32 scope_id = 5;
+}
+
+message Sockaddr {
+  oneof sockaddr {
+    SockaddrIn in = 1;
+    SockaddrIn6 in6 = 2;
+  }
+}
+
+message Timeval {
+  int64 seconds = 1;
+  int64 microseconds = 2;
+}
+
+message SockOptVal {
+  oneof val {
+    bytes bytesval = 1;
+    int32 intval = 2;
+    Timeval timeval = 3;
+  }
+}
+
+// Request and Response pairs for each Posix service RPC call, sorted.
+
+message AcceptRequest {
+  int32 sockfd = 1;
+}
+
+message AcceptResponse {
+  int32 fd = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+  Sockaddr addr = 3;
+}
+
+message BindRequest {
+  int32 sockfd = 1;
+  Sockaddr addr = 2;
+}
+
+message BindResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
+message CloseRequest {
+  int32 fd = 1;
+}
+
+message CloseResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
+message ConnectRequest {
+  int32 sockfd = 1;
+  Sockaddr addr = 2;
+}
+
+message ConnectResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
+message FcntlRequest {
+  int32 fd = 1;
+  int32 cmd = 2;
+  int32 arg = 3;
+}
+
+message FcntlResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
+message GetSockNameRequest {
+  int32 sockfd = 1;
+}
+
+message GetSockNameResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+  Sockaddr addr = 3;
+}
+
+message GetSockOptRequest {
+  int32 sockfd = 1;
+  int32 level = 2;
+  int32 optname = 3;
+  int32 optlen = 4;
+  enum SockOptType {
+    UNSPECIFIED = 0;
+    BYTES = 1;
+    INT = 2;
+    TIME = 3;
+  }
+  SockOptType type = 5;
+}
+
+message GetSockOptResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+  SockOptVal optval = 3;
+}
+
+message ListenRequest {
+  int32 sockfd = 1;
+  int32 backlog = 2;
+}
+
+message ListenResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
+message SendRequest {
+  int32 sockfd = 1;
+  bytes buf = 2;
+  int32 flags = 3;
+}
+
+message SendResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
+message SendToRequest {
+  int32 sockfd = 1;
+  bytes buf = 2;
+  int32 flags = 3;
+  Sockaddr dest_addr = 4;
+}
+
+message SendToResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
+message SetSockOptRequest {
+  int32 sockfd = 1;
+  int32 level = 2;
+  int32 optname = 3;
+  SockOptVal optval = 4;
+}
+
+message SetSockOptResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
+message SocketRequest {
+  int32 domain = 1;
+  int32 type = 2;
+  int32 protocol = 3;
+}
+
+message SocketResponse {
+  int32 fd = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+}
+
+message RecvRequest {
+  int32 sockfd = 1;
+  int32 len = 2;
+  int32 flags = 3;
+}
+
+message RecvResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;  // "errno" may fail to compile in c++.
+  bytes buf = 3;
+}
+
+service Posix {
+  // Call accept() on the DUT.
+  rpc Accept(AcceptRequest) returns (AcceptResponse);
+  // Call bind() on the DUT.
+  rpc Bind(BindRequest) returns (BindResponse);
+  // Call close() on the DUT.
+  rpc Close(CloseRequest) returns (CloseResponse);
+  // Call connect() on the DUT.
+  rpc Connect(ConnectRequest) returns (ConnectResponse);
+  // Call fcntl() on the DUT.
+  rpc Fcntl(FcntlRequest) returns (FcntlResponse);
+  // Call getsockname() on the DUT.
+  rpc GetSockName(GetSockNameRequest) returns (GetSockNameResponse);
+  // Call getsockopt() on the DUT.
+  rpc GetSockOpt(GetSockOptRequest) returns (GetSockOptResponse);
+  // Call listen() on the DUT.
+  rpc Listen(ListenRequest) returns (ListenResponse);
+  // Call send() on the DUT.
+  rpc Send(SendRequest) returns (SendResponse);
+  // Call sendto() on the DUT.
+  rpc SendTo(SendToRequest) returns (SendToResponse);
+  // Call setsockopt() on the DUT.
+  rpc SetSockOpt(SetSockOptRequest) returns (SetSockOptResponse);
+  // Call socket() on the DUT.
+  rpc Socket(SocketRequest) returns (SocketResponse);
+  // Call recv() on the DUT.
+  rpc Recv(RecvRequest) returns (RecvResponse);
+}
diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD
new file mode 100644
index 000000000..0b68a760a
--- /dev/null
+++ b/test/packetimpact/runner/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_test")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+go_test(
+    name = "packetimpact_test",
+    srcs = ["packetimpact_test.go"],
+    tags = [
+        # Not intended to be run directly.
+        "local",
+        "manual",
+    ],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//test/packetimpact/netdevs",
+    ],
+)
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
new file mode 100644
index 000000000..ea66b9756
--- /dev/null
+++ b/test/packetimpact/runner/defs.bzl
@@ -0,0 +1,136 @@
+"""Defines rules for packetimpact test targets."""
+
+load("//tools:defs.bzl", "go_test")
+
+def _packetimpact_test_impl(ctx):
+    test_runner = ctx.executable._test_runner
+    bench = ctx.actions.declare_file("%s-bench" % ctx.label.name)
+    bench_content = "\n".join([
+        "#!/bin/bash",
+        # This test will run part in a distinct user namespace. This can cause
+        # permission problems, because all runfiles may not be owned by the
+        # current user, and no other users will be mapped in that namespace.
+        # Make sure that everything is readable here.
+        "find . -type f -or -type d -exec chmod a+rx {} \\;",
+        "%s %s --testbench_binary %s $@\n" % (
+            test_runner.short_path,
+            " ".join(ctx.attr.flags),
+            ctx.files.testbench_binary[0].short_path,
+        ),
+    ])
+    ctx.actions.write(bench, bench_content, is_executable = True)
+
+    transitive_files = depset()
+    if hasattr(ctx.attr._test_runner, "data_runfiles"):
+        transitive_files = depset(ctx.attr._test_runner.data_runfiles.files)
+    runfiles = ctx.runfiles(
+        files = [test_runner] + ctx.files.testbench_binary + ctx.files._posix_server_binary,
+        transitive_files = transitive_files,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = bench, runfiles = runfiles)]
+
+_packetimpact_test = rule(
+    attrs = {
+        "_test_runner": attr.label(
+            executable = True,
+            cfg = "target",
+            default = ":packetimpact_test",
+        ),
+        "_posix_server_binary": attr.label(
+            cfg = "target",
+            default = "//test/packetimpact/dut:posix_server",
+        ),
+        "testbench_binary": attr.label(
+            cfg = "target",
+            mandatory = True,
+        ),
+        "flags": attr.string_list(
+            mandatory = False,
+            default = [],
+        ),
+    },
+    test = True,
+    implementation = _packetimpact_test_impl,
+)
+
+PACKETIMPACT_TAGS = ["local", "manual"]
+
+def packetimpact_linux_test(
+        name,
+        testbench_binary,
+        expect_failure = False,
+        **kwargs):
+    """Add a packetimpact test on linux.
+
+    Args:
+        name: name of the test
+        testbench_binary: the testbench binary
+        expect_failure: the test must fail
+        **kwargs: all the other args, forwarded to _packetimpact_test
+    """
+    expect_failure_flag = ["--expect_failure"] if expect_failure else []
+    _packetimpact_test(
+        name = name + "_linux_test",
+        testbench_binary = testbench_binary,
+        flags = ["--dut_platform", "linux"] + expect_failure_flag,
+        tags = PACKETIMPACT_TAGS + ["packetimpact"],
+        **kwargs
+    )
+
+def packetimpact_netstack_test(
+        name,
+        testbench_binary,
+        expect_failure = False,
+        **kwargs):
+    """Add a packetimpact test on netstack.
+
+    Args:
+        name: name of the test
+        testbench_binary: the testbench binary
+        expect_failure: the test must fail
+        **kwargs: all the other args, forwarded to _packetimpact_test
+    """
+    expect_failure_flag = []
+    if expect_failure:
+        expect_failure_flag = ["--expect_failure"]
+    _packetimpact_test(
+        name = name + "_netstack_test",
+        testbench_binary = testbench_binary,
+        # This is the default runtime unless
+        # "--test_arg=--runtime=OTHER_RUNTIME" is used to override the value.
+        flags = ["--dut_platform", "netstack", "--runtime=runsc-d"] + expect_failure_flag,
+        tags = PACKETIMPACT_TAGS + ["packetimpact"],
+        **kwargs
+    )
+
+def packetimpact_go_test(name, size = "small", pure = True, expect_linux_failure = False, expect_netstack_failure = False, **kwargs):
+    """Add packetimpact tests written in go.
+
+    Args:
+        name: name of the test
+        size: size of the test
+        pure: make a static go binary
+        expect_linux_failure: the test must fail for Linux
+        expect_netstack_failure: the test must fail for Netstack
+        **kwargs: all the other args, forwarded to go_test
+    """
+    testbench_binary = name + "_test"
+    go_test(
+        name = testbench_binary,
+        size = size,
+        pure = pure,
+        tags = PACKETIMPACT_TAGS,
+        **kwargs
+    )
+    packetimpact_linux_test(
+        name = name,
+        expect_failure = expect_linux_failure,
+        testbench_binary = testbench_binary,
+    )
+    packetimpact_netstack_test(
+        name = name,
+        expect_failure = expect_netstack_failure,
+        testbench_binary = testbench_binary,
+    )
diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go
new file mode 100644
index 000000000..e58a1fb1b
--- /dev/null
+++ b/test/packetimpact/runner/packetimpact_test.go
@@ -0,0 +1,332 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The runner starts docker containers and networking for a packetimpact test.
+package packetimpact_test
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math/rand"
+	"net"
+	"os"
+	"os/exec"
+	"path"
+	"strings"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/test/packetimpact/netdevs"
+)
+
+// stringList implements flag.Value.
+type stringList []string
+
+// String implements flag.Value.String.
+func (l *stringList) String() string {
+	return strings.Join(*l, ",")
+}
+
+// Set implements flag.Value.Set.
+func (l *stringList) Set(value string) error {
+	*l = append(*l, value)
+	return nil
+}
+
+var (
+	dutPlatform     = flag.String("dut_platform", "", "either \"linux\" or \"netstack\"")
+	testbenchBinary = flag.String("testbench_binary", "", "path to the testbench binary")
+	tshark          = flag.Bool("tshark", false, "use more verbose tshark in logs instead of tcpdump")
+	extraTestArgs   = stringList{}
+	expectFailure   = flag.Bool("expect_failure", false, "expect that the test will fail when run")
+
+	dutAddr       = net.IPv4(0, 0, 0, 10)
+	testbenchAddr = net.IPv4(0, 0, 0, 20)
+)
+
+const ctrlPort = "40000"
+
+// logger implements testutil.Logger.
+//
+// Labels logs based on their source and formats multi-line logs.
+type logger string
+
+// Name implements testutil.Logger.Name.
+func (l logger) Name() string {
+	return string(l)
+}
+
+// Logf implements testutil.Logger.Logf.
+func (l logger) Logf(format string, args ...interface{}) {
+	lines := strings.Split(fmt.Sprintf(format, args...), "\n")
+	log.Printf("%s: %s", l, lines[0])
+	for _, line := range lines[1:] {
+		log.Printf("%*s  %s", len(l), "", line)
+	}
+}
+
+func TestOne(t *testing.T) {
+	flag.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench")
+	flag.Parse()
+	if *dutPlatform != "linux" && *dutPlatform != "netstack" {
+		t.Fatal("--dut_platform should be either linux or netstack")
+	}
+	if *testbenchBinary == "" {
+		t.Fatal("--testbench_binary is missing")
+	}
+	if *dutPlatform == "netstack" {
+		if _, err := dockerutil.RuntimePath(); err != nil {
+			t.Fatal("--runtime is missing or invalid with --dut_platform=netstack:", err)
+		}
+	}
+	dockerutil.EnsureSupportedDockerVersion()
+
+	// Create the networks needed for the test. One control network is needed for
+	// the gRPC control packets and one test network on which to transmit the test
+	// packets.
+	ctrlNet := dockerutil.NewDockerNetwork(logger("ctrlNet"))
+	testNet := dockerutil.NewDockerNetwork(logger("testNet"))
+	for _, dn := range []*dockerutil.DockerNetwork{ctrlNet, testNet} {
+		for {
+			if err := createDockerNetwork(dn); err != nil {
+				t.Log("creating docker network:", err)
+				const wait = 100 * time.Millisecond
+				t.Logf("sleeping %s and will try creating docker network again", wait)
+				// This can fail if another docker network claimed the same IP so we'll
+				// just try again.
+				time.Sleep(wait)
+				continue
+			}
+			break
+		}
+		defer func(dn *dockerutil.DockerNetwork) {
+			if err := dn.Cleanup(); err != nil {
+				t.Errorf("unable to cleanup container %s: %s", dn.Name, err)
+			}
+		}(dn)
+	}
+
+	tmpDir, err := ioutil.TempDir("", "container-output")
+	if err != nil {
+		t.Fatal("creating temp dir:", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	const testOutputDir = "/tmp/testoutput"
+
+	runOpts := dockerutil.RunOpts{
+		Image:      "packetimpact",
+		CapAdd:     []string{"NET_ADMIN"},
+		Extra:      []string{"--sysctl", "net.ipv6.conf.all.disable_ipv6=0", "--rm", "-v", tmpDir + ":" + testOutputDir},
+		Foreground: true,
+	}
+
+	// Create the Docker container for the DUT.
+	dut := dockerutil.MakeDocker(logger("dut"))
+	if *dutPlatform == "linux" {
+		dut.Runtime = ""
+	}
+
+	const containerPosixServerBinary = "/packetimpact/posix_server"
+	dut.CopyFiles("/packetimpact", "/test/packetimpact/dut/posix_server")
+
+	if err := dut.Create(runOpts, containerPosixServerBinary, "--ip=0.0.0.0", "--port="+ctrlPort); err != nil {
+		t.Fatalf("unable to create container %s: %s", dut.Name, err)
+	}
+	defer dut.CleanUp()
+
+	// Add ctrlNet as eth1 and testNet as eth2.
+	const testNetDev = "eth2"
+	if err := addNetworks(dut, dutAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := dut.Start(); err != nil {
+		t.Fatalf("unable to start container %s: %s", dut.Name, err)
+	}
+
+	if _, err := dut.WaitForOutput("Server listening.*\n", 60*time.Second); err != nil {
+		t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.Name, err)
+	}
+
+	dutTestDevice, dutDeviceInfo, err := deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	remoteMAC := dutDeviceInfo.MAC
+	remoteIPv6 := dutDeviceInfo.IPv6Addr
+	// Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
+	// needed.
+	if remoteIPv6 == nil {
+		if _, err := dut.Exec(dockerutil.RunOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
+			t.Fatalf("unable to ip addr add on container %s: %s", dut.Name, err)
+		}
+		// Now try again, to make sure that it worked.
+		_, dutDeviceInfo, err = deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet))
+		if err != nil {
+			t.Fatal(err)
+		}
+		remoteIPv6 = dutDeviceInfo.IPv6Addr
+		if remoteIPv6 == nil {
+			t.Fatal("unable to set IPv6 address on container", dut.Name)
+		}
+	}
+
+	// Create the Docker container for the testbench.
+	testbench := dockerutil.MakeDocker(logger("testbench"))
+	testbench.Runtime = "" // The testbench always runs on Linux.
+
+	tbb := path.Base(*testbenchBinary)
+	containerTestbenchBinary := "/packetimpact/" + tbb
+	testbench.CopyFiles("/packetimpact", "/test/packetimpact/tests/"+tbb)
+
+	// Run tcpdump in the test bench unbuffered, without DNS resolution, just on
+	// the interface with the test packets.
+	snifferArgs := []string{
+		"tcpdump",
+		"-S", "-vvv", "-U", "-n",
+		"-i", testNetDev,
+		"-w", testOutputDir + "/dump.pcap",
+	}
+	snifferRegex := "tcpdump: listening.*\n"
+	if *tshark {
+		// Run tshark in the test bench unbuffered, without DNS resolution, just on
+		// the interface with the test packets.
+		snifferArgs = []string{
+			"tshark", "-V", "-l", "-n", "-i", testNetDev,
+			"-o", "tcp.check_checksum:TRUE",
+			"-o", "udp.check_checksum:TRUE",
+		}
+		snifferRegex = "Capturing on.*\n"
+	}
+
+	defer func() {
+		if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil {
+			t.Error("unable to copy container output files:", err)
+		}
+	}()
+
+	if err := testbench.Create(runOpts, snifferArgs...); err != nil {
+		t.Fatalf("unable to create container %s: %s", testbench.Name, err)
+	}
+	defer testbench.CleanUp()
+
+	// Add ctrlNet as eth1 and testNet as eth2.
+	if err := addNetworks(testbench, testbenchAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := testbench.Start(); err != nil {
+		t.Fatalf("unable to start container %s: %s", testbench.Name, err)
+	}
+
+	// Kill so that it will flush output.
+	defer testbench.Exec(dockerutil.RunOpts{}, "killall", snifferArgs[0])
+
+	if _, err := testbench.WaitForOutput(snifferRegex, 60*time.Second); err != nil {
+		t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
+	}
+
+	// Because the Linux kernel receives the SYN-ACK but didn't send the SYN it
+	// will issue a RST. To prevent this IPtables can be used to filter out all
+	// incoming packets. The raw socket that packetimpact tests use will still see
+	// everything.
+	if _, err := testbench.Exec(dockerutil.RunOpts{}, "iptables", "-A", "INPUT", "-i", testNetDev, "-j", "DROP"); err != nil {
+		t.Fatalf("unable to Exec iptables on container %s: %s", testbench.Name, err)
+	}
+
+	// FIXME(b/156449515): Some piece of the system has a race. The old
+	// bash script version had a sleep, so we have one too. The race should
+	// be fixed and this sleep removed.
+	time.Sleep(time.Second)
+
+	// Start a packetimpact test on the test bench. The packetimpact test sends
+	// and receives packets and also sends POSIX socket commands to the
+	// posix_server to be executed on the DUT.
+	testArgs := []string{containerTestbenchBinary}
+	testArgs = append(testArgs, extraTestArgs...)
+	testArgs = append(testArgs,
+		"--posix_server_ip", addressInSubnet(dutAddr, *ctrlNet.Subnet).String(),
+		"--posix_server_port", ctrlPort,
+		"--remote_ipv4", addressInSubnet(dutAddr, *testNet.Subnet).String(),
+		"--local_ipv4", addressInSubnet(testbenchAddr, *testNet.Subnet).String(),
+		"--remote_ipv6", remoteIPv6.String(),
+		"--remote_mac", remoteMAC.String(),
+		"--device", testNetDev,
+	)
+	_, err = testbench.Exec(dockerutil.RunOpts{}, testArgs...)
+	if !*expectFailure && err != nil {
+		t.Fatal("test failed:", err)
+	}
+	if *expectFailure && err == nil {
+		t.Fatal("test failure expected but the test succeeded, enable the test and mark the corresponding bug as fixed")
+	}
+}
+
+func addNetworks(d *dockerutil.Docker, addr net.IP, networks []*dockerutil.DockerNetwork) error {
+	for _, dn := range networks {
+		ip := addressInSubnet(addr, *dn.Subnet)
+		// Connect to the network with the specified IP address.
+		if err := dn.Connect(d, "--ip", ip.String()); err != nil {
+			return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err)
+		}
+	}
+	return nil
+}
+
+// addressInSubnet combines the subnet provided with the address and returns a
+// new address. The return address bits come from the subnet where the mask is 1
+// and from the ip address where the mask is 0.
+func addressInSubnet(addr net.IP, subnet net.IPNet) net.IP {
+	var octets []byte
+	for i := 0; i < 4; i++ {
+		octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i])))
+	}
+	return net.IP(octets)
+}
+
+// makeDockerNetwork makes a randomly-named network that will start with the
+// namePrefix. The network will be a random /24 subnet.
+func createDockerNetwork(n *dockerutil.DockerNetwork) error {
+	randSource := rand.NewSource(time.Now().UnixNano())
+	r1 := rand.New(randSource)
+	// Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+	ip := net.IPv4(byte(r1.Intn(224-192)+192), byte(r1.Intn(256)), byte(r1.Intn(256)), 0)
+	n.Subnet = &net.IPNet{
+		IP:   ip,
+		Mask: ip.DefaultMask(),
+	}
+	return n.Create()
+}
+
+// deviceByIP finds a deviceInfo and device name from an IP address.
+func deviceByIP(d *dockerutil.Docker, ip net.IP) (string, netdevs.DeviceInfo, error) {
+	out, err := d.Exec(dockerutil.RunOpts{}, "ip", "addr", "show")
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w", d.Name, err)
+	}
+	devs, err := netdevs.ParseDevices(out)
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w", d.Name, err)
+	}
+	testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs)
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("can't find deviceInfo for container %s: %w", d.Name, err)
+	}
+	return testDevice, deviceInfo, nil
+}
diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
new file mode 100644
index 000000000..d19ec07d4
--- /dev/null
+++ b/test/packetimpact/testbench/BUILD
@@ -0,0 +1,46 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "testbench",
+    srcs = [
+        "connections.go",
+        "dut.go",
+        "dut_client.go",
+        "layers.go",
+        "rawsockets.go",
+        "testbench.go",
+    ],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+        "//pkg/usermem",
+        "//test/packetimpact/netdevs",
+        "//test/packetimpact/proto:posix_server_go_proto",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+        "@com_github_mohae_deepcopy//:go_default_library",
+        "@org_golang_google_grpc//:go_default_library",
+        "@org_golang_google_grpc//keepalive:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+        "@org_uber_go_multierr//:go_default_library",
+    ],
+)
+
+go_test(
+    name = "testbench_test",
+    size = "small",
+    srcs = ["layers_test.go"],
+    library = ":testbench",
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "@com_github_mohae_deepcopy//:go_default_library",
+    ],
+)
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
new file mode 100644
index 000000000..6e85d6fab
--- /dev/null
+++ b/test/packetimpact/testbench/connections.go
@@ -0,0 +1,921 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testbench has utilities to send and receive packets and also command
+// the DUT to run POSIX functions.
+package testbench
+
+import (
+	"fmt"
+	"math/rand"
+	"net"
+	"testing"
+	"time"
+
+	"github.com/mohae/deepcopy"
+	"go.uber.org/multierr"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+func portFromSockaddr(sa unix.Sockaddr) (uint16, error) {
+	switch sa := sa.(type) {
+	case *unix.SockaddrInet4:
+		return uint16(sa.Port), nil
+	case *unix.SockaddrInet6:
+		return uint16(sa.Port), nil
+	}
+	return 0, fmt.Errorf("sockaddr type %T does not contain port", sa)
+}
+
+// pickPort makes a new socket and returns the socket FD and port. The domain should be AF_INET or AF_INET6. The caller must close the FD when done with
+// the port if there is no error.
+func pickPort(domain, typ int) (fd int, sa unix.Sockaddr, err error) {
+	fd, err = unix.Socket(domain, typ, 0)
+	if err != nil {
+		return -1, nil, err
+	}
+	defer func() {
+		if err != nil {
+			err = multierr.Append(err, unix.Close(fd))
+		}
+	}()
+	switch domain {
+	case unix.AF_INET:
+		var sa4 unix.SockaddrInet4
+		copy(sa4.Addr[:], net.ParseIP(LocalIPv4).To4())
+		sa = &sa4
+	case unix.AF_INET6:
+		var sa6 unix.SockaddrInet6
+		copy(sa6.Addr[:], net.ParseIP(LocalIPv6).To16())
+		sa = &sa6
+	default:
+		return -1, nil, fmt.Errorf("invalid domain %d, it should be one of unix.AF_INET or unix.AF_INET6", domain)
+	}
+	if err = unix.Bind(fd, sa); err != nil {
+		return -1, nil, err
+	}
+	sa, err = unix.Getsockname(fd)
+	if err != nil {
+		return -1, nil, err
+	}
+	return fd, sa, nil
+}
+
+// layerState stores the state of a layer of a connection.
+type layerState interface {
+	// outgoing returns an outgoing layer to be sent in a frame. It should not
+	// update layerState, that is done in layerState.sent.
+	outgoing() Layer
+
+	// incoming creates an expected Layer for comparing against a received Layer.
+	// Because the expectation can depend on values in the received Layer, it is
+	// an input to incoming. For example, the ACK number needs to be checked in a
+	// TCP packet but only if the ACK flag is set in the received packet. It
+	// should not update layerState, that is done in layerState.received. The
+	// caller takes ownership of the returned Layer.
+	incoming(received Layer) Layer
+
+	// sent updates the layerState based on the Layer that was sent. The input is
+	// a Layer with all prev and next pointers populated so that the entire frame
+	// as it was sent is available.
+	sent(sent Layer) error
+
+	// received updates the layerState based on a Layer that is receieved. The
+	// input is a Layer with all prev and next pointers populated so that the
+	// entire frame as it was receieved is available.
+	received(received Layer) error
+
+	// close frees associated resources held by the LayerState.
+	close() error
+}
+
+// etherState maintains state about an Ethernet connection.
+type etherState struct {
+	out, in Ether
+}
+
+var _ layerState = (*etherState)(nil)
+
+// newEtherState creates a new etherState.
+func newEtherState(out, in Ether) (*etherState, error) {
+	lMAC, err := tcpip.ParseMACAddress(LocalMAC)
+	if err != nil {
+		return nil, fmt.Errorf("parsing local MAC: %q: %w", LocalMAC, err)
+	}
+
+	rMAC, err := tcpip.ParseMACAddress(RemoteMAC)
+	if err != nil {
+		return nil, fmt.Errorf("parsing remote MAC: %q: %w", RemoteMAC, err)
+	}
+	s := etherState{
+		out: Ether{SrcAddr: &lMAC, DstAddr: &rMAC},
+		in:  Ether{SrcAddr: &rMAC, DstAddr: &lMAC},
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, err
+	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, err
+	}
+	return &s, nil
+}
+
+func (s *etherState) outgoing() Layer {
+	return deepcopy.Copy(&s.out).(Layer)
+}
+
+// incoming implements layerState.incoming.
+func (s *etherState) incoming(Layer) Layer {
+	return deepcopy.Copy(&s.in).(Layer)
+}
+
+func (*etherState) sent(Layer) error {
+	return nil
+}
+
+func (*etherState) received(Layer) error {
+	return nil
+}
+
+func (*etherState) close() error {
+	return nil
+}
+
+// ipv4State maintains state about an IPv4 connection.
+type ipv4State struct {
+	out, in IPv4
+}
+
+var _ layerState = (*ipv4State)(nil)
+
+// newIPv4State creates a new ipv4State.
+func newIPv4State(out, in IPv4) (*ipv4State, error) {
+	lIP := tcpip.Address(net.ParseIP(LocalIPv4).To4())
+	rIP := tcpip.Address(net.ParseIP(RemoteIPv4).To4())
+	s := ipv4State{
+		out: IPv4{SrcAddr: &lIP, DstAddr: &rIP},
+		in:  IPv4{SrcAddr: &rIP, DstAddr: &lIP},
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, err
+	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, err
+	}
+	return &s, nil
+}
+
+func (s *ipv4State) outgoing() Layer {
+	return deepcopy.Copy(&s.out).(Layer)
+}
+
+// incoming implements layerState.incoming.
+func (s *ipv4State) incoming(Layer) Layer {
+	return deepcopy.Copy(&s.in).(Layer)
+}
+
+func (*ipv4State) sent(Layer) error {
+	return nil
+}
+
+func (*ipv4State) received(Layer) error {
+	return nil
+}
+
+func (*ipv4State) close() error {
+	return nil
+}
+
+// ipv6State maintains state about an IPv6 connection.
+type ipv6State struct {
+	out, in IPv6
+}
+
+var _ layerState = (*ipv6State)(nil)
+
+// newIPv6State creates a new ipv6State.
+func newIPv6State(out, in IPv6) (*ipv6State, error) {
+	lIP := tcpip.Address(net.ParseIP(LocalIPv6).To16())
+	rIP := tcpip.Address(net.ParseIP(RemoteIPv6).To16())
+	s := ipv6State{
+		out: IPv6{SrcAddr: &lIP, DstAddr: &rIP},
+		in:  IPv6{SrcAddr: &rIP, DstAddr: &lIP},
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, err
+	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, err
+	}
+	return &s, nil
+}
+
+// outgoing returns an outgoing layer to be sent in a frame.
+func (s *ipv6State) outgoing() Layer {
+	return deepcopy.Copy(&s.out).(Layer)
+}
+
+func (s *ipv6State) incoming(Layer) Layer {
+	return deepcopy.Copy(&s.in).(Layer)
+}
+
+func (s *ipv6State) sent(Layer) error {
+	// Nothing to do.
+	return nil
+}
+
+func (s *ipv6State) received(Layer) error {
+	// Nothing to do.
+	return nil
+}
+
+// close cleans up any resources held.
+func (s *ipv6State) close() error {
+	return nil
+}
+
+// tcpState maintains state about a TCP connection.
+type tcpState struct {
+	out, in                   TCP
+	localSeqNum, remoteSeqNum *seqnum.Value
+	synAck                    *TCP
+	portPickerFD              int
+	finSent                   bool
+}
+
+var _ layerState = (*tcpState)(nil)
+
+// SeqNumValue is a helper routine that allocates a new seqnum.Value value to
+// store v and returns a pointer to it.
+func SeqNumValue(v seqnum.Value) *seqnum.Value {
+	return &v
+}
+
+// newTCPState creates a new TCPState.
+func newTCPState(domain int, out, in TCP) (*tcpState, unix.Sockaddr, error) {
+	portPickerFD, localAddr, err := pickPort(domain, unix.SOCK_STREAM)
+	if err != nil {
+		return nil, nil, err
+	}
+	localPort, err := portFromSockaddr(localAddr)
+	if err != nil {
+		return nil, nil, err
+	}
+	s := tcpState{
+		out:          TCP{SrcPort: &localPort},
+		in:           TCP{DstPort: &localPort},
+		localSeqNum:  SeqNumValue(seqnum.Value(rand.Uint32())),
+		portPickerFD: portPickerFD,
+		finSent:      false,
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, nil, err
+	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, nil, err
+	}
+	return &s, localAddr, nil
+}
+
+func (s *tcpState) outgoing() Layer {
+	newOutgoing := deepcopy.Copy(s.out).(TCP)
+	if s.localSeqNum != nil {
+		newOutgoing.SeqNum = Uint32(uint32(*s.localSeqNum))
+	}
+	if s.remoteSeqNum != nil {
+		newOutgoing.AckNum = Uint32(uint32(*s.remoteSeqNum))
+	}
+	return &newOutgoing
+}
+
+// incoming implements layerState.incoming.
+func (s *tcpState) incoming(received Layer) Layer {
+	tcpReceived, ok := received.(*TCP)
+	if !ok {
+		return nil
+	}
+	newIn := deepcopy.Copy(s.in).(TCP)
+	if s.remoteSeqNum != nil {
+		newIn.SeqNum = Uint32(uint32(*s.remoteSeqNum))
+	}
+	if s.localSeqNum != nil && (*tcpReceived.Flags&header.TCPFlagAck) != 0 {
+		// The caller didn't specify an AckNum so we'll expect the calculated one,
+		// but only if the ACK flag is set because the AckNum is not valid in a
+		// header if ACK is not set.
+		newIn.AckNum = Uint32(uint32(*s.localSeqNum))
+	}
+	return &newIn
+}
+
+func (s *tcpState) sent(sent Layer) error {
+	tcp, ok := sent.(*TCP)
+	if !ok {
+		return fmt.Errorf("can't update tcpState with %T Layer", sent)
+	}
+	if !s.finSent {
+		// update localSeqNum by the payload only when FIN is not yet sent by us
+		for current := tcp.next(); current != nil; current = current.next() {
+			s.localSeqNum.UpdateForward(seqnum.Size(current.length()))
+		}
+	}
+	if tcp.Flags != nil && *tcp.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
+		s.localSeqNum.UpdateForward(1)
+	}
+	if *tcp.Flags&(header.TCPFlagFin) != 0 {
+		s.finSent = true
+	}
+	return nil
+}
+
+func (s *tcpState) received(l Layer) error {
+	tcp, ok := l.(*TCP)
+	if !ok {
+		return fmt.Errorf("can't update tcpState with %T Layer", l)
+	}
+	s.remoteSeqNum = SeqNumValue(seqnum.Value(*tcp.SeqNum))
+	if *tcp.Flags&(header.TCPFlagSyn|header.TCPFlagFin) != 0 {
+		s.remoteSeqNum.UpdateForward(1)
+	}
+	for current := tcp.next(); current != nil; current = current.next() {
+		s.remoteSeqNum.UpdateForward(seqnum.Size(current.length()))
+	}
+	return nil
+}
+
+// close frees the port associated with this connection.
+func (s *tcpState) close() error {
+	if err := unix.Close(s.portPickerFD); err != nil {
+		return err
+	}
+	s.portPickerFD = -1
+	return nil
+}
+
+// udpState maintains state about a UDP connection.
+type udpState struct {
+	out, in      UDP
+	portPickerFD int
+}
+
+var _ layerState = (*udpState)(nil)
+
+// newUDPState creates a new udpState.
+func newUDPState(domain int, out, in UDP) (*udpState, unix.Sockaddr, error) {
+	portPickerFD, localAddr, err := pickPort(domain, unix.SOCK_DGRAM)
+	if err != nil {
+		return nil, nil, err
+	}
+	localPort, err := portFromSockaddr(localAddr)
+	if err != nil {
+		return nil, nil, err
+	}
+	s := udpState{
+		out:          UDP{SrcPort: &localPort},
+		in:           UDP{DstPort: &localPort},
+		portPickerFD: portPickerFD,
+	}
+	if err := s.out.merge(&out); err != nil {
+		return nil, nil, err
+	}
+	if err := s.in.merge(&in); err != nil {
+		return nil, nil, err
+	}
+	return &s, localAddr, nil
+}
+
+func (s *udpState) outgoing() Layer {
+	return deepcopy.Copy(&s.out).(Layer)
+}
+
+// incoming implements layerState.incoming.
+func (s *udpState) incoming(Layer) Layer {
+	return deepcopy.Copy(&s.in).(Layer)
+}
+
+func (*udpState) sent(l Layer) error {
+	return nil
+}
+
+func (*udpState) received(l Layer) error {
+	return nil
+}
+
+// close frees the port associated with this connection.
+func (s *udpState) close() error {
+	if err := unix.Close(s.portPickerFD); err != nil {
+		return err
+	}
+	s.portPickerFD = -1
+	return nil
+}
+
+// Connection holds a collection of layer states for maintaining a connection
+// along with sockets for sniffer and injecting packets.
+type Connection struct {
+	layerStates []layerState
+	injector    Injector
+	sniffer     Sniffer
+	localAddr   unix.Sockaddr
+	t           *testing.T
+}
+
+// Returns the default incoming frame against which to match. If received is
+// longer than layerStates then that may still count as a match. The reverse is
+// never a match and nil is returned.
+func (conn *Connection) incoming(received Layers) Layers {
+	if len(received) < len(conn.layerStates) {
+		return nil
+	}
+	in := Layers{}
+	for i, s := range conn.layerStates {
+		toMatch := s.incoming(received[i])
+		if toMatch == nil {
+			return nil
+		}
+		in = append(in, toMatch)
+	}
+	return in
+}
+
+func (conn *Connection) match(override, received Layers) bool {
+	toMatch := conn.incoming(received)
+	if toMatch == nil {
+		return false // Not enough layers in gotLayers for matching.
+	}
+	if err := toMatch.merge(override); err != nil {
+		return false // Failing to merge is not matching.
+	}
+	return toMatch.match(received)
+}
+
+// Close frees associated resources held by the Connection.
+func (conn *Connection) Close() {
+	errs := multierr.Combine(conn.sniffer.close(), conn.injector.close())
+	for _, s := range conn.layerStates {
+		if err := s.close(); err != nil {
+			errs = multierr.Append(errs, fmt.Errorf("unable to close %+v: %s", s, err))
+		}
+	}
+	if errs != nil {
+		conn.t.Fatalf("unable to close %+v: %s", conn, errs)
+	}
+}
+
+// CreateFrame builds a frame for the connection with layer overriding defaults
+// of the innermost layer and additionalLayers added after it.
+func (conn *Connection) CreateFrame(layer Layer, additionalLayers ...Layer) Layers {
+	var layersToSend Layers
+	for _, s := range conn.layerStates {
+		layersToSend = append(layersToSend, s.outgoing())
+	}
+	if err := layersToSend[len(layersToSend)-1].merge(layer); err != nil {
+		conn.t.Fatalf("can't merge %+v into %+v: %s", layer, layersToSend[len(layersToSend)-1], err)
+	}
+	layersToSend = append(layersToSend, additionalLayers...)
+	return layersToSend
+}
+
+// SendFrame sends a frame on the wire and updates the state of all layers.
+func (conn *Connection) SendFrame(frame Layers) {
+	outBytes, err := frame.ToBytes()
+	if err != nil {
+		conn.t.Fatalf("can't build outgoing packet: %s", err)
+	}
+	conn.injector.Send(outBytes)
+
+	// frame might have nil values where the caller wanted to use default values.
+	// sentFrame will have no nil values in it because it comes from parsing the
+	// bytes that were actually sent.
+	sentFrame := parse(parseEther, outBytes)
+	// Update the state of each layer based on what was sent.
+	for i, s := range conn.layerStates {
+		if err := s.sent(sentFrame[i]); err != nil {
+			conn.t.Fatalf("Unable to update the state of %+v with %s: %s", s, sentFrame[i], err)
+		}
+	}
+}
+
+// Send a packet with reasonable defaults. Potentially override the final layer
+// in the connection with the provided layer and add additionLayers.
+func (conn *Connection) Send(layer Layer, additionalLayers ...Layer) {
+	conn.SendFrame(conn.CreateFrame(layer, additionalLayers...))
+}
+
+// recvFrame gets the next successfully parsed frame (of type Layers) within the
+// timeout provided. If no parsable frame arrives before the timeout, it returns
+// nil.
+func (conn *Connection) recvFrame(timeout time.Duration) Layers {
+	if timeout <= 0 {
+		return nil
+	}
+	b := conn.sniffer.Recv(timeout)
+	if b == nil {
+		return nil
+	}
+	return parse(parseEther, b)
+}
+
+// layersError stores the Layers that we got and the Layers that we wanted to
+// match.
+type layersError struct {
+	got, want Layers
+}
+
+func (e *layersError) Error() string {
+	return e.got.diff(e.want)
+}
+
+// Expect expects a frame with the final layerStates layer matching the
+// provided Layer within the timeout specified. If it doesn't arrive in time,
+// an error is returned.
+func (conn *Connection) Expect(layer Layer, timeout time.Duration) (Layer, error) {
+	// Make a frame that will ignore all but the final layer.
+	layers := make([]Layer, len(conn.layerStates))
+	layers[len(layers)-1] = layer
+
+	gotFrame, err := conn.ExpectFrame(layers, timeout)
+	if err != nil {
+		return nil, err
+	}
+	if len(conn.layerStates)-1 < len(gotFrame) {
+		return gotFrame[len(conn.layerStates)-1], nil
+	}
+	conn.t.Fatal("the received frame should be at least as long as the expected layers")
+	panic("unreachable")
+}
+
+// ExpectFrame expects a frame that matches the provided Layers within the
+// timeout specified. If one arrives in time, the Layers is returned without an
+// error. If it doesn't arrive in time, it returns nil and error is non-nil.
+func (conn *Connection) ExpectFrame(layers Layers, timeout time.Duration) (Layers, error) {
+	deadline := time.Now().Add(timeout)
+	var errs error
+	for {
+		var gotLayers Layers
+		if timeout = time.Until(deadline); timeout > 0 {
+			gotLayers = conn.recvFrame(timeout)
+		}
+		if gotLayers == nil {
+			if errs == nil {
+				return nil, fmt.Errorf("got no frames matching %v during %s", layers, timeout)
+			}
+			return nil, fmt.Errorf("got no frames matching %v during %s: got %w", layers, timeout, errs)
+		}
+		if conn.match(layers, gotLayers) {
+			for i, s := range conn.layerStates {
+				if err := s.received(gotLayers[i]); err != nil {
+					conn.t.Fatal(err)
+				}
+			}
+			return gotLayers, nil
+		}
+		errs = multierr.Combine(errs, &layersError{got: gotLayers, want: conn.incoming(gotLayers)})
+	}
+}
+
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *Connection) Drain() {
+	conn.sniffer.Drain()
+}
+
+// TCPIPv4 maintains the state for all the layers in a TCP/IPv4 connection.
+type TCPIPv4 Connection
+
+// NewTCPIPv4 creates a new TCPIPv4 connection with reasonable defaults.
+func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 {
+	etherState, err := newEtherState(Ether{}, Ether{})
+	if err != nil {
+		t.Fatalf("can't make etherState: %s", err)
+	}
+	ipv4State, err := newIPv4State(IPv4{}, IPv4{})
+	if err != nil {
+		t.Fatalf("can't make ipv4State: %s", err)
+	}
+	tcpState, localAddr, err := newTCPState(unix.AF_INET, outgoingTCP, incomingTCP)
+	if err != nil {
+		t.Fatalf("can't make tcpState: %s", err)
+	}
+	injector, err := NewInjector(t)
+	if err != nil {
+		t.Fatalf("can't make injector: %s", err)
+	}
+	sniffer, err := NewSniffer(t)
+	if err != nil {
+		t.Fatalf("can't make sniffer: %s", err)
+	}
+
+	return TCPIPv4{
+		layerStates: []layerState{etherState, ipv4State, tcpState},
+		injector:    injector,
+		sniffer:     sniffer,
+		localAddr:   localAddr,
+		t:           t,
+	}
+}
+
+// Connect performs a TCP 3-way handshake. The input Connection should have a
+// final TCP Layer.
+func (conn *TCPIPv4) Connect() {
+	conn.t.Helper()
+
+	// Send the SYN.
+	conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn)})
+
+	// Wait for the SYN-ACK.
+	synAck, err := conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		conn.t.Fatalf("didn't get synack during handshake: %s", err)
+	}
+	conn.layerStates[len(conn.layerStates)-1].(*tcpState).synAck = synAck
+
+	// Send an ACK.
+	conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)})
+}
+
+// ConnectWithOptions performs a TCP 3-way handshake with given TCP options.
+// The input Connection should have a final TCP Layer.
+func (conn *TCPIPv4) ConnectWithOptions(options []byte) {
+	conn.t.Helper()
+
+	// Send the SYN.
+	conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn), Options: options})
+
+	// Wait for the SYN-ACK.
+	synAck, err := conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		conn.t.Fatalf("didn't get synack during handshake: %s", err)
+	}
+	conn.layerStates[len(conn.layerStates)-1].(*tcpState).synAck = synAck
+
+	// Send an ACK.
+	conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)})
+}
+
+// ExpectData is a convenient method that expects a Layer and the Layer after
+// it. If it doens't arrive in time, it returns nil.
+func (conn *TCPIPv4) ExpectData(tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) {
+	expected := make([]Layer, len(conn.layerStates))
+	expected[len(expected)-1] = tcp
+	if payload != nil {
+		expected = append(expected, payload)
+	}
+	return (*Connection)(conn).ExpectFrame(expected, timeout)
+}
+
+// ExpectNextData attempts to receive the next incoming segment for the
+// connection and expects that to match the given layers.
+//
+// It differs from ExpectData() in that here we are only interested in the next
+// received segment, while ExpectData() can receive multiple segments for the
+// connection until there is a match with given layers or a timeout.
+func (conn *TCPIPv4) ExpectNextData(tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) {
+	// Receive the first incoming TCP segment for this connection.
+	got, err := conn.ExpectData(&TCP{}, nil, timeout)
+	if err != nil {
+		return nil, err
+	}
+
+	expected := make([]Layer, len(conn.layerStates))
+	expected[len(expected)-1] = tcp
+	if payload != nil {
+		expected = append(expected, payload)
+		tcp.SeqNum = Uint32(uint32(*conn.RemoteSeqNum()) - uint32(payload.Length()))
+	}
+	if !(*Connection)(conn).match(expected, got) {
+		return nil, fmt.Errorf("next frame is not matching %s during %s: got %s", expected, timeout, got)
+	}
+	return got, nil
+}
+
+// Send a packet with reasonable defaults. Potentially override the TCP layer in
+// the connection with the provided layer and add additionLayers.
+func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
+	(*Connection)(conn).Send(&tcp, additionalLayers...)
+}
+
+// Close frees associated resources held by the TCPIPv4 connection.
+func (conn *TCPIPv4) Close() {
+	(*Connection)(conn).Close()
+}
+
+// Expect expects a frame with the TCP layer matching the provided TCP within
+// the timeout specified. If it doesn't arrive in time, an error is returned.
+func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) (*TCP, error) {
+	layer, err := (*Connection)(conn).Expect(&tcp, timeout)
+	if layer == nil {
+		return nil, err
+	}
+	gotTCP, ok := layer.(*TCP)
+	if !ok {
+		conn.t.Fatalf("expected %s to be TCP", layer)
+	}
+	return gotTCP, err
+}
+
+func (conn *TCPIPv4) state() *tcpState {
+	state, ok := conn.layerStates[len(conn.layerStates)-1].(*tcpState)
+	if !ok {
+		conn.t.Fatalf("expected final state of %v to be tcpState", conn.layerStates)
+	}
+	return state
+}
+
+// RemoteSeqNum returns the next expected sequence number from the DUT.
+func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value {
+	return conn.state().remoteSeqNum
+}
+
+// LocalSeqNum returns the next sequence number to send from the testbench.
+func (conn *TCPIPv4) LocalSeqNum() *seqnum.Value {
+	return conn.state().localSeqNum
+}
+
+// SynAck returns the SynAck that was part of the handshake.
+func (conn *TCPIPv4) SynAck() *TCP {
+	return conn.state().synAck
+}
+
+// LocalAddr gets the local socket address of this connection.
+func (conn *TCPIPv4) LocalAddr() unix.Sockaddr {
+	return conn.localAddr
+}
+
+// IPv6Conn maintains the state for all the layers in a IPv6 connection.
+type IPv6Conn Connection
+
+// NewIPv6Conn creates a new IPv6Conn connection with reasonable defaults.
+func NewIPv6Conn(t *testing.T, outgoingIPv6, incomingIPv6 IPv6) IPv6Conn {
+	etherState, err := newEtherState(Ether{}, Ether{})
+	if err != nil {
+		t.Fatalf("can't make EtherState: %s", err)
+	}
+	ipv6State, err := newIPv6State(outgoingIPv6, incomingIPv6)
+	if err != nil {
+		t.Fatalf("can't make IPv6State: %s", err)
+	}
+
+	injector, err := NewInjector(t)
+	if err != nil {
+		t.Fatalf("can't make injector: %s", err)
+	}
+	sniffer, err := NewSniffer(t)
+	if err != nil {
+		t.Fatalf("can't make sniffer: %s", err)
+	}
+
+	return IPv6Conn{
+		layerStates: []layerState{etherState, ipv6State},
+		injector:    injector,
+		sniffer:     sniffer,
+		t:           t,
+	}
+}
+
+// SendFrame sends a frame on the wire and updates the state of all layers.
+func (conn *IPv6Conn) SendFrame(frame Layers) {
+	(*Connection)(conn).SendFrame(frame)
+}
+
+// CreateFrame builds a frame for the connection with ipv6 overriding the ipv6
+// layer defaults and additionalLayers added after it.
+func (conn *IPv6Conn) CreateFrame(ipv6 IPv6, additionalLayers ...Layer) Layers {
+	return (*Connection)(conn).CreateFrame(&ipv6, additionalLayers...)
+}
+
+// Close to clean up any resources held.
+func (conn *IPv6Conn) Close() {
+	(*Connection)(conn).Close()
+}
+
+// ExpectFrame expects a frame that matches the provided Layers within the
+// timeout specified. If it doesn't arrive in time, an error is returned.
+func (conn *IPv6Conn) ExpectFrame(frame Layers, timeout time.Duration) (Layers, error) {
+	return (*Connection)(conn).ExpectFrame(frame, timeout)
+}
+
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *TCPIPv4) Drain() {
+	conn.sniffer.Drain()
+}
+
+// UDPIPv4 maintains the state for all the layers in a UDP/IPv4 connection.
+type UDPIPv4 Connection
+
+// NewUDPIPv4 creates a new UDPIPv4 connection with reasonable defaults.
+func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 {
+	etherState, err := newEtherState(Ether{}, Ether{})
+	if err != nil {
+		t.Fatalf("can't make etherState: %s", err)
+	}
+	ipv4State, err := newIPv4State(IPv4{}, IPv4{})
+	if err != nil {
+		t.Fatalf("can't make ipv4State: %s", err)
+	}
+	udpState, localAddr, err := newUDPState(unix.AF_INET, outgoingUDP, incomingUDP)
+	if err != nil {
+		t.Fatalf("can't make udpState: %s", err)
+	}
+	injector, err := NewInjector(t)
+	if err != nil {
+		t.Fatalf("can't make injector: %s", err)
+	}
+	sniffer, err := NewSniffer(t)
+	if err != nil {
+		t.Fatalf("can't make sniffer: %s", err)
+	}
+
+	return UDPIPv4{
+		layerStates: []layerState{etherState, ipv4State, udpState},
+		injector:    injector,
+		sniffer:     sniffer,
+		localAddr:   localAddr,
+		t:           t,
+	}
+}
+
+// LocalAddr gets the local socket address of this connection.
+func (conn *UDPIPv4) LocalAddr() unix.Sockaddr {
+	return conn.localAddr
+}
+
+// CreateFrame builds a frame for the connection with layer overriding defaults
+// of the innermost layer and additionalLayers added after it.
+func (conn *UDPIPv4) CreateFrame(layer Layer, additionalLayers ...Layer) Layers {
+	return (*Connection)(conn).CreateFrame(layer, additionalLayers...)
+}
+
+// Send a packet with reasonable defaults. Potentially override the UDP layer in
+// the connection with the provided layer and add additionLayers.
+func (conn *UDPIPv4) Send(udp UDP, additionalLayers ...Layer) {
+	(*Connection)(conn).Send(&udp, additionalLayers...)
+}
+
+// SendFrame sends a frame on the wire and updates the state of all layers.
+func (conn *UDPIPv4) SendFrame(frame Layers) {
+	(*Connection)(conn).SendFrame(frame)
+}
+
+// SendIP sends a packet with additionalLayers following the IP layer in the
+// connection.
+func (conn *UDPIPv4) SendIP(additionalLayers ...Layer) {
+	var layersToSend Layers
+	for _, s := range conn.layerStates[:len(conn.layerStates)-1] {
+		layersToSend = append(layersToSend, s.outgoing())
+	}
+	layersToSend = append(layersToSend, additionalLayers...)
+	conn.SendFrame(layersToSend)
+}
+
+// Expect expects a frame with the UDP layer matching the provided UDP within
+// the timeout specified. If it doesn't arrive in time, an error is returned.
+func (conn *UDPIPv4) Expect(udp UDP, timeout time.Duration) (*UDP, error) {
+	conn.t.Helper()
+	layer, err := (*Connection)(conn).Expect(&udp, timeout)
+	if layer == nil {
+		return nil, err
+	}
+	gotUDP, ok := layer.(*UDP)
+	if !ok {
+		conn.t.Fatalf("expected %s to be UDP", layer)
+	}
+	return gotUDP, err
+}
+
+// ExpectData is a convenient method that expects a Layer and the Layer after
+// it. If it doens't arrive in time, it returns nil.
+func (conn *UDPIPv4) ExpectData(udp UDP, payload Payload, timeout time.Duration) (Layers, error) {
+	conn.t.Helper()
+	expected := make([]Layer, len(conn.layerStates))
+	expected[len(expected)-1] = &udp
+	if payload.length() != 0 {
+		expected = append(expected, &payload)
+	}
+	return (*Connection)(conn).ExpectFrame(expected, timeout)
+}
+
+// Close frees associated resources held by the UDPIPv4 connection.
+func (conn *UDPIPv4) Close() {
+	(*Connection)(conn).Close()
+}
+
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *UDPIPv4) Drain() {
+	conn.sniffer.Drain()
+}
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
new file mode 100644
index 000000000..2a2afecb5
--- /dev/null
+++ b/test/packetimpact/testbench/dut.go
@@ -0,0 +1,658 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"context"
+	"flag"
+	"net"
+	"strconv"
+	"syscall"
+	"testing"
+
+	pb "gvisor.dev/gvisor/test/packetimpact/proto/posix_server_go_proto"
+
+	"golang.org/x/sys/unix"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/keepalive"
+)
+
+// DUT communicates with the DUT to force it to make POSIX calls.
+type DUT struct {
+	t           *testing.T
+	conn        *grpc.ClientConn
+	posixServer POSIXClient
+}
+
+// NewDUT creates a new connection with the DUT over gRPC.
+func NewDUT(t *testing.T) DUT {
+	flag.Parse()
+	if err := genPseudoFlags(); err != nil {
+		t.Fatal("generating psuedo flags:", err)
+	}
+
+	posixServerAddress := POSIXServerIP + ":" + strconv.Itoa(POSIXServerPort)
+	conn, err := grpc.Dial(posixServerAddress, grpc.WithInsecure(), grpc.WithKeepaliveParams(keepalive.ClientParameters{Timeout: RPCKeepalive}))
+	if err != nil {
+		t.Fatalf("failed to grpc.Dial(%s): %s", posixServerAddress, err)
+	}
+	posixServer := NewPOSIXClient(conn)
+	return DUT{
+		t:           t,
+		conn:        conn,
+		posixServer: posixServer,
+	}
+}
+
+// TearDown closes the underlying connection.
+func (dut *DUT) TearDown() {
+	dut.conn.Close()
+}
+
+func (dut *DUT) sockaddrToProto(sa unix.Sockaddr) *pb.Sockaddr {
+	dut.t.Helper()
+	switch s := sa.(type) {
+	case *unix.SockaddrInet4:
+		return &pb.Sockaddr{
+			Sockaddr: &pb.Sockaddr_In{
+				In: &pb.SockaddrIn{
+					Family: unix.AF_INET,
+					Port:   uint32(s.Port),
+					Addr:   s.Addr[:],
+				},
+			},
+		}
+	case *unix.SockaddrInet6:
+		return &pb.Sockaddr{
+			Sockaddr: &pb.Sockaddr_In6{
+				In6: &pb.SockaddrIn6{
+					Family:   unix.AF_INET6,
+					Port:     uint32(s.Port),
+					Flowinfo: 0,
+					ScopeId:  s.ZoneId,
+					Addr:     s.Addr[:],
+				},
+			},
+		}
+	}
+	dut.t.Fatalf("can't parse Sockaddr: %+v", sa)
+	return nil
+}
+
+func (dut *DUT) protoToSockaddr(sa *pb.Sockaddr) unix.Sockaddr {
+	dut.t.Helper()
+	switch s := sa.Sockaddr.(type) {
+	case *pb.Sockaddr_In:
+		ret := unix.SockaddrInet4{
+			Port: int(s.In.GetPort()),
+		}
+		copy(ret.Addr[:], s.In.GetAddr())
+		return &ret
+	case *pb.Sockaddr_In6:
+		ret := unix.SockaddrInet6{
+			Port:   int(s.In6.GetPort()),
+			ZoneId: s.In6.GetScopeId(),
+		}
+		copy(ret.Addr[:], s.In6.GetAddr())
+	}
+	dut.t.Fatalf("can't parse Sockaddr: %+v", sa)
+	return nil
+}
+
+// CreateBoundSocket makes a new socket on the DUT, with type typ and protocol
+// proto, and bound to the IP address addr. Returns the new file descriptor and
+// the port that was selected on the DUT.
+func (dut *DUT) CreateBoundSocket(typ, proto int32, addr net.IP) (int32, uint16) {
+	dut.t.Helper()
+	var fd int32
+	if addr.To4() != nil {
+		fd = dut.Socket(unix.AF_INET, typ, proto)
+		sa := unix.SockaddrInet4{}
+		copy(sa.Addr[:], addr.To4())
+		dut.Bind(fd, &sa)
+	} else if addr.To16() != nil {
+		fd = dut.Socket(unix.AF_INET6, typ, proto)
+		sa := unix.SockaddrInet6{}
+		copy(sa.Addr[:], addr.To16())
+		dut.Bind(fd, &sa)
+	} else {
+		dut.t.Fatalf("unknown ip addr type for remoteIP")
+	}
+	sa := dut.GetSockName(fd)
+	var port int
+	switch s := sa.(type) {
+	case *unix.SockaddrInet4:
+		port = s.Port
+	case *unix.SockaddrInet6:
+		port = s.Port
+	default:
+		dut.t.Fatalf("unknown sockaddr type from getsockname: %t", sa)
+	}
+	return fd, uint16(port)
+}
+
+// CreateListener makes a new TCP connection. If it fails, the test ends.
+func (dut *DUT) CreateListener(typ, proto, backlog int32) (int32, uint16) {
+	fd, remotePort := dut.CreateBoundSocket(typ, proto, net.ParseIP(RemoteIPv4))
+	dut.Listen(fd, backlog)
+	return fd, remotePort
+}
+
+// All the functions that make gRPC calls to the POSIX service are below, sorted
+// alphabetically.
+
+// Accept calls accept on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// AcceptWithErrno.
+func (dut *DUT) Accept(sockfd int32) (int32, unix.Sockaddr) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	fd, sa, err := dut.AcceptWithErrno(ctx, sockfd)
+	if fd < 0 {
+		dut.t.Fatalf("failed to accept: %s", err)
+	}
+	return fd, sa
+}
+
+// AcceptWithErrno calls accept on the DUT.
+func (dut *DUT) AcceptWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) {
+	dut.t.Helper()
+	req := pb.AcceptRequest{
+		Sockfd: sockfd,
+	}
+	resp, err := dut.posixServer.Accept(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Accept: %s", err)
+	}
+	return resp.GetFd(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_())
+}
+
+// Bind calls bind on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is
+// needed, use BindWithErrno.
+func (dut *DUT) Bind(fd int32, sa unix.Sockaddr) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.BindWithErrno(ctx, fd, sa)
+	if ret != 0 {
+		dut.t.Fatalf("failed to bind socket: %s", err)
+	}
+}
+
+// BindWithErrno calls bind on the DUT.
+func (dut *DUT) BindWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr) (int32, error) {
+	dut.t.Helper()
+	req := pb.BindRequest{
+		Sockfd: fd,
+		Addr:   dut.sockaddrToProto(sa),
+	}
+	resp, err := dut.posixServer.Bind(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Bind: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// Close calls close on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// CloseWithErrno.
+func (dut *DUT) Close(fd int32) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.CloseWithErrno(ctx, fd)
+	if ret != 0 {
+		dut.t.Fatalf("failed to close: %s", err)
+	}
+}
+
+// CloseWithErrno calls close on the DUT.
+func (dut *DUT) CloseWithErrno(ctx context.Context, fd int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.CloseRequest{
+		Fd: fd,
+	}
+	resp, err := dut.posixServer.Close(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Close: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// Connect calls connect on the DUT and causes a fatal test failure if it
+// doesn't succeed. If more control over the timeout or error handling is
+// needed, use ConnectWithErrno.
+func (dut *DUT) Connect(fd int32, sa unix.Sockaddr) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.ConnectWithErrno(ctx, fd, sa)
+	// Ignore 'operation in progress' error that can be returned when the socket
+	// is non-blocking.
+	if err != syscall.Errno(unix.EINPROGRESS) && ret != 0 {
+		dut.t.Fatalf("failed to connect socket: %s", err)
+	}
+}
+
+// ConnectWithErrno calls bind on the DUT.
+func (dut *DUT) ConnectWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr) (int32, error) {
+	dut.t.Helper()
+	req := pb.ConnectRequest{
+		Sockfd: fd,
+		Addr:   dut.sockaddrToProto(sa),
+	}
+	resp, err := dut.posixServer.Connect(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Connect: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// Fcntl calls fcntl on the DUT and causes a fatal test failure if it
+// doesn't succeed. If more control over the timeout or error handling is
+// needed, use FcntlWithErrno.
+func (dut *DUT) Fcntl(fd, cmd, arg int32) int32 {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.FcntlWithErrno(ctx, fd, cmd, arg)
+	if ret == -1 {
+		dut.t.Fatalf("failed to Fcntl: ret=%d, errno=%s", ret, err)
+	}
+	return ret
+}
+
+// FcntlWithErrno calls fcntl on the DUT.
+func (dut *DUT) FcntlWithErrno(ctx context.Context, fd, cmd, arg int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.FcntlRequest{
+		Fd:  fd,
+		Cmd: cmd,
+		Arg: arg,
+	}
+	resp, err := dut.posixServer.Fcntl(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Fcntl: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// GetSockName calls getsockname on the DUT and causes a fatal test failure if
+// it doesn't succeed. If more control over the timeout or error handling is
+// needed, use GetSockNameWithErrno.
+func (dut *DUT) GetSockName(sockfd int32) unix.Sockaddr {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, sa, err := dut.GetSockNameWithErrno(ctx, sockfd)
+	if ret != 0 {
+		dut.t.Fatalf("failed to getsockname: %s", err)
+	}
+	return sa
+}
+
+// GetSockNameWithErrno calls getsockname on the DUT.
+func (dut *DUT) GetSockNameWithErrno(ctx context.Context, sockfd int32) (int32, unix.Sockaddr, error) {
+	dut.t.Helper()
+	req := pb.GetSockNameRequest{
+		Sockfd: sockfd,
+	}
+	resp, err := dut.posixServer.GetSockName(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Bind: %s", err)
+	}
+	return resp.GetRet(), dut.protoToSockaddr(resp.GetAddr()), syscall.Errno(resp.GetErrno_())
+}
+
+func (dut *DUT) getSockOpt(ctx context.Context, sockfd, level, optname, optlen int32, typ pb.GetSockOptRequest_SockOptType) (int32, *pb.SockOptVal, error) {
+	dut.t.Helper()
+	req := pb.GetSockOptRequest{
+		Sockfd:  sockfd,
+		Level:   level,
+		Optname: optname,
+		Optlen:  optlen,
+		Type:    typ,
+	}
+	resp, err := dut.posixServer.GetSockOpt(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call GetSockOpt: %s", err)
+	}
+	optval := resp.GetOptval()
+	if optval == nil {
+		dut.t.Fatalf("GetSockOpt response does not contain a value")
+	}
+	return resp.GetRet(), optval, syscall.Errno(resp.GetErrno_())
+}
+
+// GetSockOpt calls getsockopt on the DUT and causes a fatal test failure if it
+// doesn't succeed. If more control over the timeout or error handling is
+// needed, use GetSockOptWithErrno. Because endianess and the width of values
+// might differ between the testbench and DUT architectures, prefer to use a
+// more specific GetSockOptXxx function.
+func (dut *DUT) GetSockOpt(sockfd, level, optname, optlen int32) []byte {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, optval, err := dut.GetSockOptWithErrno(ctx, sockfd, level, optname, optlen)
+	if ret != 0 {
+		dut.t.Fatalf("failed to GetSockOpt: %s", err)
+	}
+	return optval
+}
+
+// GetSockOptWithErrno calls getsockopt on the DUT. Because endianess and the
+// width of values might differ between the testbench and DUT architectures,
+// prefer to use a more specific GetSockOptXxxWithErrno function.
+func (dut *DUT) GetSockOptWithErrno(ctx context.Context, sockfd, level, optname, optlen int32) (int32, []byte, error) {
+	dut.t.Helper()
+	ret, optval, errno := dut.getSockOpt(ctx, sockfd, level, optname, optlen, pb.GetSockOptRequest_BYTES)
+	bytesval, ok := optval.Val.(*pb.SockOptVal_Bytesval)
+	if !ok {
+		dut.t.Fatalf("GetSockOpt got value type: %T, want bytes", optval)
+	}
+	return ret, bytesval.Bytesval, errno
+}
+
+// GetSockOptInt calls getsockopt on the DUT and causes a fatal test failure
+// if it doesn't succeed. If more control over the int optval or error handling
+// is needed, use GetSockOptIntWithErrno.
+func (dut *DUT) GetSockOptInt(sockfd, level, optname int32) int32 {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, intval, err := dut.GetSockOptIntWithErrno(ctx, sockfd, level, optname)
+	if ret != 0 {
+		dut.t.Fatalf("failed to GetSockOptInt: %s", err)
+	}
+	return intval
+}
+
+// GetSockOptIntWithErrno calls getsockopt with an integer optval.
+func (dut *DUT) GetSockOptIntWithErrno(ctx context.Context, sockfd, level, optname int32) (int32, int32, error) {
+	dut.t.Helper()
+	ret, optval, errno := dut.getSockOpt(ctx, sockfd, level, optname, 0, pb.GetSockOptRequest_INT)
+	intval, ok := optval.Val.(*pb.SockOptVal_Intval)
+	if !ok {
+		dut.t.Fatalf("GetSockOpt got value type: %T, want int", optval)
+	}
+	return ret, intval.Intval, errno
+}
+
+// GetSockOptTimeval calls getsockopt on the DUT and causes a fatal test failure
+// if it doesn't succeed. If more control over the timeout or error handling is
+// needed, use GetSockOptTimevalWithErrno.
+func (dut *DUT) GetSockOptTimeval(sockfd, level, optname int32) unix.Timeval {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, timeval, err := dut.GetSockOptTimevalWithErrno(ctx, sockfd, level, optname)
+	if ret != 0 {
+		dut.t.Fatalf("failed to GetSockOptTimeval: %s", err)
+	}
+	return timeval
+}
+
+// GetSockOptTimevalWithErrno calls getsockopt and returns a timeval.
+func (dut *DUT) GetSockOptTimevalWithErrno(ctx context.Context, sockfd, level, optname int32) (int32, unix.Timeval, error) {
+	dut.t.Helper()
+	ret, optval, errno := dut.getSockOpt(ctx, sockfd, level, optname, 0, pb.GetSockOptRequest_TIME)
+	tv, ok := optval.Val.(*pb.SockOptVal_Timeval)
+	if !ok {
+		dut.t.Fatalf("GetSockOpt got value type: %T, want timeval", optval)
+	}
+	timeval := unix.Timeval{
+		Sec:  tv.Timeval.Seconds,
+		Usec: tv.Timeval.Microseconds,
+	}
+	return ret, timeval, errno
+}
+
+// Listen calls listen on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// ListenWithErrno.
+func (dut *DUT) Listen(sockfd, backlog int32) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.ListenWithErrno(ctx, sockfd, backlog)
+	if ret != 0 {
+		dut.t.Fatalf("failed to listen: %s", err)
+	}
+}
+
+// ListenWithErrno calls listen on the DUT.
+func (dut *DUT) ListenWithErrno(ctx context.Context, sockfd, backlog int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.ListenRequest{
+		Sockfd:  sockfd,
+		Backlog: backlog,
+	}
+	resp, err := dut.posixServer.Listen(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Listen: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// Send calls send on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// SendWithErrno.
+func (dut *DUT) Send(sockfd int32, buf []byte, flags int32) int32 {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.SendWithErrno(ctx, sockfd, buf, flags)
+	if ret == -1 {
+		dut.t.Fatalf("failed to send: %s", err)
+	}
+	return ret
+}
+
+// SendWithErrno calls send on the DUT.
+func (dut *DUT) SendWithErrno(ctx context.Context, sockfd int32, buf []byte, flags int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.SendRequest{
+		Sockfd: sockfd,
+		Buf:    buf,
+		Flags:  flags,
+	}
+	resp, err := dut.posixServer.Send(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Send: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// SendTo calls sendto on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// SendToWithErrno.
+func (dut *DUT) SendTo(sockfd int32, buf []byte, flags int32, destAddr unix.Sockaddr) int32 {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.SendToWithErrno(ctx, sockfd, buf, flags, destAddr)
+	if ret == -1 {
+		dut.t.Fatalf("failed to sendto: %s", err)
+	}
+	return ret
+}
+
+// SendToWithErrno calls sendto on the DUT.
+func (dut *DUT) SendToWithErrno(ctx context.Context, sockfd int32, buf []byte, flags int32, destAddr unix.Sockaddr) (int32, error) {
+	dut.t.Helper()
+	req := pb.SendToRequest{
+		Sockfd:   sockfd,
+		Buf:      buf,
+		Flags:    flags,
+		DestAddr: dut.sockaddrToProto(destAddr),
+	}
+	resp, err := dut.posixServer.SendTo(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("faled to call SendTo: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// SetNonBlocking will set O_NONBLOCK flag for fd if nonblocking
+// is true, otherwise it will clear the flag.
+func (dut *DUT) SetNonBlocking(fd int32, nonblocking bool) {
+	dut.t.Helper()
+	flags := dut.Fcntl(fd, unix.F_GETFL, 0)
+	if nonblocking {
+		flags |= unix.O_NONBLOCK
+	} else {
+		flags &= ^unix.O_NONBLOCK
+	}
+	dut.Fcntl(fd, unix.F_SETFL, flags)
+}
+
+func (dut *DUT) setSockOpt(ctx context.Context, sockfd, level, optname int32, optval *pb.SockOptVal) (int32, error) {
+	dut.t.Helper()
+	req := pb.SetSockOptRequest{
+		Sockfd:  sockfd,
+		Level:   level,
+		Optname: optname,
+		Optval:  optval,
+	}
+	resp, err := dut.posixServer.SetSockOpt(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call SetSockOpt: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
+// SetSockOpt calls setsockopt on the DUT and causes a fatal test failure if it
+// doesn't succeed. If more control over the timeout or error handling is
+// needed, use SetSockOptWithErrno. Because endianess and the width of values
+// might differ between the testbench and DUT architectures, prefer to use a
+// more specific SetSockOptXxx function.
+func (dut *DUT) SetSockOpt(sockfd, level, optname int32, optval []byte) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.SetSockOptWithErrno(ctx, sockfd, level, optname, optval)
+	if ret != 0 {
+		dut.t.Fatalf("failed to SetSockOpt: %s", err)
+	}
+}
+
+// SetSockOptWithErrno calls setsockopt on the DUT. Because endianess and the
+// width of values might differ between the testbench and DUT architectures,
+// prefer to use a more specific SetSockOptXxxWithErrno function.
+func (dut *DUT) SetSockOptWithErrno(ctx context.Context, sockfd, level, optname int32, optval []byte) (int32, error) {
+	dut.t.Helper()
+	return dut.setSockOpt(ctx, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Bytesval{optval}})
+}
+
+// SetSockOptInt calls setsockopt on the DUT and causes a fatal test failure
+// if it doesn't succeed. If more control over the int optval or error handling
+// is needed, use SetSockOptIntWithErrno.
+func (dut *DUT) SetSockOptInt(sockfd, level, optname, optval int32) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.SetSockOptIntWithErrno(ctx, sockfd, level, optname, optval)
+	if ret != 0 {
+		dut.t.Fatalf("failed to SetSockOptInt: %s", err)
+	}
+}
+
+// SetSockOptIntWithErrno calls setsockopt with an integer optval.
+func (dut *DUT) SetSockOptIntWithErrno(ctx context.Context, sockfd, level, optname, optval int32) (int32, error) {
+	dut.t.Helper()
+	return dut.setSockOpt(ctx, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Intval{optval}})
+}
+
+// SetSockOptTimeval calls setsockopt on the DUT and causes a fatal test failure
+// if it doesn't succeed. If more control over the timeout or error handling is
+// needed, use SetSockOptTimevalWithErrno.
+func (dut *DUT) SetSockOptTimeval(sockfd, level, optname int32, tv *unix.Timeval) {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.SetSockOptTimevalWithErrno(ctx, sockfd, level, optname, tv)
+	if ret != 0 {
+		dut.t.Fatalf("failed to SetSockOptTimeval: %s", err)
+	}
+}
+
+// SetSockOptTimevalWithErrno calls setsockopt with the timeval converted to
+// bytes.
+func (dut *DUT) SetSockOptTimevalWithErrno(ctx context.Context, sockfd, level, optname int32, tv *unix.Timeval) (int32, error) {
+	dut.t.Helper()
+	timeval := pb.Timeval{
+		Seconds:      int64(tv.Sec),
+		Microseconds: int64(tv.Usec),
+	}
+	return dut.setSockOpt(ctx, sockfd, level, optname, &pb.SockOptVal{Val: &pb.SockOptVal_Timeval{&timeval}})
+}
+
+// Socket calls socket on the DUT and returns the file descriptor. If socket
+// fails on the DUT, the test ends.
+func (dut *DUT) Socket(domain, typ, proto int32) int32 {
+	dut.t.Helper()
+	fd, err := dut.SocketWithErrno(domain, typ, proto)
+	if fd < 0 {
+		dut.t.Fatalf("failed to create socket: %s", err)
+	}
+	return fd
+}
+
+// SocketWithErrno calls socket on the DUT and returns the fd and errno.
+func (dut *DUT) SocketWithErrno(domain, typ, proto int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.SocketRequest{
+		Domain:   domain,
+		Type:     typ,
+		Protocol: proto,
+	}
+	ctx := context.Background()
+	resp, err := dut.posixServer.Socket(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Socket: %s", err)
+	}
+	return resp.GetFd(), syscall.Errno(resp.GetErrno_())
+}
+
+// Recv calls recv on the DUT and causes a fatal test failure if it doesn't
+// succeed. If more control over the timeout or error handling is needed, use
+// RecvWithErrno.
+func (dut *DUT) Recv(sockfd, len, flags int32) []byte {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, buf, err := dut.RecvWithErrno(ctx, sockfd, len, flags)
+	if ret == -1 {
+		dut.t.Fatalf("failed to recv: %s", err)
+	}
+	return buf
+}
+
+// RecvWithErrno calls recv on the DUT.
+func (dut *DUT) RecvWithErrno(ctx context.Context, sockfd, len, flags int32) (int32, []byte, error) {
+	dut.t.Helper()
+	req := pb.RecvRequest{
+		Sockfd: sockfd,
+		Len:    len,
+		Flags:  flags,
+	}
+	resp, err := dut.posixServer.Recv(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Recv: %s", err)
+	}
+	return resp.GetRet(), resp.GetBuf(), syscall.Errno(resp.GetErrno_())
+}
diff --git a/pkg/sentry/arch/arch_state_aarch64.go b/test/packetimpact/testbench/dut_client.go
index 0136a85ad..d0e68c5da 100644
--- a/pkg/sentry/arch/arch_state_aarch64.go
+++ b/test/packetimpact/testbench/dut_client.go
@@ -12,27 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build arm64
-
-package arch
+package testbench
 
 import (
-	"syscall"
+	"google.golang.org/grpc"
+	pb "gvisor.dev/gvisor/test/packetimpact/proto/posix_server_go_proto"
 )
 
-type syscallPtraceRegs struct {
-	Regs   [31]uint64
-	Sp     uint64
-	Pc     uint64
-	Pstate uint64
-}
-
-// saveRegs is invoked by stateify.
-func (s *State) saveRegs() syscallPtraceRegs {
-	return syscallPtraceRegs(s.Regs)
-}
+// PosixClient is a gRPC client for the Posix service.
+type POSIXClient pb.PosixClient
 
-// loadRegs is invoked by stateify.
-func (s *State) loadRegs(r syscallPtraceRegs) {
-	s.Regs = syscall.PtraceRegs(r)
+// NewPOSIXClient makes a new gRPC client for the POSIX service.
+func NewPOSIXClient(c grpc.ClientConnInterface) POSIXClient {
+	return pb.NewPosixClient(c)
 }
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
new file mode 100644
index 000000000..560c4111b
--- /dev/null
+++ b/test/packetimpact/testbench/layers.go
@@ -0,0 +1,1248 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"encoding/hex"
+	"fmt"
+	"reflect"
+	"strings"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"go.uber.org/multierr"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// Layer is the interface that all encapsulations must implement.
+//
+// A Layer is an encapsulation in a packet, such as TCP, IPv4, IPv6, etc. A
+// Layer contains all the fields of the encapsulation. Each field is a pointer
+// and may be nil.
+type Layer interface {
+	fmt.Stringer
+
+	// ToBytes converts the Layer into bytes. In places where the Layer's field
+	// isn't nil, the value that is pointed to is used. When the field is nil, a
+	// reasonable default for the Layer is used. For example, "64" for IPv4 TTL
+	// and a calculated checksum for TCP or IP. Some layers require information
+	// from the previous or next layers in order to compute a default, such as
+	// TCP's checksum or Ethernet's type, so each Layer has a doubly-linked list
+	// to the layer's neighbors.
+	ToBytes() ([]byte, error)
+
+	// match checks if the current Layer matches the provided Layer. If either
+	// Layer has a nil in a given field, that field is considered matching.
+	// Otherwise, the values pointed to by the fields must match. The LayerBase is
+	// ignored.
+	match(Layer) bool
+
+	// length in bytes of the current encapsulation
+	length() int
+
+	// next gets a pointer to the encapsulated Layer.
+	next() Layer
+
+	// prev gets a pointer to the Layer encapsulating this one.
+	Prev() Layer
+
+	// setNext sets the pointer to the encapsulated Layer.
+	setNext(Layer)
+
+	// setPrev sets the pointer to the Layer encapsulating this one.
+	setPrev(Layer)
+
+	// merge overrides the values in the interface with the provided values.
+	merge(Layer) error
+}
+
+// LayerBase is the common elements of all layers.
+type LayerBase struct {
+	nextLayer Layer
+	prevLayer Layer
+}
+
+func (lb *LayerBase) next() Layer {
+	return lb.nextLayer
+}
+
+// Prev returns the previous layer.
+func (lb *LayerBase) Prev() Layer {
+	return lb.prevLayer
+}
+
+func (lb *LayerBase) setNext(l Layer) {
+	lb.nextLayer = l
+}
+
+func (lb *LayerBase) setPrev(l Layer) {
+	lb.prevLayer = l
+}
+
+// equalLayer compares that two Layer structs match while ignoring field in
+// which either input has a nil and also ignoring the LayerBase of the inputs.
+func equalLayer(x, y Layer) bool {
+	if x == nil || y == nil {
+		return true
+	}
+	// opt ignores comparison pairs where either of the inputs is a nil.
+	opt := cmp.FilterValues(func(x, y interface{}) bool {
+		for _, l := range []interface{}{x, y} {
+			v := reflect.ValueOf(l)
+			if (v.Kind() == reflect.Ptr || v.Kind() == reflect.Slice) && v.IsNil() {
+				return true
+			}
+		}
+		return false
+	}, cmp.Ignore())
+	return cmp.Equal(x, y, opt, cmpopts.IgnoreTypes(LayerBase{}))
+}
+
+// mergeLayer merges y into x. Any fields for which y has a non-nil value, that
+// value overwrite the corresponding fields in x.
+func mergeLayer(x, y Layer) error {
+	if y == nil {
+		return nil
+	}
+	if reflect.TypeOf(x) != reflect.TypeOf(y) {
+		return fmt.Errorf("can't merge %T into %T", y, x)
+	}
+	vx := reflect.ValueOf(x).Elem()
+	vy := reflect.ValueOf(y).Elem()
+	t := vy.Type()
+	for i := 0; i < vy.NumField(); i++ {
+		t := t.Field(i)
+		if t.Anonymous {
+			// Ignore the LayerBase in the Layer struct.
+			continue
+		}
+		v := vy.Field(i)
+		if v.IsNil() {
+			continue
+		}
+		vx.Field(i).Set(v)
+	}
+	return nil
+}
+
+func stringLayer(l Layer) string {
+	v := reflect.ValueOf(l).Elem()
+	t := v.Type()
+	var ret []string
+	for i := 0; i < v.NumField(); i++ {
+		t := t.Field(i)
+		if t.Anonymous {
+			// Ignore the LayerBase in the Layer struct.
+			continue
+		}
+		v := v.Field(i)
+		if v.IsNil() {
+			continue
+		}
+		v = reflect.Indirect(v)
+		if v.Kind() == reflect.Slice && v.Type().Elem().Kind() == reflect.Uint8 {
+			ret = append(ret, fmt.Sprintf("%s:\n%v", t.Name, hex.Dump(v.Bytes())))
+		} else {
+			ret = append(ret, fmt.Sprintf("%s:%v", t.Name, v))
+		}
+	}
+	return fmt.Sprintf("&%s{%s}", t, strings.Join(ret, " "))
+}
+
+// Ether can construct and match an ethernet encapsulation.
+type Ether struct {
+	LayerBase
+	SrcAddr *tcpip.LinkAddress
+	DstAddr *tcpip.LinkAddress
+	Type    *tcpip.NetworkProtocolNumber
+}
+
+func (l *Ether) String() string {
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *Ether) ToBytes() ([]byte, error) {
+	b := make([]byte, header.EthernetMinimumSize)
+	h := header.Ethernet(b)
+	fields := &header.EthernetFields{}
+	if l.SrcAddr != nil {
+		fields.SrcAddr = *l.SrcAddr
+	}
+	if l.DstAddr != nil {
+		fields.DstAddr = *l.DstAddr
+	}
+	if l.Type != nil {
+		fields.Type = *l.Type
+	} else {
+		switch n := l.next().(type) {
+		case *IPv4:
+			fields.Type = header.IPv4ProtocolNumber
+		case *IPv6:
+			fields.Type = header.IPv6ProtocolNumber
+		default:
+			return nil, fmt.Errorf("ethernet header's next layer is unrecognized: %#v", n)
+		}
+	}
+	h.Encode(fields)
+	return h, nil
+}
+
+// LinkAddress is a helper routine that allocates a new tcpip.LinkAddress value
+// to store v and returns a pointer to it.
+func LinkAddress(v tcpip.LinkAddress) *tcpip.LinkAddress {
+	return &v
+}
+
+// NetworkProtocolNumber is a helper routine that allocates a new
+// tcpip.NetworkProtocolNumber value to store v and returns a pointer to it.
+func NetworkProtocolNumber(v tcpip.NetworkProtocolNumber) *tcpip.NetworkProtocolNumber {
+	return &v
+}
+
+// layerParser parses the input bytes and returns a Layer along with the next
+// layerParser to run. If there is no more parsing to do, the returned
+// layerParser is nil.
+type layerParser func([]byte) (Layer, layerParser)
+
+// parse parses bytes starting with the first layerParser and using successive
+// layerParsers until all the bytes are parsed.
+func parse(parser layerParser, b []byte) Layers {
+	var layers Layers
+	for {
+		var layer Layer
+		layer, parser = parser(b)
+		layers = append(layers, layer)
+		if parser == nil {
+			break
+		}
+		b = b[layer.length():]
+	}
+	layers.linkLayers()
+	return layers
+}
+
+// parseEther parses the bytes assuming that they start with an ethernet header
+// and continues parsing further encapsulations.
+func parseEther(b []byte) (Layer, layerParser) {
+	h := header.Ethernet(b)
+	ether := Ether{
+		SrcAddr: LinkAddress(h.SourceAddress()),
+		DstAddr: LinkAddress(h.DestinationAddress()),
+		Type:    NetworkProtocolNumber(h.Type()),
+	}
+	var nextParser layerParser
+	switch h.Type() {
+	case header.IPv4ProtocolNumber:
+		nextParser = parseIPv4
+	case header.IPv6ProtocolNumber:
+		nextParser = parseIPv6
+	default:
+		// Assume that the rest is a payload.
+		nextParser = parsePayload
+	}
+	return &ether, nextParser
+}
+
+func (l *Ether) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *Ether) length() int {
+	return header.EthernetMinimumSize
+}
+
+// merge implements Layer.merge.
+func (l *Ether) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// IPv4 can construct and match an IPv4 encapsulation.
+type IPv4 struct {
+	LayerBase
+	IHL            *uint8
+	TOS            *uint8
+	TotalLength    *uint16
+	ID             *uint16
+	Flags          *uint8
+	FragmentOffset *uint16
+	TTL            *uint8
+	Protocol       *uint8
+	Checksum       *uint16
+	SrcAddr        *tcpip.Address
+	DstAddr        *tcpip.Address
+}
+
+func (l *IPv4) String() string {
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *IPv4) ToBytes() ([]byte, error) {
+	b := make([]byte, header.IPv4MinimumSize)
+	h := header.IPv4(b)
+	fields := &header.IPv4Fields{
+		IHL:            20,
+		TOS:            0,
+		TotalLength:    0,
+		ID:             0,
+		Flags:          0,
+		FragmentOffset: 0,
+		TTL:            64,
+		Protocol:       0,
+		Checksum:       0,
+		SrcAddr:        tcpip.Address(""),
+		DstAddr:        tcpip.Address(""),
+	}
+	if l.TOS != nil {
+		fields.TOS = *l.TOS
+	}
+	if l.TotalLength != nil {
+		fields.TotalLength = *l.TotalLength
+	} else {
+		fields.TotalLength = uint16(l.length())
+		current := l.next()
+		for current != nil {
+			fields.TotalLength += uint16(current.length())
+			current = current.next()
+		}
+	}
+	if l.ID != nil {
+		fields.ID = *l.ID
+	}
+	if l.Flags != nil {
+		fields.Flags = *l.Flags
+	}
+	if l.FragmentOffset != nil {
+		fields.FragmentOffset = *l.FragmentOffset
+	}
+	if l.TTL != nil {
+		fields.TTL = *l.TTL
+	}
+	if l.Protocol != nil {
+		fields.Protocol = *l.Protocol
+	} else {
+		switch n := l.next().(type) {
+		case *TCP:
+			fields.Protocol = uint8(header.TCPProtocolNumber)
+		case *UDP:
+			fields.Protocol = uint8(header.UDPProtocolNumber)
+		case *ICMPv4:
+			fields.Protocol = uint8(header.ICMPv4ProtocolNumber)
+		default:
+			// TODO(b/150301488): Support more protocols as needed.
+			return nil, fmt.Errorf("ipv4 header's next layer is unrecognized: %#v", n)
+		}
+	}
+	if l.SrcAddr != nil {
+		fields.SrcAddr = *l.SrcAddr
+	}
+	if l.DstAddr != nil {
+		fields.DstAddr = *l.DstAddr
+	}
+	if l.Checksum != nil {
+		fields.Checksum = *l.Checksum
+	}
+	h.Encode(fields)
+	if l.Checksum == nil {
+		h.SetChecksum(^h.CalculateChecksum())
+	}
+	return h, nil
+}
+
+// Uint16 is a helper routine that allocates a new
+// uint16 value to store v and returns a pointer to it.
+func Uint16(v uint16) *uint16 {
+	return &v
+}
+
+// Uint8 is a helper routine that allocates a new
+// uint8 value to store v and returns a pointer to it.
+func Uint8(v uint8) *uint8 {
+	return &v
+}
+
+// Address is a helper routine that allocates a new tcpip.Address value to store
+// v and returns a pointer to it.
+func Address(v tcpip.Address) *tcpip.Address {
+	return &v
+}
+
+// parseIPv4 parses the bytes assuming that they start with an ipv4 header and
+// continues parsing further encapsulations.
+func parseIPv4(b []byte) (Layer, layerParser) {
+	h := header.IPv4(b)
+	tos, _ := h.TOS()
+	ipv4 := IPv4{
+		IHL:            Uint8(h.HeaderLength()),
+		TOS:            &tos,
+		TotalLength:    Uint16(h.TotalLength()),
+		ID:             Uint16(h.ID()),
+		Flags:          Uint8(h.Flags()),
+		FragmentOffset: Uint16(h.FragmentOffset()),
+		TTL:            Uint8(h.TTL()),
+		Protocol:       Uint8(h.Protocol()),
+		Checksum:       Uint16(h.Checksum()),
+		SrcAddr:        Address(h.SourceAddress()),
+		DstAddr:        Address(h.DestinationAddress()),
+	}
+	var nextParser layerParser
+	switch h.TransportProtocol() {
+	case header.TCPProtocolNumber:
+		nextParser = parseTCP
+	case header.UDPProtocolNumber:
+		nextParser = parseUDP
+	case header.ICMPv4ProtocolNumber:
+		nextParser = parseICMPv4
+	default:
+		// Assume that the rest is a payload.
+		nextParser = parsePayload
+	}
+	return &ipv4, nextParser
+}
+
+func (l *IPv4) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *IPv4) length() int {
+	if l.IHL == nil {
+		return header.IPv4MinimumSize
+	}
+	return int(*l.IHL)
+}
+
+// merge implements Layer.merge.
+func (l *IPv4) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// IPv6 can construct and match an IPv6 encapsulation.
+type IPv6 struct {
+	LayerBase
+	TrafficClass  *uint8
+	FlowLabel     *uint32
+	PayloadLength *uint16
+	NextHeader    *uint8
+	HopLimit      *uint8
+	SrcAddr       *tcpip.Address
+	DstAddr       *tcpip.Address
+}
+
+func (l *IPv6) String() string {
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *IPv6) ToBytes() ([]byte, error) {
+	b := make([]byte, header.IPv6MinimumSize)
+	h := header.IPv6(b)
+	fields := &header.IPv6Fields{
+		HopLimit: 64,
+	}
+	if l.TrafficClass != nil {
+		fields.TrafficClass = *l.TrafficClass
+	}
+	if l.FlowLabel != nil {
+		fields.FlowLabel = *l.FlowLabel
+	}
+	if l.PayloadLength != nil {
+		fields.PayloadLength = *l.PayloadLength
+	} else {
+		for current := l.next(); current != nil; current = current.next() {
+			fields.PayloadLength += uint16(current.length())
+		}
+	}
+	if l.NextHeader != nil {
+		fields.NextHeader = *l.NextHeader
+	} else {
+		switch n := l.next().(type) {
+		case *TCP:
+			fields.NextHeader = uint8(header.TCPProtocolNumber)
+		case *UDP:
+			fields.NextHeader = uint8(header.UDPProtocolNumber)
+		case *ICMPv6:
+			fields.NextHeader = uint8(header.ICMPv6ProtocolNumber)
+		default:
+			// TODO(b/150301488): Support more protocols as needed.
+			return nil, fmt.Errorf("ToBytes can't deduce the IPv6 header's next protocol: %#v", n)
+		}
+	}
+	if l.HopLimit != nil {
+		fields.HopLimit = *l.HopLimit
+	}
+	if l.SrcAddr != nil {
+		fields.SrcAddr = *l.SrcAddr
+	}
+	if l.DstAddr != nil {
+		fields.DstAddr = *l.DstAddr
+	}
+	h.Encode(fields)
+	return h, nil
+}
+
+// parseIPv6 parses the bytes assuming that they start with an ipv6 header and
+// continues parsing further encapsulations.
+func parseIPv6(b []byte) (Layer, layerParser) {
+	h := header.IPv6(b)
+	tos, flowLabel := h.TOS()
+	ipv6 := IPv6{
+		TrafficClass:  &tos,
+		FlowLabel:     &flowLabel,
+		PayloadLength: Uint16(h.PayloadLength()),
+		NextHeader:    Uint8(h.NextHeader()),
+		HopLimit:      Uint8(h.HopLimit()),
+		SrcAddr:       Address(h.SourceAddress()),
+		DstAddr:       Address(h.DestinationAddress()),
+	}
+	var nextParser layerParser
+	switch h.TransportProtocol() {
+	case header.TCPProtocolNumber:
+		nextParser = parseTCP
+	case header.UDPProtocolNumber:
+		nextParser = parseUDP
+	case header.ICMPv6ProtocolNumber:
+		nextParser = parseICMPv6
+	default:
+		// Assume that the rest is a payload.
+		nextParser = parsePayload
+	}
+	return &ipv6, nextParser
+}
+
+func (l *IPv6) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *IPv6) length() int {
+	return header.IPv6MinimumSize
+}
+
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *IPv6) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// ICMPv6 can construct and match an ICMPv6 encapsulation.
+type ICMPv6 struct {
+	LayerBase
+	Type       *header.ICMPv6Type
+	Code       *byte
+	Checksum   *uint16
+	NDPPayload []byte
+}
+
+func (l *ICMPv6) String() string {
+	// TODO(eyalsoha): Do something smarter here when *l.Type is ParameterProblem?
+	// We could parse the contents of the Payload as if it were an IPv6 packet.
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *ICMPv6) ToBytes() ([]byte, error) {
+	b := make([]byte, header.ICMPv6HeaderSize+len(l.NDPPayload))
+	h := header.ICMPv6(b)
+	if l.Type != nil {
+		h.SetType(*l.Type)
+	}
+	if l.Code != nil {
+		h.SetCode(*l.Code)
+	}
+	copy(h.NDPPayload(), l.NDPPayload)
+	if l.Checksum != nil {
+		h.SetChecksum(*l.Checksum)
+	} else {
+		ipv6 := l.Prev().(*IPv6)
+		h.SetChecksum(header.ICMPv6Checksum(h, *ipv6.SrcAddr, *ipv6.DstAddr, buffer.VectorisedView{}))
+	}
+	return h, nil
+}
+
+// ICMPv6Type is a helper routine that allocates a new ICMPv6Type value to store
+// v and returns a pointer to it.
+func ICMPv6Type(v header.ICMPv6Type) *header.ICMPv6Type {
+	return &v
+}
+
+// Byte is a helper routine that allocates a new byte value to store
+// v and returns a pointer to it.
+func Byte(v byte) *byte {
+	return &v
+}
+
+// parseICMPv6 parses the bytes assuming that they start with an ICMPv6 header.
+func parseICMPv6(b []byte) (Layer, layerParser) {
+	h := header.ICMPv6(b)
+	icmpv6 := ICMPv6{
+		Type:       ICMPv6Type(h.Type()),
+		Code:       Byte(h.Code()),
+		Checksum:   Uint16(h.Checksum()),
+		NDPPayload: h.NDPPayload(),
+	}
+	return &icmpv6, nil
+}
+
+func (l *ICMPv6) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *ICMPv6) length() int {
+	return header.ICMPv6HeaderSize + len(l.NDPPayload)
+}
+
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *ICMPv6) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// ICMPv4Type is a helper routine that allocates a new header.ICMPv4Type value
+// to store t and returns a pointer to it.
+func ICMPv4Type(t header.ICMPv4Type) *header.ICMPv4Type {
+	return &t
+}
+
+// ICMPv4 can construct and match an ICMPv4 encapsulation.
+type ICMPv4 struct {
+	LayerBase
+	Type     *header.ICMPv4Type
+	Code     *uint8
+	Checksum *uint16
+}
+
+func (l *ICMPv4) String() string {
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *ICMPv4) ToBytes() ([]byte, error) {
+	b := make([]byte, header.ICMPv4MinimumSize)
+	h := header.ICMPv4(b)
+	if l.Type != nil {
+		h.SetType(*l.Type)
+	}
+	if l.Code != nil {
+		h.SetCode(byte(*l.Code))
+	}
+	if l.Checksum != nil {
+		h.SetChecksum(*l.Checksum)
+		return h, nil
+	}
+	payload, err := payload(l)
+	if err != nil {
+		return nil, err
+	}
+	h.SetChecksum(header.ICMPv4Checksum(h, payload))
+	return h, nil
+}
+
+// parseICMPv4 parses the bytes as an ICMPv4 header, returning a Layer and a
+// parser for the encapsulated payload.
+func parseICMPv4(b []byte) (Layer, layerParser) {
+	h := header.ICMPv4(b)
+	icmpv4 := ICMPv4{
+		Type:     ICMPv4Type(h.Type()),
+		Code:     Uint8(h.Code()),
+		Checksum: Uint16(h.Checksum()),
+	}
+	return &icmpv4, parsePayload
+}
+
+func (l *ICMPv4) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *ICMPv4) length() int {
+	return header.ICMPv4MinimumSize
+}
+
+// merge overrides the values in l with the values from other but only in fields
+// where the value is not nil.
+func (l *ICMPv4) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// TCP can construct and match a TCP encapsulation.
+type TCP struct {
+	LayerBase
+	SrcPort       *uint16
+	DstPort       *uint16
+	SeqNum        *uint32
+	AckNum        *uint32
+	DataOffset    *uint8
+	Flags         *uint8
+	WindowSize    *uint16
+	Checksum      *uint16
+	UrgentPointer *uint16
+	Options       []byte
+}
+
+func (l *TCP) String() string {
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *TCP) ToBytes() ([]byte, error) {
+	b := make([]byte, l.length())
+	h := header.TCP(b)
+	if l.SrcPort != nil {
+		h.SetSourcePort(*l.SrcPort)
+	}
+	if l.DstPort != nil {
+		h.SetDestinationPort(*l.DstPort)
+	}
+	if l.SeqNum != nil {
+		h.SetSequenceNumber(*l.SeqNum)
+	}
+	if l.AckNum != nil {
+		h.SetAckNumber(*l.AckNum)
+	}
+	if l.DataOffset != nil {
+		h.SetDataOffset(*l.DataOffset)
+	} else {
+		h.SetDataOffset(uint8(l.length()))
+	}
+	if l.Flags != nil {
+		h.SetFlags(*l.Flags)
+	}
+	if l.WindowSize != nil {
+		h.SetWindowSize(*l.WindowSize)
+	} else {
+		h.SetWindowSize(32768)
+	}
+	if l.UrgentPointer != nil {
+		h.SetUrgentPoiner(*l.UrgentPointer)
+	}
+	copy(b[header.TCPMinimumSize:], l.Options)
+	header.AddTCPOptionPadding(b[header.TCPMinimumSize:], len(l.Options))
+	if l.Checksum != nil {
+		h.SetChecksum(*l.Checksum)
+		return h, nil
+	}
+	if err := setTCPChecksum(&h, l); err != nil {
+		return nil, err
+	}
+	return h, nil
+}
+
+// totalLength returns the length of the provided layer and all following
+// layers.
+func totalLength(l Layer) int {
+	var totalLength int
+	for ; l != nil; l = l.next() {
+		totalLength += l.length()
+	}
+	return totalLength
+}
+
+// payload returns a buffer.VectorisedView of l's payload.
+func payload(l Layer) (buffer.VectorisedView, error) {
+	var payloadBytes buffer.VectorisedView
+	for current := l.next(); current != nil; current = current.next() {
+		payload, err := current.ToBytes()
+		if err != nil {
+			return buffer.VectorisedView{}, fmt.Errorf("can't get bytes for next header: %s", payload)
+		}
+		payloadBytes.AppendView(payload)
+	}
+	return payloadBytes, nil
+}
+
+// layerChecksum calculates the checksum of the Layer header, including the
+// peusdeochecksum of the layer before it and all the bytes after it.
+func layerChecksum(l Layer, protoNumber tcpip.TransportProtocolNumber) (uint16, error) {
+	totalLength := uint16(totalLength(l))
+	var xsum uint16
+	switch s := l.Prev().(type) {
+	case *IPv4:
+		xsum = header.PseudoHeaderChecksum(protoNumber, *s.SrcAddr, *s.DstAddr, totalLength)
+	default:
+		// TODO(b/150301488): Support more protocols, like IPv6.
+		return 0, fmt.Errorf("can't get src and dst addr from previous layer: %#v", s)
+	}
+	payloadBytes, err := payload(l)
+	if err != nil {
+		return 0, err
+	}
+	xsum = header.ChecksumVV(payloadBytes, xsum)
+	return xsum, nil
+}
+
+// setTCPChecksum calculates the checksum of the TCP header and sets it in h.
+func setTCPChecksum(h *header.TCP, tcp *TCP) error {
+	h.SetChecksum(0)
+	xsum, err := layerChecksum(tcp, header.TCPProtocolNumber)
+	if err != nil {
+		return err
+	}
+	h.SetChecksum(^h.CalculateChecksum(xsum))
+	return nil
+}
+
+// Uint32 is a helper routine that allocates a new
+// uint32 value to store v and returns a pointer to it.
+func Uint32(v uint32) *uint32 {
+	return &v
+}
+
+// parseTCP parses the bytes assuming that they start with a tcp header and
+// continues parsing further encapsulations.
+func parseTCP(b []byte) (Layer, layerParser) {
+	h := header.TCP(b)
+	tcp := TCP{
+		SrcPort:       Uint16(h.SourcePort()),
+		DstPort:       Uint16(h.DestinationPort()),
+		SeqNum:        Uint32(h.SequenceNumber()),
+		AckNum:        Uint32(h.AckNumber()),
+		DataOffset:    Uint8(h.DataOffset()),
+		Flags:         Uint8(h.Flags()),
+		WindowSize:    Uint16(h.WindowSize()),
+		Checksum:      Uint16(h.Checksum()),
+		UrgentPointer: Uint16(h.UrgentPointer()),
+		Options:       b[header.TCPMinimumSize:h.DataOffset()],
+	}
+	return &tcp, parsePayload
+}
+
+func (l *TCP) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *TCP) length() int {
+	if l.DataOffset == nil {
+		// TCP header including the options must end on a 32-bit
+		// boundary; the user could potentially give us a slice
+		// whose length is not a multiple of 4 bytes, so we have
+		// to do the alignment here.
+		optlen := (len(l.Options) + 3) & ^3
+		return header.TCPMinimumSize + optlen
+	}
+	return int(*l.DataOffset)
+}
+
+// merge implements Layer.merge.
+func (l *TCP) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// UDP can construct and match a UDP encapsulation.
+type UDP struct {
+	LayerBase
+	SrcPort  *uint16
+	DstPort  *uint16
+	Length   *uint16
+	Checksum *uint16
+}
+
+func (l *UDP) String() string {
+	return stringLayer(l)
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *UDP) ToBytes() ([]byte, error) {
+	b := make([]byte, header.UDPMinimumSize)
+	h := header.UDP(b)
+	if l.SrcPort != nil {
+		h.SetSourcePort(*l.SrcPort)
+	}
+	if l.DstPort != nil {
+		h.SetDestinationPort(*l.DstPort)
+	}
+	if l.Length != nil {
+		h.SetLength(*l.Length)
+	} else {
+		h.SetLength(uint16(totalLength(l)))
+	}
+	if l.Checksum != nil {
+		h.SetChecksum(*l.Checksum)
+		return h, nil
+	}
+	if err := setUDPChecksum(&h, l); err != nil {
+		return nil, err
+	}
+	return h, nil
+}
+
+// setUDPChecksum calculates the checksum of the UDP header and sets it in h.
+func setUDPChecksum(h *header.UDP, udp *UDP) error {
+	h.SetChecksum(0)
+	xsum, err := layerChecksum(udp, header.UDPProtocolNumber)
+	if err != nil {
+		return err
+	}
+	h.SetChecksum(^h.CalculateChecksum(xsum))
+	return nil
+}
+
+// parseUDP parses the bytes assuming that they start with a udp header and
+// returns the parsed layer and the next parser to use.
+func parseUDP(b []byte) (Layer, layerParser) {
+	h := header.UDP(b)
+	udp := UDP{
+		SrcPort:  Uint16(h.SourcePort()),
+		DstPort:  Uint16(h.DestinationPort()),
+		Length:   Uint16(h.Length()),
+		Checksum: Uint16(h.Checksum()),
+	}
+	return &udp, parsePayload
+}
+
+func (l *UDP) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *UDP) length() int {
+	return header.UDPMinimumSize
+}
+
+// merge implements Layer.merge.
+func (l *UDP) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// Payload has bytes beyond OSI layer 4.
+type Payload struct {
+	LayerBase
+	Bytes []byte
+}
+
+func (l *Payload) String() string {
+	return stringLayer(l)
+}
+
+// parsePayload parses the bytes assuming that they start with a payload and
+// continue to the end. There can be no further encapsulations.
+func parsePayload(b []byte) (Layer, layerParser) {
+	payload := Payload{
+		Bytes: b,
+	}
+	return &payload, nil
+}
+
+// ToBytes implements Layer.ToBytes.
+func (l *Payload) ToBytes() ([]byte, error) {
+	return l.Bytes, nil
+}
+
+// Length returns payload byte length.
+func (l *Payload) Length() int {
+	return l.length()
+}
+
+func (l *Payload) match(other Layer) bool {
+	return equalLayer(l, other)
+}
+
+func (l *Payload) length() int {
+	return len(l.Bytes)
+}
+
+// merge implements Layer.merge.
+func (l *Payload) merge(other Layer) error {
+	return mergeLayer(l, other)
+}
+
+// Layers is an array of Layer and supports similar functions to Layer.
+type Layers []Layer
+
+// linkLayers sets the linked-list ponters in ls.
+func (ls *Layers) linkLayers() {
+	for i, l := range *ls {
+		if i > 0 {
+			l.setPrev((*ls)[i-1])
+		} else {
+			l.setPrev(nil)
+		}
+		if i+1 < len(*ls) {
+			l.setNext((*ls)[i+1])
+		} else {
+			l.setNext(nil)
+		}
+	}
+}
+
+// ToBytes converts the Layers into bytes. It creates a linked list of the Layer
+// structs and then concatentates the output of ToBytes on each Layer.
+func (ls *Layers) ToBytes() ([]byte, error) {
+	ls.linkLayers()
+	outBytes := []byte{}
+	for _, l := range *ls {
+		layerBytes, err := l.ToBytes()
+		if err != nil {
+			return nil, err
+		}
+		outBytes = append(outBytes, layerBytes...)
+	}
+	return outBytes, nil
+}
+
+func (ls *Layers) match(other Layers) bool {
+	if len(*ls) > len(other) {
+		return false
+	}
+	for i, l := range *ls {
+		if !equalLayer(l, other[i]) {
+			return false
+		}
+	}
+	return true
+}
+
+// layerDiff stores the diffs for each field along with the label for the Layer.
+// If rows is nil, that means that there was no diff.
+type layerDiff struct {
+	label string
+	rows  []layerDiffRow
+}
+
+// layerDiffRow stores the fields and corresponding values for two got and want
+// layers. If the value was nil then the string stored is the empty string.
+type layerDiffRow struct {
+	field, got, want string
+}
+
+// diffLayer extracts all differing fields between two layers.
+func diffLayer(got, want Layer) []layerDiffRow {
+	vGot := reflect.ValueOf(got).Elem()
+	vWant := reflect.ValueOf(want).Elem()
+	if vGot.Type() != vWant.Type() {
+		return nil
+	}
+	t := vGot.Type()
+	var result []layerDiffRow
+	for i := 0; i < t.NumField(); i++ {
+		t := t.Field(i)
+		if t.Anonymous {
+			// Ignore the LayerBase in the Layer struct.
+			continue
+		}
+		vGot := vGot.Field(i)
+		vWant := vWant.Field(i)
+		gotString := ""
+		if !vGot.IsNil() {
+			gotString = fmt.Sprint(reflect.Indirect(vGot))
+		}
+		wantString := ""
+		if !vWant.IsNil() {
+			wantString = fmt.Sprint(reflect.Indirect(vWant))
+		}
+		result = append(result, layerDiffRow{t.Name, gotString, wantString})
+	}
+	return result
+}
+
+// layerType returns a concise string describing the type of the Layer, like
+// "TCP", or "IPv6".
+func layerType(l Layer) string {
+	return reflect.TypeOf(l).Elem().Name()
+}
+
+// diff compares Layers and returns a representation of the difference. Each
+// Layer in the Layers is pairwise compared. If an element in either is nil, it
+// is considered a match with the other Layer. If two Layers have differing
+// types, they don't match regardless of the contents. If two Layers have the
+// same type then the fields in the Layer are pairwise compared. Fields that are
+// nil always match. Two non-nil fields only match if they point to equal
+// values. diff returns an empty string if and only if *ls and other match.
+func (ls *Layers) diff(other Layers) string {
+	var allDiffs []layerDiff
+	// Check the cases where one list is longer than the other, where one or both
+	// elements are nil, where the sides have different types, and where the sides
+	// have the same type.
+	for i := 0; i < len(*ls) || i < len(other); i++ {
+		if i >= len(*ls) {
+			// Matching ls against other where other is longer than ls. missing
+			// matches everything so we just include a label without any rows. Having
+			// no rows is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: "missing matches " + layerType(other[i]),
+			})
+			continue
+		}
+
+		if i >= len(other) {
+			// Matching ls against other where ls is longer than other. missing
+			// matches everything so we just include a label without any rows. Having
+			// no rows is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: layerType((*ls)[i]) + " matches missing",
+			})
+			continue
+		}
+
+		if (*ls)[i] == nil && other[i] == nil {
+			// Matching ls against other where both elements are nil. nil matches
+			// everything so we just include a label without any rows. Having no rows
+			// is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: "nil matches nil",
+			})
+			continue
+		}
+
+		if (*ls)[i] == nil {
+			// Matching ls against other where the element in ls is nil. nil matches
+			// everything so we just include a label without any rows. Having no rows
+			// is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: "nil matches " + layerType(other[i]),
+			})
+			continue
+		}
+
+		if other[i] == nil {
+			// Matching ls against other where the element in other is nil. nil
+			// matches everything so we just include a label without any rows. Having
+			// no rows is a sign that there was no diff.
+			allDiffs = append(allDiffs, layerDiff{
+				label: layerType((*ls)[i]) + " matches nil",
+			})
+			continue
+		}
+
+		if reflect.TypeOf((*ls)[i]) == reflect.TypeOf(other[i]) {
+			// Matching ls against other where both elements have the same type. Match
+			// each field pairwise and only report a diff if there is a mismatch,
+			// which is only when both sides are non-nil and have differring values.
+			diff := diffLayer((*ls)[i], other[i])
+			var layerDiffRows []layerDiffRow
+			for _, d := range diff {
+				if d.got == "" || d.want == "" || d.got == d.want {
+					continue
+				}
+				layerDiffRows = append(layerDiffRows, layerDiffRow{
+					d.field,
+					d.got,
+					d.want,
+				})
+			}
+			if len(layerDiffRows) > 0 {
+				allDiffs = append(allDiffs, layerDiff{
+					label: layerType((*ls)[i]),
+					rows:  layerDiffRows,
+				})
+			} else {
+				allDiffs = append(allDiffs, layerDiff{
+					label: layerType((*ls)[i]) + " matches " + layerType(other[i]),
+					// Having no rows is a sign that there was no diff.
+				})
+			}
+			continue
+		}
+		// Neither side is nil and the types are different, so we'll display one
+		// side then the other.
+		allDiffs = append(allDiffs, layerDiff{
+			label: layerType((*ls)[i]) + " doesn't match " + layerType(other[i]),
+		})
+		diff := diffLayer((*ls)[i], (*ls)[i])
+		layerDiffRows := []layerDiffRow{}
+		for _, d := range diff {
+			if len(d.got) == 0 {
+				continue
+			}
+			layerDiffRows = append(layerDiffRows, layerDiffRow{
+				d.field,
+				d.got,
+				"",
+			})
+		}
+		allDiffs = append(allDiffs, layerDiff{
+			label: layerType((*ls)[i]),
+			rows:  layerDiffRows,
+		})
+
+		layerDiffRows = []layerDiffRow{}
+		diff = diffLayer(other[i], other[i])
+		for _, d := range diff {
+			if len(d.want) == 0 {
+				continue
+			}
+			layerDiffRows = append(layerDiffRows, layerDiffRow{
+				d.field,
+				"",
+				d.want,
+			})
+		}
+		allDiffs = append(allDiffs, layerDiff{
+			label: layerType(other[i]),
+			rows:  layerDiffRows,
+		})
+	}
+
+	output := ""
+	// These are for output formatting.
+	maxLabelLen, maxFieldLen, maxGotLen, maxWantLen := 0, 0, 0, 0
+	foundOne := false
+	for _, l := range allDiffs {
+		if len(l.label) > maxLabelLen && len(l.rows) > 0 {
+			maxLabelLen = len(l.label)
+		}
+		if l.rows != nil {
+			foundOne = true
+		}
+		for _, r := range l.rows {
+			if len(r.field) > maxFieldLen {
+				maxFieldLen = len(r.field)
+			}
+			if l := len(fmt.Sprint(r.got)); l > maxGotLen {
+				maxGotLen = l
+			}
+			if l := len(fmt.Sprint(r.want)); l > maxWantLen {
+				maxWantLen = l
+			}
+		}
+	}
+	if !foundOne {
+		return ""
+	}
+	for _, l := range allDiffs {
+		if len(l.rows) == 0 {
+			output += "(" + l.label + ")\n"
+			continue
+		}
+		for i, r := range l.rows {
+			var label string
+			if i == 0 {
+				label = l.label + ":"
+			}
+			output += fmt.Sprintf(
+				"%*s %*s %*v %*v\n",
+				maxLabelLen+1, label,
+				maxFieldLen+1, r.field+":",
+				maxGotLen, r.got,
+				maxWantLen, r.want,
+			)
+		}
+	}
+	return output
+}
+
+// merge merges the other Layers into ls. If the other Layers is longer, those
+// additional Layer structs are added to ls. The errors from merging are
+// collected and returned.
+func (ls *Layers) merge(other Layers) error {
+	var errs error
+	for i, o := range other {
+		if i < len(*ls) {
+			errs = multierr.Combine(errs, (*ls)[i].merge(o))
+		} else {
+			*ls = append(*ls, o)
+		}
+	}
+	return errs
+}
diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go
new file mode 100644
index 000000000..c7f00e70d
--- /dev/null
+++ b/test/packetimpact/testbench/layers_test.go
@@ -0,0 +1,507 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"bytes"
+	"net"
+	"testing"
+
+	"github.com/mohae/deepcopy"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+func TestLayerMatch(t *testing.T) {
+	var nilPayload *Payload
+	noPayload := &Payload{}
+	emptyPayload := &Payload{Bytes: []byte{}}
+	fullPayload := &Payload{Bytes: []byte{1, 2, 3}}
+	emptyTCP := &TCP{SrcPort: Uint16(1234), LayerBase: LayerBase{nextLayer: emptyPayload}}
+	fullTCP := &TCP{SrcPort: Uint16(1234), LayerBase: LayerBase{nextLayer: fullPayload}}
+	for _, tt := range []struct {
+		a, b Layer
+		want bool
+	}{
+		{nilPayload, nilPayload, true},
+		{nilPayload, noPayload, true},
+		{nilPayload, emptyPayload, true},
+		{nilPayload, fullPayload, true},
+		{noPayload, noPayload, true},
+		{noPayload, emptyPayload, true},
+		{noPayload, fullPayload, true},
+		{emptyPayload, emptyPayload, true},
+		{emptyPayload, fullPayload, false},
+		{fullPayload, fullPayload, true},
+		{emptyTCP, fullTCP, true},
+	} {
+		if got := tt.a.match(tt.b); got != tt.want {
+			t.Errorf("%s.match(%s) = %t, want %t", tt.a, tt.b, got, tt.want)
+		}
+		if got := tt.b.match(tt.a); got != tt.want {
+			t.Errorf("%s.match(%s) = %t, want %t", tt.b, tt.a, got, tt.want)
+		}
+	}
+}
+
+func TestLayerMergeMismatch(t *testing.T) {
+	tcp := &TCP{}
+	otherTCP := &TCP{}
+	ipv4 := &IPv4{}
+	ether := &Ether{}
+	for _, tt := range []struct {
+		a, b    Layer
+		success bool
+	}{
+		{tcp, tcp, true},
+		{tcp, otherTCP, true},
+		{tcp, ipv4, false},
+		{tcp, ether, false},
+		{tcp, nil, true},
+
+		{otherTCP, otherTCP, true},
+		{otherTCP, ipv4, false},
+		{otherTCP, ether, false},
+		{otherTCP, nil, true},
+
+		{ipv4, ipv4, true},
+		{ipv4, ether, false},
+		{ipv4, nil, true},
+
+		{ether, ether, true},
+		{ether, nil, true},
+	} {
+		if err := tt.a.merge(tt.b); (err == nil) != tt.success {
+			t.Errorf("%s.merge(%s) got %s, wanted the opposite", tt.a, tt.b, err)
+		}
+		if tt.b != nil {
+			if err := tt.b.merge(tt.a); (err == nil) != tt.success {
+				t.Errorf("%s.merge(%s) got %s, wanted the opposite", tt.b, tt.a, err)
+			}
+		}
+	}
+}
+
+func TestLayerMerge(t *testing.T) {
+	zero := Uint32(0)
+	one := Uint32(1)
+	two := Uint32(2)
+	empty := []byte{}
+	foo := []byte("foo")
+	bar := []byte("bar")
+	for _, tt := range []struct {
+		a, b Layer
+		want Layer
+	}{
+		{&TCP{AckNum: nil}, &TCP{AckNum: nil}, &TCP{AckNum: nil}},
+		{&TCP{AckNum: nil}, &TCP{AckNum: zero}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: nil}, &TCP{AckNum: one}, &TCP{AckNum: one}},
+		{&TCP{AckNum: nil}, &TCP{AckNum: two}, &TCP{AckNum: two}},
+		{&TCP{AckNum: nil}, nil, &TCP{AckNum: nil}},
+
+		{&TCP{AckNum: zero}, &TCP{AckNum: nil}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: zero}, &TCP{AckNum: zero}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: zero}, &TCP{AckNum: one}, &TCP{AckNum: one}},
+		{&TCP{AckNum: zero}, &TCP{AckNum: two}, &TCP{AckNum: two}},
+		{&TCP{AckNum: zero}, nil, &TCP{AckNum: zero}},
+
+		{&TCP{AckNum: one}, &TCP{AckNum: nil}, &TCP{AckNum: one}},
+		{&TCP{AckNum: one}, &TCP{AckNum: zero}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: one}, &TCP{AckNum: one}, &TCP{AckNum: one}},
+		{&TCP{AckNum: one}, &TCP{AckNum: two}, &TCP{AckNum: two}},
+		{&TCP{AckNum: one}, nil, &TCP{AckNum: one}},
+
+		{&TCP{AckNum: two}, &TCP{AckNum: nil}, &TCP{AckNum: two}},
+		{&TCP{AckNum: two}, &TCP{AckNum: zero}, &TCP{AckNum: zero}},
+		{&TCP{AckNum: two}, &TCP{AckNum: one}, &TCP{AckNum: one}},
+		{&TCP{AckNum: two}, &TCP{AckNum: two}, &TCP{AckNum: two}},
+		{&TCP{AckNum: two}, nil, &TCP{AckNum: two}},
+
+		{&Payload{Bytes: nil}, &Payload{Bytes: nil}, &Payload{Bytes: nil}},
+		{&Payload{Bytes: nil}, &Payload{Bytes: empty}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: nil}, &Payload{Bytes: foo}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: nil}, &Payload{Bytes: bar}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: nil}, nil, &Payload{Bytes: nil}},
+
+		{&Payload{Bytes: empty}, &Payload{Bytes: nil}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: empty}, &Payload{Bytes: empty}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: empty}, &Payload{Bytes: foo}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: empty}, &Payload{Bytes: bar}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: empty}, nil, &Payload{Bytes: empty}},
+
+		{&Payload{Bytes: foo}, &Payload{Bytes: nil}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: foo}, &Payload{Bytes: empty}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: foo}, &Payload{Bytes: foo}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: foo}, &Payload{Bytes: bar}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: foo}, nil, &Payload{Bytes: foo}},
+
+		{&Payload{Bytes: bar}, &Payload{Bytes: nil}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: bar}, &Payload{Bytes: empty}, &Payload{Bytes: empty}},
+		{&Payload{Bytes: bar}, &Payload{Bytes: foo}, &Payload{Bytes: foo}},
+		{&Payload{Bytes: bar}, &Payload{Bytes: bar}, &Payload{Bytes: bar}},
+		{&Payload{Bytes: bar}, nil, &Payload{Bytes: bar}},
+	} {
+		a := deepcopy.Copy(tt.a).(Layer)
+		if err := a.merge(tt.b); err != nil {
+			t.Errorf("%s.merge(%s) = %s, wanted nil", tt.a, tt.b, err)
+			continue
+		}
+		if a.String() != tt.want.String() {
+			t.Errorf("%s.merge(%s) merge result got %s, want %s", tt.a, tt.b, a, tt.want)
+		}
+	}
+}
+
+func TestLayerStringFormat(t *testing.T) {
+	for _, tt := range []struct {
+		name string
+		l    Layer
+		want string
+	}{
+		{
+			name: "TCP",
+			l: &TCP{
+				SrcPort:    Uint16(34785),
+				DstPort:    Uint16(47767),
+				SeqNum:     Uint32(3452155723),
+				AckNum:     Uint32(2596996163),
+				DataOffset: Uint8(5),
+				Flags:      Uint8(20),
+				WindowSize: Uint16(64240),
+				Checksum:   Uint16(0x2e2b),
+			},
+			want: "&testbench.TCP{" +
+				"SrcPort:34785 " +
+				"DstPort:47767 " +
+				"SeqNum:3452155723 " +
+				"AckNum:2596996163 " +
+				"DataOffset:5 " +
+				"Flags:20 " +
+				"WindowSize:64240 " +
+				"Checksum:11819" +
+				"}",
+		},
+		{
+			name: "UDP",
+			l: &UDP{
+				SrcPort: Uint16(34785),
+				DstPort: Uint16(47767),
+				Length:  Uint16(12),
+			},
+			want: "&testbench.UDP{" +
+				"SrcPort:34785 " +
+				"DstPort:47767 " +
+				"Length:12" +
+				"}",
+		},
+		{
+			name: "IPv4",
+			l: &IPv4{
+				IHL:            Uint8(5),
+				TOS:            Uint8(0),
+				TotalLength:    Uint16(44),
+				ID:             Uint16(0),
+				Flags:          Uint8(2),
+				FragmentOffset: Uint16(0),
+				TTL:            Uint8(64),
+				Protocol:       Uint8(6),
+				Checksum:       Uint16(0x2e2b),
+				SrcAddr:        Address(tcpip.Address([]byte{197, 34, 63, 10})),
+				DstAddr:        Address(tcpip.Address([]byte{197, 34, 63, 20})),
+			},
+			want: "&testbench.IPv4{" +
+				"IHL:5 " +
+				"TOS:0 " +
+				"TotalLength:44 " +
+				"ID:0 " +
+				"Flags:2 " +
+				"FragmentOffset:0 " +
+				"TTL:64 " +
+				"Protocol:6 " +
+				"Checksum:11819 " +
+				"SrcAddr:197.34.63.10 " +
+				"DstAddr:197.34.63.20" +
+				"}",
+		},
+		{
+			name: "Ether",
+			l: &Ether{
+				SrcAddr: LinkAddress(tcpip.LinkAddress([]byte{0x02, 0x42, 0xc5, 0x22, 0x3f, 0x0a})),
+				DstAddr: LinkAddress(tcpip.LinkAddress([]byte{0x02, 0x42, 0xc5, 0x22, 0x3f, 0x14})),
+				Type:    NetworkProtocolNumber(4),
+			},
+			want: "&testbench.Ether{" +
+				"SrcAddr:02:42:c5:22:3f:0a " +
+				"DstAddr:02:42:c5:22:3f:14 " +
+				"Type:4" +
+				"}",
+		},
+		{
+			name: "Payload",
+			l: &Payload{
+				Bytes: []byte("Hooray for packetimpact."),
+			},
+			want: "&testbench.Payload{Bytes:\n" +
+				"00000000  48 6f 6f 72 61 79 20 66  6f 72 20 70 61 63 6b 65  |Hooray for packe|\n" +
+				"00000010  74 69 6d 70 61 63 74 2e                           |timpact.|\n" +
+				"}",
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := tt.l.String(); got != tt.want {
+				t.Errorf("%s.String() = %s, want: %s", tt.name, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestConnectionMatch(t *testing.T) {
+	conn := Connection{
+		layerStates: []layerState{&etherState{}},
+	}
+	protoNum0 := tcpip.NetworkProtocolNumber(0)
+	protoNum1 := tcpip.NetworkProtocolNumber(1)
+	for _, tt := range []struct {
+		description        string
+		override, received Layers
+		wantMatch          bool
+	}{
+		{
+			description: "shorter override",
+			override:    []Layer{&Ether{}},
+			received:    []Layer{&Ether{}, &Payload{Bytes: []byte("hello")}},
+			wantMatch:   true,
+		},
+		{
+			description: "longer override",
+			override:    []Layer{&Ether{}, &Payload{Bytes: []byte("hello")}},
+			received:    []Layer{&Ether{}},
+			wantMatch:   false,
+		},
+		{
+			description: "ether layer mismatch",
+			override:    []Layer{&Ether{Type: &protoNum0}},
+			received:    []Layer{&Ether{Type: &protoNum1}},
+			wantMatch:   false,
+		},
+		{
+			description: "both nil",
+			override:    nil,
+			received:    nil,
+			wantMatch:   false,
+		},
+		{
+			description: "nil override",
+			override:    nil,
+			received:    []Layer{&Ether{}},
+			wantMatch:   true,
+		},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			if gotMatch := conn.match(tt.override, tt.received); gotMatch != tt.wantMatch {
+				t.Fatalf("conn.match(%s, %s) = %t, want %t", tt.override, tt.received, gotMatch, tt.wantMatch)
+			}
+		})
+	}
+}
+
+func TestLayersDiff(t *testing.T) {
+	for _, tt := range []struct {
+		x, y Layers
+		want string
+	}{
+		{
+			Layers{&Ether{Type: NetworkProtocolNumber(12)}, &TCP{DataOffset: Uint8(5), SeqNum: Uint32(5)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"Ether:       Type: 12 13\n" +
+				"  TCP:     SeqNum:  5  6\n" +
+				"       DataOffset:  5  7\n",
+		},
+		{
+			Layers{&Ether{Type: NetworkProtocolNumber(12)}, &UDP{SrcPort: Uint16(123)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"Ether:       Type:  12 13\n" +
+				"(UDP doesn't match TCP)\n" +
+				"  UDP:    SrcPort: 123   \n" +
+				"  TCP:     SeqNum:      6\n" +
+				"       DataOffset:      7\n",
+		},
+		{
+			Layers{&UDP{SrcPort: Uint16(123)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"(UDP doesn't match Ether)\n" +
+				"  UDP: SrcPort: 123   \n" +
+				"Ether:    Type:     13\n" +
+				"(missing matches TCP)\n",
+		},
+		{
+			Layers{nil, &UDP{SrcPort: Uint16(123)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"(nil matches Ether)\n" +
+				"(UDP doesn't match TCP)\n" +
+				"UDP:    SrcPort: 123  \n" +
+				"TCP:     SeqNum:     6\n" +
+				"     DataOffset:     7\n",
+		},
+		{
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &IPv4{IHL: Uint8(4)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			Layers{&Ether{Type: NetworkProtocolNumber(13)}, &IPv4{IHL: Uint8(6)}, &TCP{DataOffset: Uint8(7), SeqNum: Uint32(6)}},
+			"(Ether matches Ether)\n" +
+				"IPv4: IHL: 4 6\n" +
+				"(TCP matches TCP)\n",
+		},
+		{
+			Layers{&Payload{Bytes: []byte("foo")}},
+			Layers{&Payload{Bytes: []byte("bar")}},
+			"Payload: Bytes: [102 111 111] [98 97 114]\n",
+		},
+		{
+			Layers{&Payload{Bytes: []byte("")}},
+			Layers{&Payload{}},
+			"",
+		},
+		{
+			Layers{&Payload{Bytes: []byte("")}},
+			Layers{&Payload{Bytes: []byte("")}},
+			"",
+		},
+		{
+			Layers{&UDP{}},
+			Layers{&TCP{}},
+			"(UDP doesn't match TCP)\n" +
+				"(UDP)\n" +
+				"(TCP)\n",
+		},
+	} {
+		if got := tt.x.diff(tt.y); got != tt.want {
+			t.Errorf("%s.diff(%s) = %q, want %q", tt.x, tt.y, got, tt.want)
+		}
+		if tt.x.match(tt.y) != (tt.x.diff(tt.y) == "") {
+			t.Errorf("match and diff of %s and %s disagree", tt.x, tt.y)
+		}
+		if tt.y.match(tt.x) != (tt.y.diff(tt.x) == "") {
+			t.Errorf("match and diff of %s and %s disagree", tt.y, tt.x)
+		}
+	}
+}
+
+func TestTCPOptions(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		wantBytes   []byte
+		wantLayers  Layers
+	}{
+		{
+			description: "without payload",
+			wantBytes: []byte{
+				// IPv4 Header
+				0x45, 0x00, 0x00, 0x2c, 0x00, 0x01, 0x00, 0x00, 0x40, 0x06,
+				0xf9, 0x77, 0xc0, 0xa8, 0x00, 0x02, 0xc0, 0xa8, 0x00, 0x01,
+				// TCP Header
+				0x30, 0x39, 0xd4, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x60, 0x02, 0x20, 0x00, 0xf5, 0x1c, 0x00, 0x00,
+				// WindowScale Option
+				0x03, 0x03, 0x02,
+				// NOP Option
+				0x00,
+			},
+			wantLayers: []Layer{
+				&IPv4{
+					IHL:            Uint8(20),
+					TOS:            Uint8(0),
+					TotalLength:    Uint16(44),
+					ID:             Uint16(1),
+					Flags:          Uint8(0),
+					FragmentOffset: Uint16(0),
+					TTL:            Uint8(64),
+					Protocol:       Uint8(uint8(header.TCPProtocolNumber)),
+					Checksum:       Uint16(0xf977),
+					SrcAddr:        Address(tcpip.Address(net.ParseIP("192.168.0.2").To4())),
+					DstAddr:        Address(tcpip.Address(net.ParseIP("192.168.0.1").To4())),
+				},
+				&TCP{
+					SrcPort:       Uint16(12345),
+					DstPort:       Uint16(54321),
+					SeqNum:        Uint32(0),
+					AckNum:        Uint32(0),
+					Flags:         Uint8(header.TCPFlagSyn),
+					WindowSize:    Uint16(8192),
+					Checksum:      Uint16(0xf51c),
+					UrgentPointer: Uint16(0),
+					Options:       []byte{3, 3, 2, 0},
+				},
+				&Payload{Bytes: nil},
+			},
+		},
+		{
+			description: "with payload",
+			wantBytes: []byte{
+				// IPv4 header
+				0x45, 0x00, 0x00, 0x37, 0x00, 0x01, 0x00, 0x00, 0x40, 0x06,
+				0xf9, 0x6c, 0xc0, 0xa8, 0x00, 0x02, 0xc0, 0xa8, 0x00, 0x01,
+				// TCP header
+				0x30, 0x39, 0xd4, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x60, 0x02, 0x20, 0x00, 0xe5, 0x21, 0x00, 0x00,
+				// WindowScale Option
+				0x03, 0x03, 0x02,
+				// NOP Option
+				0x00,
+				// Payload: "Sample Data"
+				0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x44, 0x61, 0x74, 0x61,
+			},
+			wantLayers: []Layer{
+				&IPv4{
+					IHL:            Uint8(20),
+					TOS:            Uint8(0),
+					TotalLength:    Uint16(55),
+					ID:             Uint16(1),
+					Flags:          Uint8(0),
+					FragmentOffset: Uint16(0),
+					TTL:            Uint8(64),
+					Protocol:       Uint8(uint8(header.TCPProtocolNumber)),
+					Checksum:       Uint16(0xf96c),
+					SrcAddr:        Address(tcpip.Address(net.ParseIP("192.168.0.2").To4())),
+					DstAddr:        Address(tcpip.Address(net.ParseIP("192.168.0.1").To4())),
+				},
+				&TCP{
+					SrcPort:       Uint16(12345),
+					DstPort:       Uint16(54321),
+					SeqNum:        Uint32(0),
+					AckNum:        Uint32(0),
+					Flags:         Uint8(header.TCPFlagSyn),
+					WindowSize:    Uint16(8192),
+					Checksum:      Uint16(0xe521),
+					UrgentPointer: Uint16(0),
+					Options:       []byte{3, 3, 2, 0},
+				},
+				&Payload{Bytes: []byte("Sample Data")},
+			},
+		},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			layers := parse(parseIPv4, tt.wantBytes)
+			if !layers.match(tt.wantLayers) {
+				t.Fatalf("match failed with diff: %s", layers.diff(tt.wantLayers))
+			}
+			gotBytes, err := layers.ToBytes()
+			if err != nil {
+				t.Fatalf("ToBytes() failed on %s: %s", &layers, err)
+			}
+			if !bytes.Equal(tt.wantBytes, gotBytes) {
+				t.Fatalf("mismatching bytes, gotBytes: %x, wantBytes: %x", gotBytes, tt.wantBytes)
+			}
+		})
+	}
+}
diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go
new file mode 100644
index 000000000..278229b7e
--- /dev/null
+++ b/test/packetimpact/testbench/rawsockets.go
@@ -0,0 +1,178 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"encoding/binary"
+	"fmt"
+	"math"
+	"net"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Sniffer can sniff raw packets on the wire.
+type Sniffer struct {
+	t  *testing.T
+	fd int
+}
+
+func htons(x uint16) uint16 {
+	buf := [2]byte{}
+	binary.BigEndian.PutUint16(buf[:], x)
+	return usermem.ByteOrder.Uint16(buf[:])
+}
+
+// NewSniffer creates a Sniffer connected to *device.
+func NewSniffer(t *testing.T) (Sniffer, error) {
+	snifferFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL)))
+	if err != nil {
+		return Sniffer{}, err
+	}
+	if err := unix.SetsockoptInt(snifferFd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, 1); err != nil {
+		t.Fatalf("can't set sockopt SO_RCVBUFFORCE to 1: %s", err)
+	}
+	if err := unix.SetsockoptInt(snifferFd, unix.SOL_SOCKET, unix.SO_RCVBUF, 1e7); err != nil {
+		t.Fatalf("can't setsockopt SO_RCVBUF to 10M: %s", err)
+	}
+	return Sniffer{
+		t:  t,
+		fd: snifferFd,
+	}, nil
+}
+
+// maxReadSize should be large enough for the maximum frame size in bytes. If a
+// packet too large for the buffer arrives, the test will get a fatal error.
+const maxReadSize int = 65536
+
+// Recv tries to read one frame until the timeout is up.
+func (s *Sniffer) Recv(timeout time.Duration) []byte {
+	deadline := time.Now().Add(timeout)
+	for {
+		timeout = deadline.Sub(time.Now())
+		if timeout <= 0 {
+			return nil
+		}
+		whole, frac := math.Modf(timeout.Seconds())
+		tv := unix.Timeval{
+			Sec:  int64(whole),
+			Usec: int64(frac * float64(time.Microsecond/time.Second)),
+		}
+
+		if err := unix.SetsockoptTimeval(s.fd, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &tv); err != nil {
+			s.t.Fatalf("can't setsockopt SO_RCVTIMEO: %s", err)
+		}
+
+		buf := make([]byte, maxReadSize)
+		nread, _, err := unix.Recvfrom(s.fd, buf, unix.MSG_TRUNC)
+		if err == unix.EINTR || err == unix.EAGAIN {
+			// There was a timeout.
+			continue
+		}
+		if err != nil {
+			s.t.Fatalf("can't read: %s", err)
+		}
+		if nread > maxReadSize {
+			s.t.Fatalf("received a truncated frame of %d bytes", nread)
+		}
+		return buf[:nread]
+	}
+}
+
+// Drain drains the Sniffer's socket receive buffer by receiving until there's
+// nothing else to receive.
+func (s *Sniffer) Drain() {
+	s.t.Helper()
+	flags, err := unix.FcntlInt(uintptr(s.fd), unix.F_GETFL, 0)
+	if err != nil {
+		s.t.Fatalf("failed to get sniffer socket fd flags: %s", err)
+	}
+	if _, err := unix.FcntlInt(uintptr(s.fd), unix.F_SETFL, flags|unix.O_NONBLOCK); err != nil {
+		s.t.Fatalf("failed to make sniffer socket non-blocking: %s", err)
+	}
+	for {
+		buf := make([]byte, maxReadSize)
+		_, _, err := unix.Recvfrom(s.fd, buf, unix.MSG_TRUNC)
+		if err == unix.EINTR || err == unix.EAGAIN || err == unix.EWOULDBLOCK {
+			break
+		}
+	}
+	if _, err := unix.FcntlInt(uintptr(s.fd), unix.F_SETFL, flags); err != nil {
+		s.t.Fatalf("failed to restore sniffer socket fd flags: %s", err)
+	}
+}
+
+// close the socket that Sniffer is using.
+func (s *Sniffer) close() error {
+	if err := unix.Close(s.fd); err != nil {
+		return fmt.Errorf("can't close sniffer socket: %w", err)
+	}
+	s.fd = -1
+	return nil
+}
+
+// Injector can inject raw frames.
+type Injector struct {
+	t  *testing.T
+	fd int
+}
+
+// NewInjector creates a new injector on *device.
+func NewInjector(t *testing.T) (Injector, error) {
+	ifInfo, err := net.InterfaceByName(Device)
+	if err != nil {
+		return Injector{}, err
+	}
+
+	var haddr [8]byte
+	copy(haddr[:], ifInfo.HardwareAddr)
+	sa := unix.SockaddrLinklayer{
+		Protocol: unix.ETH_P_IP,
+		Ifindex:  ifInfo.Index,
+		Halen:    uint8(len(ifInfo.HardwareAddr)),
+		Addr:     haddr,
+	}
+
+	injectFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL)))
+	if err != nil {
+		return Injector{}, err
+	}
+	if err := unix.Bind(injectFd, &sa); err != nil {
+		return Injector{}, err
+	}
+	return Injector{
+		t:  t,
+		fd: injectFd,
+	}, nil
+}
+
+// Send a raw frame.
+func (i *Injector) Send(b []byte) {
+	if _, err := unix.Write(i.fd, b); err != nil {
+		i.t.Fatalf("can't write: %s of len %d", err, len(b))
+	}
+}
+
+// close the underlying socket.
+func (i *Injector) close() error {
+	if err := unix.Close(i.fd); err != nil {
+		return fmt.Errorf("can't close sniffer socket: %w", err)
+	}
+	i.fd = -1
+	return nil
+}
diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go
new file mode 100644
index 000000000..4de2aa1d3
--- /dev/null
+++ b/test/packetimpact/testbench/testbench.go
@@ -0,0 +1,90 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testbench
+
+import (
+	"flag"
+	"fmt"
+	"net"
+	"os/exec"
+	"time"
+
+	"gvisor.dev/gvisor/test/packetimpact/netdevs"
+)
+
+var (
+	// Device is the local device on the test network.
+	Device = ""
+	// LocalIPv4 is the local IPv4 address on the test network.
+	LocalIPv4 = ""
+	// LocalIPv6 is the local IPv6 address on the test network.
+	LocalIPv6 = ""
+	// LocalMAC is the local MAC address on the test network.
+	LocalMAC = ""
+	// POSIXServerIP is the POSIX server's IP address on the control network.
+	POSIXServerIP = ""
+	// POSIXServerPort is the UDP port the POSIX server is bound to on the
+	// control network.
+	POSIXServerPort = 40000
+	// RemoteIPv4 is the DUT's IPv4 address on the test network.
+	RemoteIPv4 = ""
+	// RemoteIPv6 is the DUT's IPv6 address on the test network.
+	RemoteIPv6 = ""
+	// RemoteMAC is the DUT's MAC address on the test network.
+	RemoteMAC = ""
+	// RPCKeepalive is the gRPC keepalive.
+	RPCKeepalive = 10 * time.Second
+	// RPCTimeout is the gRPC timeout.
+	RPCTimeout = 100 * time.Millisecond
+)
+
+// RegisterFlags defines flags and associates them with the package-level
+// exported variables above. It should be called by tests in their init
+// functions.
+func RegisterFlags(fs *flag.FlagSet) {
+	fs.StringVar(&POSIXServerIP, "posix_server_ip", POSIXServerIP, "ip address to listen to for UDP commands")
+	fs.IntVar(&POSIXServerPort, "posix_server_port", POSIXServerPort, "port to listen to for UDP commands")
+	fs.DurationVar(&RPCTimeout, "rpc_timeout", RPCTimeout, "gRPC timeout")
+	fs.DurationVar(&RPCKeepalive, "rpc_keepalive", RPCKeepalive, "gRPC keepalive")
+	fs.StringVar(&LocalIPv4, "local_ipv4", LocalIPv4, "local IPv4 address for test packets")
+	fs.StringVar(&RemoteIPv4, "remote_ipv4", RemoteIPv4, "remote IPv4 address for test packets")
+	fs.StringVar(&RemoteIPv6, "remote_ipv6", RemoteIPv6, "remote IPv6 address for test packets")
+	fs.StringVar(&RemoteMAC, "remote_mac", RemoteMAC, "remote mac address for test packets")
+	fs.StringVar(&Device, "device", Device, "local device for test packets")
+}
+
+// genPseudoFlags populates flag-like global config based on real flags.
+//
+// genPseudoFlags must only be called after flag.Parse.
+func genPseudoFlags() error {
+	out, err := exec.Command("ip", "addr", "show").CombinedOutput()
+	if err != nil {
+		return fmt.Errorf("listing devices: %q: %w", string(out), err)
+	}
+	devs, err := netdevs.ParseDevices(string(out))
+	if err != nil {
+		return fmt.Errorf("parsing devices: %w", err)
+	}
+
+	_, deviceInfo, err := netdevs.FindDeviceByIP(net.ParseIP(LocalIPv4), devs)
+	if err != nil {
+		return fmt.Errorf("can't find deviceInfo: %w", err)
+	}
+
+	LocalMAC = deviceInfo.MAC.String()
+	LocalIPv6 = deviceInfo.IPv6Addr.String()
+
+	return nil
+}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
new file mode 100644
index 000000000..2a41ef326
--- /dev/null
+++ b/test/packetimpact/tests/BUILD
@@ -0,0 +1,242 @@
+load("//test/packetimpact/runner:defs.bzl", "packetimpact_go_test")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+packetimpact_go_test(
+    name = "fin_wait2_timeout",
+    srcs = ["fin_wait2_timeout_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "ipv4_id_uniqueness",
+    srcs = ["ipv4_id_uniqueness_test.go"],
+    # TODO(b/157506701) Fix netstack then remove the line below.
+    expect_netstack_failure = True,
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "udp_recv_multicast",
+    srcs = ["udp_recv_multicast_test.go"],
+    # TODO(b/152813495): Fix netstack then remove the line below.
+    expect_netstack_failure = True,
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "udp_icmp_error_propagation",
+    srcs = ["udp_icmp_error_propagation_test.go"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_window_shrink",
+    srcs = ["tcp_window_shrink_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_zero_window_probe",
+    srcs = ["tcp_zero_window_probe_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_zero_window_probe_retransmit",
+    srcs = ["tcp_zero_window_probe_retransmit_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_zero_window_probe_usertimeout",
+    srcs = ["tcp_zero_window_probe_usertimeout_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_retransmits",
+    srcs = ["tcp_retransmits_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_outside_the_window",
+    srcs = ["tcp_outside_the_window_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_noaccept_close_rst",
+    srcs = ["tcp_noaccept_close_rst_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_send_window_sizes_piggyback",
+    srcs = ["tcp_send_window_sizes_piggyback_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_close_wait_ack",
+    srcs = ["tcp_close_wait_ack_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_paws_mechanism",
+    srcs = ["tcp_paws_mechanism_test.go"],
+    # TODO(b/156682000): Fix netstack then remove the line below.
+    expect_netstack_failure = True,
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_user_timeout",
+    srcs = ["tcp_user_timeout_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_queue_receive_in_syn_sent",
+    srcs = ["tcp_queue_receive_in_syn_sent_test.go"],
+    # TODO(b/157658105): Fix netstack then remove the line below.
+    expect_netstack_failure = True,
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_synsent_reset",
+    srcs = ["tcp_synsent_reset_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_synrcvd_reset",
+    srcs = ["tcp_synrcvd_reset_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_splitseg_mss",
+    srcs = ["tcp_splitseg_mss_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_cork_mss",
+    srcs = ["tcp_cork_mss_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "icmpv6_param_problem",
+    srcs = ["icmpv6_param_problem_test.go"],
+    # TODO(b/153485026): Fix netstack then remove the line below.
+    expect_netstack_failure = True,
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "udp_send_recv_dgram",
+    srcs = ["udp_send_recv_dgram_test.go"],
+    deps = [
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go
new file mode 100644
index 000000000..407565078
--- /dev/null
+++ b/test/packetimpact/tests/fin_wait2_timeout_test.go
@@ -0,0 +1,75 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fin_wait2_timeout_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestFinWait2Timeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		linger2     bool
+	}{
+		{"WithLinger2", true},
+		{"WithoutLinger2", false},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(listenFd)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close()
+			conn.Connect()
+
+			acceptFd, _ := dut.Accept(listenFd)
+			if tt.linger2 {
+				tv := unix.Timeval{Sec: 1, Usec: 0}
+				dut.SetSockOptTimeval(acceptFd, unix.SOL_TCP, unix.TCP_LINGER2, &tv)
+			}
+			dut.Close(acceptFd)
+
+			if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+				t.Fatalf("expected a FIN-ACK within 1 second but got none: %s", err)
+			}
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+			time.Sleep(5 * time.Second)
+			conn.Drain()
+
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+			if tt.linger2 {
+				if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+					t.Fatalf("expected a RST packet within a second but got none: %s", err)
+				}
+			} else {
+				if got, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, 10*time.Second); got != nil || err == nil {
+					t.Fatalf("expected no RST packets within ten seconds but got one: %s", got)
+				}
+			}
+		})
+	}
+}
diff --git a/test/packetimpact/tests/icmpv6_param_problem_test.go b/test/packetimpact/tests/icmpv6_param_problem_test.go
new file mode 100644
index 000000000..961059fc1
--- /dev/null
+++ b/test/packetimpact/tests/icmpv6_param_problem_test.go
@@ -0,0 +1,78 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package icmpv6_param_problem_test
+
+import (
+	"encoding/binary"
+	"flag"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestICMPv6ParamProblemTest sends a packet with a bad next header. The DUT
+// should respond with an ICMPv6 Parameter Problem message.
+func TestICMPv6ParamProblemTest(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{})
+	defer conn.Close()
+	ipv6 := testbench.IPv6{
+		// 254 is reserved and used for experimentation and testing. This should
+		// cause an error.
+		NextHeader: testbench.Uint8(254),
+	}
+	icmpv6 := testbench.ICMPv6{
+		Type:       testbench.ICMPv6Type(header.ICMPv6EchoRequest),
+		NDPPayload: []byte("hello world"),
+	}
+
+	toSend := conn.CreateFrame(ipv6, &icmpv6)
+	conn.SendFrame(toSend)
+
+	// Build the expected ICMPv6 payload, which includes an index to the
+	// problematic byte and also the problematic packet as described in
+	// https://tools.ietf.org/html/rfc4443#page-12 .
+	ipv6Sent := toSend[1:]
+	expectedPayload, err := ipv6Sent.ToBytes()
+	if err != nil {
+		t.Fatalf("can't convert %s to bytes: %s", ipv6Sent, err)
+	}
+
+	// The problematic field is the NextHeader.
+	b := make([]byte, 4)
+	binary.BigEndian.PutUint32(b, header.IPv6NextHeaderOffset)
+	expectedPayload = append(b, expectedPayload...)
+	expectedICMPv6 := testbench.ICMPv6{
+		Type:       testbench.ICMPv6Type(header.ICMPv6ParamProblem),
+		NDPPayload: expectedPayload,
+	}
+
+	paramProblem := testbench.Layers{
+		&testbench.Ether{},
+		&testbench.IPv6{},
+		&expectedICMPv6,
+	}
+	timeout := time.Second
+	if _, err := conn.ExpectFrame(paramProblem, timeout); err != nil {
+		t.Errorf("expected %s within %s but got none: %s", paramProblem, timeout, err)
+	}
+}
diff --git a/test/packetimpact/tests/ipv4_id_uniqueness_test.go b/test/packetimpact/tests/ipv4_id_uniqueness_test.go
new file mode 100644
index 000000000..4efb9829c
--- /dev/null
+++ b/test/packetimpact/tests/ipv4_id_uniqueness_test.go
@@ -0,0 +1,111 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv4_id_uniqueness_test
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func recvTCPSegment(conn *testbench.TCPIPv4, expect *testbench.TCP, expectPayload *testbench.Payload) (uint16, error) {
+	layers, err := conn.ExpectData(expect, expectPayload, time.Second)
+	if err != nil {
+		return 0, fmt.Errorf("failed to receive TCP segment: %s", err)
+	}
+	if len(layers) < 2 {
+		return 0, fmt.Errorf("got packet with layers: %v, expected to have at least 2 layers (link and network)", layers)
+	}
+	ipv4, ok := layers[1].(*testbench.IPv4)
+	if !ok {
+		return 0, fmt.Errorf("got network layer: %T, expected: *IPv4", layers[1])
+	}
+	if *ipv4.Flags&header.IPv4FlagDontFragment != 0 {
+		return 0, fmt.Errorf("got IPv4 DF=1, expected DF=0")
+	}
+	return *ipv4.ID, nil
+}
+
+// RFC 6864 section 4.2 states: "The IPv4 ID of non-atomic datagrams MUST NOT
+// be reused when sending a copy of an earlier non-atomic datagram."
+//
+// This test creates a TCP connection, uses the IP_MTU_DISCOVER socket option
+// to force the DF bit to be 0, and checks that a retransmitted segment has a
+// different IPv4 Identification value than the original segment.
+func TestIPv4RetransmitIdentificationUniqueness(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Connect()
+	remoteFD, _ := dut.Accept(listenFD)
+	defer dut.Close(remoteFD)
+
+	dut.SetSockOptInt(remoteFD, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	// TODO(b/129291778) The following socket option clears the DF bit on
+	// IP packets sent over the socket, and is currently not supported by
+	// gVisor. gVisor by default sends packets with DF=0 anyway, so the
+	// socket option being not supported does not affect the operation of
+	// this test. Once the socket option is supported, the following call
+	// can be changed to simply assert success.
+	ret, errno := dut.SetSockOptIntWithErrno(context.Background(), remoteFD, unix.IPPROTO_IP, linux.IP_MTU_DISCOVER, linux.IP_PMTUDISC_DONT)
+	if ret == -1 && errno != unix.ENOTSUP {
+		t.Fatalf("failed to set IP_MTU_DISCOVER socket option to IP_PMTUDISC_DONT: %s", errno)
+	}
+
+	sampleData := []byte("Sample Data")
+	samplePayload := &testbench.Payload{Bytes: sampleData}
+
+	dut.Send(remoteFD, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("failed to receive TCP segment sent for RTT calculation: %s", err)
+	}
+	// Let the DUT estimate RTO with RTT from the DATA-ACK.
+	// TODO(gvisor.dev/issue/2685) Estimate RTO during handshake, after which
+	// we can skip sending this ACK.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+	expectTCP := &testbench.TCP{SeqNum: testbench.Uint32(uint32(*conn.RemoteSeqNum()))}
+	dut.Send(remoteFD, sampleData, 0)
+	originalID, err := recvTCPSegment(&conn, expectTCP, samplePayload)
+	if err != nil {
+		t.Fatalf("failed to receive TCP segment: %s", err)
+	}
+
+	retransmitID, err := recvTCPSegment(&conn, expectTCP, samplePayload)
+	if err != nil {
+		t.Fatalf("failed to receive retransmitted TCP segment: %s", err)
+	}
+	if originalID == retransmitID {
+		t.Fatalf("unexpectedly got retransmitted TCP segment with same IPv4 ID field=%d", originalID)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_close_wait_ack_test.go b/test/packetimpact/tests/tcp_close_wait_ack_test.go
new file mode 100644
index 000000000..6e7ff41d7
--- /dev/null
+++ b/test/packetimpact/tests/tcp_close_wait_ack_test.go
@@ -0,0 +1,108 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_close_wait_ack_test
+
+import (
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestCloseWaitAck(t *testing.T) {
+	for _, tt := range []struct {
+		description    string
+		makeTestingTCP func(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP
+		seqNumOffset   seqnum.Size
+		expectAck      bool
+	}{
+		{"OTW", GenerateOTWSeqSegment, 0, false},
+		{"OTW", GenerateOTWSeqSegment, 1, true},
+		{"OTW", GenerateOTWSeqSegment, 2, true},
+		{"ACK", GenerateUnaccACKSegment, 0, false},
+		{"ACK", GenerateUnaccACKSegment, 1, true},
+		{"ACK", GenerateUnaccACKSegment, 2, true},
+	} {
+		t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(listenFd)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close()
+
+			conn.Connect()
+			acceptFd, _ := dut.Accept(listenFd)
+
+			// Send a FIN to DUT to intiate the active close
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)})
+			gotTCP, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if err != nil {
+				t.Fatalf("expected an ACK for our fin and DUT should enter CLOSE_WAIT: %s", err)
+			}
+			windowSize := seqnum.Size(*gotTCP.WindowSize)
+
+			// Send a segment with OTW Seq / unacc ACK and expect an ACK back
+			conn.Send(tt.makeTestingTCP(&conn, tt.seqNumOffset, windowSize), &testbench.Payload{Bytes: []byte("Sample Data")})
+			gotAck, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+			if tt.expectAck && err != nil {
+				t.Fatalf("expected an ack but got none: %s", err)
+			}
+			if !tt.expectAck && gotAck != nil {
+				t.Fatalf("expected no ack but got one: %s", gotAck)
+			}
+
+			// Now let's verify DUT is indeed in CLOSE_WAIT
+			dut.Close(acceptFd)
+			if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil {
+				t.Fatalf("expected DUT to send a FIN: %s", err)
+			}
+			// Ack the FIN from DUT
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+			// Send some extra data to DUT
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: []byte("Sample Data")})
+			if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+				t.Fatalf("expected DUT to send an RST: %s", err)
+			}
+		})
+	}
+}
+
+// This generates an segment with seqnum = RCV.NXT + RCV.WND + seqNumOffset, the
+// generated segment is only acceptable when seqNumOffset is 0, otherwise an ACK
+// is expected from the receiver.
+func GenerateOTWSeqSegment(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
+	lastAcceptable := conn.LocalSeqNum().Add(windowSize)
+	otwSeq := uint32(lastAcceptable.Add(seqNumOffset))
+	return testbench.TCP{SeqNum: testbench.Uint32(otwSeq), Flags: testbench.Uint8(header.TCPFlagAck)}
+}
+
+// This generates an segment with acknum = SND.NXT + seqNumOffset, the generated
+// segment is only acceptable when seqNumOffset is 0, otherwise an ACK is
+// expected from the receiver.
+func GenerateUnaccACKSegment(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
+	lastAcceptable := conn.RemoteSeqNum()
+	unaccAck := uint32(lastAcceptable.Add(seqNumOffset))
+	return testbench.TCP{AckNum: testbench.Uint32(unaccAck), Flags: testbench.Uint8(header.TCPFlagAck)}
+}
diff --git a/test/packetimpact/tests/tcp_cork_mss_test.go b/test/packetimpact/tests/tcp_cork_mss_test.go
new file mode 100644
index 000000000..fb8f48629
--- /dev/null
+++ b/test/packetimpact/tests/tcp_cork_mss_test.go
@@ -0,0 +1,84 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_cork_mss_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestTCPCorkMSS tests for segment coalesce and split as per MSS.
+func TestTCPCorkMSS(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	const mss = uint32(header.TCPDefaultMSS)
+	options := make([]byte, header.TCPOptionMSSLength)
+	header.EncodeMSSOption(mss, options)
+	conn.ConnectWithOptions(options)
+
+	acceptFD, _ := dut.Accept(listenFD)
+	defer dut.Close(acceptFD)
+
+	dut.SetSockOptInt(acceptFD, unix.IPPROTO_TCP, unix.TCP_CORK, 1)
+
+	// Let the dut application send 2 small segments to be held up and coalesced
+	// until the application sends a larger segment to fill up to > MSS.
+	sampleData := []byte("Sample Data")
+	dut.Send(acceptFD, sampleData, 0)
+	dut.Send(acceptFD, sampleData, 0)
+
+	expectedData := sampleData
+	expectedData = append(expectedData, sampleData...)
+	largeData := make([]byte, mss+1)
+	expectedData = append(expectedData, largeData...)
+	dut.Send(acceptFD, largeData, 0)
+
+	// Expect the segments to be coalesced and sent and capped to MSS.
+	expectedPayload := testbench.Payload{Bytes: expectedData[:mss]}
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &expectedPayload, time.Second); err != nil {
+		t.Fatalf("expected payload was not received: %s", err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	// Expect the coalesced segment to be split and transmitted.
+	expectedPayload = testbench.Payload{Bytes: expectedData[mss:]}
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &expectedPayload, time.Second); err != nil {
+		t.Fatalf("expected payload was not received: %s", err)
+	}
+
+	// Check for segments to *not* be held up because of TCP_CORK when
+	// the current send window is less than MSS.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(uint16(2 * len(sampleData)))})
+	dut.Send(acceptFD, sampleData, 0)
+	dut.Send(acceptFD, sampleData, 0)
+	expectedPayload = testbench.Payload{Bytes: append(sampleData, sampleData...)}
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &expectedPayload, time.Second); err != nil {
+		t.Fatalf("expected payload was not received: %s", err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+}
diff --git a/test/packetimpact/tests/tcp_noaccept_close_rst_test.go b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go
new file mode 100644
index 000000000..b9b3e91d3
--- /dev/null
+++ b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go
@@ -0,0 +1,42 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_noaccept_close_rst_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestTcpNoAcceptCloseReset(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	conn.Connect()
+	defer conn.Close()
+	dut.Close(listenFd)
+	if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, 1*time.Second); err != nil {
+		t.Fatalf("expected a RST-ACK packet but got none: %s", err)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_outside_the_window_test.go b/test/packetimpact/tests/tcp_outside_the_window_test.go
new file mode 100644
index 000000000..ad8c74234
--- /dev/null
+++ b/test/packetimpact/tests/tcp_outside_the_window_test.go
@@ -0,0 +1,93 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_outside_the_window_test
+
+import (
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestTCPOutsideTheWindows tests the behavior of the DUT when packets arrive
+// that are inside or outside the TCP window. Packets that are outside the
+// window should force an extra ACK, as described in RFC793 page 69:
+// https://tools.ietf.org/html/rfc793#page-69
+func TestTCPOutsideTheWindow(t *testing.T) {
+	for _, tt := range []struct {
+		description  string
+		tcpFlags     uint8
+		payload      []testbench.Layer
+		seqNumOffset seqnum.Size
+		expectACK    bool
+	}{
+		{"SYN", header.TCPFlagSyn, nil, 0, true},
+		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 0, true},
+		{"ACK", header.TCPFlagAck, nil, 0, false},
+		{"FIN", header.TCPFlagFin, nil, 0, false},
+		{"Data", header.TCPFlagAck, []testbench.Layer{&testbench.Payload{Bytes: []byte("abc123")}}, 0, true},
+
+		{"SYN", header.TCPFlagSyn, nil, 1, true},
+		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 1, true},
+		{"ACK", header.TCPFlagAck, nil, 1, true},
+		{"FIN", header.TCPFlagFin, nil, 1, false},
+		{"Data", header.TCPFlagAck, []testbench.Layer{&testbench.Payload{Bytes: []byte("abc123")}}, 1, true},
+
+		{"SYN", header.TCPFlagSyn, nil, 2, true},
+		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 2, true},
+		{"ACK", header.TCPFlagAck, nil, 2, true},
+		{"FIN", header.TCPFlagFin, nil, 2, false},
+		{"Data", header.TCPFlagAck, []testbench.Layer{&testbench.Payload{Bytes: []byte("abc123")}}, 2, true},
+	} {
+		t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(listenFD)
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close()
+			conn.Connect()
+			acceptFD, _ := dut.Accept(listenFD)
+			defer dut.Close(acceptFD)
+
+			windowSize := seqnum.Size(*conn.SynAck().WindowSize) + tt.seqNumOffset
+			conn.Drain()
+			// Ignore whatever incrementing that this out-of-order packet might cause
+			// to the AckNum.
+			localSeqNum := testbench.Uint32(uint32(*conn.LocalSeqNum()))
+			conn.Send(testbench.TCP{
+				Flags:  testbench.Uint8(tt.tcpFlags),
+				SeqNum: testbench.Uint32(uint32(conn.LocalSeqNum().Add(windowSize))),
+			}, tt.payload...)
+			timeout := 3 * time.Second
+			gotACK, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: localSeqNum}, timeout)
+			if tt.expectACK && err != nil {
+				t.Fatalf("expected an ACK packet within %s but got none: %s", timeout, err)
+			}
+			if !tt.expectACK && gotACK != nil {
+				t.Fatalf("expected no ACK packet within %s but got one: %s", timeout, gotACK)
+			}
+		})
+	}
+}
diff --git a/test/packetimpact/tests/tcp_paws_mechanism_test.go b/test/packetimpact/tests/tcp_paws_mechanism_test.go
new file mode 100644
index 000000000..55db4ece6
--- /dev/null
+++ b/test/packetimpact/tests/tcp_paws_mechanism_test.go
@@ -0,0 +1,109 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_paws_mechanism_test
+
+import (
+	"encoding/hex"
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestPAWSMechanism(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	options := make([]byte, header.TCPOptionTSLength)
+	header.EncodeTSOption(currentTS(), 0, options)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn), Options: options})
+	synAck, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		t.Fatalf("didn't get synack during handshake: %s", err)
+	}
+	parsedSynOpts := header.ParseSynOptions(synAck.Options, true)
+	if !parsedSynOpts.TS {
+		t.Fatalf("expected TSOpt from DUT, options we got:\n%s", hex.Dump(synAck.Options))
+	}
+	tsecr := parsedSynOpts.TSVal
+	header.EncodeTSOption(currentTS(), tsecr, options)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options})
+	acceptFD, _ := dut.Accept(listenFD)
+	defer dut.Close(acceptFD)
+
+	sampleData := []byte("Sample Data")
+	sentTSVal := currentTS()
+	header.EncodeTSOption(sentTSVal, tsecr, options)
+	// 3ms here is chosen arbitrarily to make sure we have increasing timestamps
+	// every time we send one, it should not cause any flakiness because timestamps
+	// only need to be non-decreasing.
+	time.Sleep(3 * time.Millisecond)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}, &testbench.Payload{Bytes: sampleData})
+
+	gotTCP, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		t.Fatalf("expected an ACK but got none: %s", err)
+	}
+
+	parsedOpts := header.ParseTCPOptions(gotTCP.Options)
+	if !parsedOpts.TS {
+		t.Fatalf("expected TS option in response, options we got:\n%s", hex.Dump(gotTCP.Options))
+	}
+	if parsedOpts.TSVal < tsecr {
+		t.Fatalf("TSVal should be non-decreasing, but %d < %d", parsedOpts.TSVal, tsecr)
+	}
+	if parsedOpts.TSEcr != sentTSVal {
+		t.Fatalf("TSEcr should match our sent TSVal, %d != %d", parsedOpts.TSEcr, sentTSVal)
+	}
+	tsecr = parsedOpts.TSVal
+	lastAckNum := gotTCP.AckNum
+
+	badTSVal := sentTSVal - 100
+	header.EncodeTSOption(badTSVal, tsecr, options)
+	// 3ms here is chosen arbitrarily and this time.Sleep() should not cause flakiness
+	// due to the exact same reasoning discussed above.
+	time.Sleep(3 * time.Millisecond)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}, &testbench.Payload{Bytes: sampleData})
+
+	gotTCP, err = conn.Expect(testbench.TCP{AckNum: lastAckNum, Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		t.Fatalf("expected segment with AckNum %d but got none: %s", lastAckNum, err)
+	}
+	parsedOpts = header.ParseTCPOptions(gotTCP.Options)
+	if !parsedOpts.TS {
+		t.Fatalf("expected TS option in response, options we got:\n%s", hex.Dump(gotTCP.Options))
+	}
+	if parsedOpts.TSVal < tsecr {
+		t.Fatalf("TSVal should be non-decreasing, but %d < %d", parsedOpts.TSVal, tsecr)
+	}
+	if parsedOpts.TSEcr != sentTSVal {
+		t.Fatalf("TSEcr should match our sent TSVal, %d != %d", parsedOpts.TSEcr, sentTSVal)
+	}
+}
+
+func currentTS() uint32 {
+	return uint32(time.Now().UnixNano() / 1e6)
+}
diff --git a/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go
new file mode 100644
index 000000000..b640d8673
--- /dev/null
+++ b/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go
@@ -0,0 +1,87 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_queue_receive_in_syn_sent_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/hex"
+	"errors"
+	"flag"
+	"net"
+	"sync"
+	"syscall"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestQueueReceiveInSynSent(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+
+	socket, remotePort := dut.CreateBoundSocket(unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4))
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	sampleData := []byte("Sample Data")
+
+	dut.SetNonBlocking(socket, true)
+	if _, err := dut.ConnectWithErrno(context.Background(), socket, conn.LocalAddr()); !errors.Is(err, syscall.EINPROGRESS) {
+		t.Fatalf("failed to bring DUT to SYN-SENT, got: %s, want EINPROGRESS", err)
+	}
+	if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, time.Second); err != nil {
+		t.Fatalf("expected a SYN from DUT, but got none: %s", err)
+	}
+
+	// Issue RECEIVE call in SYN-SENT, this should be queued for process until the connection
+	// is established.
+	dut.SetNonBlocking(socket, false)
+	var wg sync.WaitGroup
+	defer wg.Wait()
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second*3)
+		defer cancel()
+		n, buff, err := dut.RecvWithErrno(ctx, socket, int32(len(sampleData)), 0)
+		if n == -1 {
+			t.Fatalf("failed to recv on DUT: %s", err)
+		}
+		if got := buff[:n]; !bytes.Equal(got, sampleData) {
+			t.Fatalf("received data don't match, got:\n%s, want:\n%s", hex.Dump(got), hex.Dump(sampleData))
+		}
+	}()
+
+	// The following sleep is used to prevent the connection from being established while the
+	// RPC is in flight.
+	time.Sleep(time.Second)
+
+	// Bring the connection to Established.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)})
+	if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+		t.Fatalf("expected an ACK from DUT, but got none: %s", err)
+	}
+
+	// Send sample data to DUT.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: sampleData})
+}
diff --git a/test/packetimpact/tests/tcp_retransmits_test.go b/test/packetimpact/tests/tcp_retransmits_test.go
new file mode 100644
index 000000000..e51409b66
--- /dev/null
+++ b/test/packetimpact/tests/tcp_retransmits_test.go
@@ -0,0 +1,84 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_retransmits_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestRetransmits tests retransmits occur at exponentially increasing
+// time intervals.
+func TestRetransmits(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFd)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Connect()
+	acceptFd, _ := dut.Accept(listenFd)
+	defer dut.Close(acceptFd)
+
+	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	sampleData := []byte("Sample Data")
+	samplePayload := &testbench.Payload{Bytes: sampleData}
+
+	dut.Send(acceptFd, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	// Give a chance for the dut to estimate RTO with RTT from the DATA-ACK.
+	// TODO(gvisor.dev/issue/2685) Estimate RTO during handshake, after which
+	// we can skip sending this ACK.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+	startRTO := time.Second
+	current := startRTO
+	first := time.Now()
+	dut.Send(acceptFd, sampleData, 0)
+	seq := testbench.Uint32(uint32(*conn.RemoteSeqNum()))
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: seq}, samplePayload, startRTO); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	// Expect retransmits of the same segment.
+	for i := 0; i < 5; i++ {
+		start := time.Now()
+		if _, err := conn.ExpectData(&testbench.TCP{SeqNum: seq}, samplePayload, 2*current); err != nil {
+			t.Fatalf("expected a packet with payload %v: %s loop %d", samplePayload, err, i)
+		}
+		if i == 0 {
+			startRTO = time.Now().Sub(first)
+			current = 2 * startRTO
+			continue
+		}
+		// Check if the probes came at exponentially increasing intervals.
+		if p := time.Since(start); p < current-startRTO {
+			t.Fatalf("retransmit came sooner interval %d probe %d\n", p, i)
+		}
+		current *= 2
+	}
+}
diff --git a/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go b/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go
new file mode 100644
index 000000000..90ab85419
--- /dev/null
+++ b/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_send_window_sizes_piggyback_test
+
+import (
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestSendWindowSizesPiggyback tests cases where segment sizes are close to
+// sender window size and checks for ACK piggybacking for each of those case.
+func TestSendWindowSizesPiggyback(t *testing.T) {
+	sampleData := []byte("Sample Data")
+	segmentSize := uint16(len(sampleData))
+	// Advertise receive window sizes that are lesser, equal to or greater than
+	// enqueued segment size and check for segment transmits. The test attempts
+	// to enqueue a segment on the dut before acknowledging previous segment and
+	// lets the dut piggyback any ACKs along with the enqueued segment.
+	for _, tt := range []struct {
+		description      string
+		windowSize       uint16
+		expectedPayload1 []byte
+		expectedPayload2 []byte
+		enqueue          bool
+	}{
+		// Expect the first segment to be split as it cannot be accomodated in
+		// the sender window. This means we need not enqueue a new segment after
+		// the first segment.
+		{"WindowSmallerThanSegment", segmentSize - 1, sampleData[:(segmentSize - 1)], sampleData[(segmentSize - 1):], false /* enqueue */},
+
+		{"WindowEqualToSegment", segmentSize, sampleData, sampleData, true /* enqueue */},
+
+		// Expect the second segment to not be split as its size is greater than
+		// the available sender window size. The segments should not be split
+		// when there is pending unacknowledged data and the segment-size is
+		// greater than available sender window.
+		{"WindowGreaterThanSegment", segmentSize + 1, sampleData, sampleData, true /* enqueue */},
+	} {
+		t.Run(fmt.Sprintf("%s%d", tt.description, tt.windowSize), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(listenFd)
+
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort, WindowSize: testbench.Uint16(tt.windowSize)}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close()
+
+			conn.Connect()
+			acceptFd, _ := dut.Accept(listenFd)
+			defer dut.Close(acceptFd)
+
+			dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+			expectedTCP := testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}
+
+			dut.Send(acceptFd, sampleData, 0)
+			expectedPayload := testbench.Payload{Bytes: tt.expectedPayload1}
+			if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+
+			// Expect any enqueued segment to be transmitted by the dut along with
+			// piggybacked ACK for our data.
+
+			if tt.enqueue {
+				// Enqueue a segment for the dut to transmit.
+				dut.Send(acceptFd, sampleData, 0)
+			}
+
+			// Send ACK for the previous segment along with data for the dut to
+			// receive and ACK back. Sending this ACK would make room for the dut
+			// to transmit any enqueued segment.
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh), WindowSize: testbench.Uint16(tt.windowSize)}, &testbench.Payload{Bytes: sampleData})
+
+			// Expect the dut to piggyback the ACK for received data along with
+			// the segment enqueued for transmit.
+			expectedPayload = testbench.Payload{Bytes: tt.expectedPayload2}
+			if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+		})
+	}
+}
diff --git a/test/packetimpact/tests/tcp_splitseg_mss_test.go b/test/packetimpact/tests/tcp_splitseg_mss_test.go
new file mode 100644
index 000000000..9350d0988
--- /dev/null
+++ b/test/packetimpact/tests/tcp_splitseg_mss_test.go
@@ -0,0 +1,71 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_splitseg_mss_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestTCPSplitSegMSS lets the dut try to send segments larger than MSS.
+// It tests if the transmitted segments are capped at MSS and are split.
+func TestTCPSplitSegMSS(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	const mss = uint32(header.TCPDefaultMSS)
+	options := make([]byte, header.TCPOptionMSSLength)
+	header.EncodeMSSOption(mss, options)
+	conn.ConnectWithOptions(options)
+
+	acceptFD, _ := dut.Accept(listenFD)
+	defer dut.Close(acceptFD)
+
+	// Let the dut send a segment larger than MSS.
+	largeData := make([]byte, mss+1)
+	for i := 0; i < 2; i++ {
+		dut.Send(acceptFD, largeData, 0)
+		if i == 0 {
+			// On Linux, the initial segment goes out beyond MSS and the segment
+			// split occurs on retransmission. Call ExpectData to wait to
+			// receive the split segment.
+			if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: largeData[:mss]}, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+		} else {
+			if _, err := conn.ExpectNextData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: largeData[:mss]}, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+		}
+		conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+		if _, err := conn.ExpectNextData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &testbench.Payload{Bytes: largeData[mss:]}, time.Second); err != nil {
+			t.Fatalf("expected payload was not received: %s", err)
+		}
+		conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	}
+}
diff --git a/test/packetimpact/tests/tcp_synrcvd_reset_test.go b/test/packetimpact/tests/tcp_synrcvd_reset_test.go
new file mode 100644
index 000000000..7d5deab01
--- /dev/null
+++ b/test/packetimpact/tests/tcp_synrcvd_reset_test.go
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_syn_reset_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestTCPSynRcvdReset tests transition from SYN-RCVD to CLOSED.
+func TestTCPSynRcvdReset(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	// Expect dut connection to have transitioned to SYN-RCVD state.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)})
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+		t.Fatalf("expected SYN-ACK %s", err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)})
+	// Expect the connection to have transitioned SYN-RCVD to CLOSED.
+	// TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a TCP RST %s", err)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_synsent_reset_test.go b/test/packetimpact/tests/tcp_synsent_reset_test.go
new file mode 100644
index 000000000..6898a2239
--- /dev/null
+++ b/test/packetimpact/tests/tcp_synsent_reset_test.go
@@ -0,0 +1,88 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_synsent_reset_test
+
+import (
+	"flag"
+	"net"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	tb.RegisterFlags(flag.CommandLine)
+}
+
+// dutSynSentState sets up the dut connection in SYN-SENT state.
+func dutSynSentState(t *testing.T) (*tb.DUT, *tb.TCPIPv4, uint16, uint16) {
+	dut := tb.NewDUT(t)
+
+	clientFD, clientPort := dut.CreateBoundSocket(unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(tb.RemoteIPv4))
+	port := uint16(9001)
+	conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &port, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &port})
+
+	sa := unix.SockaddrInet4{Port: int(port)}
+	copy(sa.Addr[:], net.IP(net.ParseIP(tb.LocalIPv4)).To4())
+	// Bring the dut to SYN-SENT state with a non-blocking connect.
+	dut.Connect(clientFD, &sa)
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}, nil, time.Second); err != nil {
+		t.Fatalf("expected SYN\n")
+	}
+
+	return &dut, &conn, port, clientPort
+}
+
+// TestTCPSynSentReset tests RFC793, p67: SYN-SENT to CLOSED transition.
+func TestTCPSynSentReset(t *testing.T) {
+	dut, conn, _, _ := dutSynSentState(t)
+	defer conn.Close()
+	defer dut.TearDown()
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst | header.TCPFlagAck)})
+	// Expect the connection to have closed.
+	// TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a TCP RST")
+	}
+}
+
+// TestTCPSynSentRcvdReset tests RFC793, p70, SYN-SENT to SYN-RCVD to CLOSED
+// transitions.
+func TestTCPSynSentRcvdReset(t *testing.T) {
+	dut, c, remotePort, clientPort := dutSynSentState(t)
+	defer dut.TearDown()
+	defer c.Close()
+
+	conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &remotePort, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &remotePort})
+	defer conn.Close()
+	// Initiate new SYN connection with the same port pair
+	// (simultaneous open case), expect the dut connection to move to
+	// SYN-RCVD state
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)})
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+		t.Fatalf("expected SYN-ACK %s\n", err)
+	}
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)})
+	// Expect the connection to have transitioned SYN-RCVD to CLOSED.
+	// TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a TCP RST")
+	}
+}
diff --git a/test/packetimpact/tests/tcp_user_timeout_test.go b/test/packetimpact/tests/tcp_user_timeout_test.go
new file mode 100644
index 000000000..87e45d765
--- /dev/null
+++ b/test/packetimpact/tests/tcp_user_timeout_test.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_user_timeout_test
+
+import (
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func sendPayload(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error {
+	sampleData := make([]byte, 100)
+	for i := range sampleData {
+		sampleData[i] = uint8(i)
+	}
+	conn.Drain()
+	dut.Send(fd, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &testbench.Payload{Bytes: sampleData}, time.Second); err != nil {
+		return fmt.Errorf("expected data but got none: %w", err)
+	}
+	return nil
+}
+
+func sendFIN(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error {
+	dut.Close(fd)
+	return nil
+}
+
+func TestTCPUserTimeout(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		userTimeout time.Duration
+		sendDelay   time.Duration
+	}{
+		{"NoUserTimeout", 0, 3 * time.Second},
+		{"ACKBeforeUserTimeout", 5 * time.Second, 4 * time.Second},
+		{"ACKAfterUserTimeout", 5 * time.Second, 7 * time.Second},
+	} {
+		for _, ttf := range []struct {
+			description string
+			f           func(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error
+		}{
+			{"AfterPayload", sendPayload},
+			{"AfterFIN", sendFIN},
+		} {
+			t.Run(tt.description+ttf.description, func(t *testing.T) {
+				// Create a socket, listen, TCP handshake, and accept.
+				dut := testbench.NewDUT(t)
+				defer dut.TearDown()
+				listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+				defer dut.Close(listenFD)
+				conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+				defer conn.Close()
+				conn.Connect()
+				acceptFD, _ := dut.Accept(listenFD)
+
+				if tt.userTimeout != 0 {
+					dut.SetSockOptInt(acceptFD, unix.SOL_TCP, unix.TCP_USER_TIMEOUT, int32(tt.userTimeout.Milliseconds()))
+				}
+
+				if err := ttf.f(&conn, &dut, acceptFD); err != nil {
+					t.Fatal(err)
+				}
+
+				time.Sleep(tt.sendDelay)
+				conn.Drain()
+				conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+				// If TCP_USER_TIMEOUT was set and the above delay was longer than the
+				// TCP_USER_TIMEOUT then the DUT should send a RST in response to the
+				// testbench's packet.
+				expectRST := tt.userTimeout != 0 && tt.sendDelay > tt.userTimeout
+				expectTimeout := 5 * time.Second
+				got, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, expectTimeout)
+				if expectRST && err != nil {
+					t.Errorf("expected RST packet within %s but got none: %s", expectTimeout, err)
+				}
+				if !expectRST && got != nil {
+					t.Errorf("expected no RST packet within %s but got one: %s", expectTimeout, got)
+				}
+			})
+		}
+	}
+}
diff --git a/test/packetimpact/tests/tcp_window_shrink_test.go b/test/packetimpact/tests/tcp_window_shrink_test.go
new file mode 100644
index 000000000..576ec1a8b
--- /dev/null
+++ b/test/packetimpact/tests/tcp_window_shrink_test.go
@@ -0,0 +1,73 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_window_shrink_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestWindowShrink(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFd)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Connect()
+	acceptFd, _ := dut.Accept(listenFd)
+	defer dut.Close(acceptFd)
+
+	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	sampleData := []byte("Sample Data")
+	samplePayload := &testbench.Payload{Bytes: sampleData}
+
+	dut.Send(acceptFd, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+	dut.Send(acceptFd, sampleData, 0)
+	dut.Send(acceptFd, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	// We close our receiving window here
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
+
+	dut.Send(acceptFd, []byte("Sample Data"), 0)
+	// Note: There is another kind of zero-window probing which Windows uses (by sending one
+	// new byte at `RemoteSeqNum`), if netstack wants to go that way, we may want to change
+	// the following lines.
+	expectedRemoteSeqNum := *conn.RemoteSeqNum() - 1
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: testbench.Uint32(uint32(expectedRemoteSeqNum))}, nil, time.Second); err != nil {
+		t.Fatalf("expected a packet with sequence number %v: %s", expectedRemoteSeqNum, err)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go
new file mode 100644
index 000000000..54cee138f
--- /dev/null
+++ b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_zero_window_probe_retransmit_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestZeroWindowProbeRetransmit tests retransmits of zero window probes
+// to be sent at exponentially inreasing time intervals.
+func TestZeroWindowProbeRetransmit(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFd)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Connect()
+	acceptFd, _ := dut.Accept(listenFd)
+	defer dut.Close(acceptFd)
+
+	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	sampleData := []byte("Sample Data")
+	samplePayload := &testbench.Payload{Bytes: sampleData}
+
+	// Send and receive sample data to the dut.
+	dut.Send(acceptFd, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a packet with sequence number %s", err)
+	}
+
+	// Check for the dut to keep the connection alive as long as the zero window
+	// probes are acknowledged. Check if the zero window probes are sent at
+	// exponentially increasing intervals. The timeout intervals are function
+	// of the recorded first zero probe transmission duration.
+	//
+	// Advertize zero receive window again.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
+	probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1))
+	ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum()))
+
+	startProbeDuration := time.Second
+	current := startProbeDuration
+	first := time.Now()
+	// Ask the dut to send out data.
+	dut.Send(acceptFd, sampleData, 0)
+	// Expect the dut to keep the connection alive as long as the remote is
+	// acknowledging the zero-window probes.
+	for i := 0; i < 5; i++ {
+		start := time.Now()
+		// Expect zero-window probe with a timeout which is a function of the typical
+		// first retransmission time. The retransmission times is supposed to
+		// exponentially increase.
+		if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, 2*current); err != nil {
+			t.Fatalf("expected a probe with sequence number %v: loop %d", probeSeq, i)
+		}
+		if i == 0 {
+			startProbeDuration = time.Now().Sub(first)
+			current = 2 * startProbeDuration
+			continue
+		}
+		// Check if the probes came at exponentially increasing intervals.
+		if p := time.Since(start); p < current-startProbeDuration {
+			t.Fatalf("zero probe came sooner interval %d probe %d\n", p, i)
+		}
+		// Acknowledge the zero-window probes from the dut.
+		conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
+		current *= 2
+	}
+	// Advertize non-zero window.
+	conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)})
+	// Expect the dut to recover and transmit data.
+	if _, err := conn.ExpectData(&testbench.
+		TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_zero_window_probe_test.go b/test/packetimpact/tests/tcp_zero_window_probe_test.go
new file mode 100644
index 000000000..c9b3b7af2
--- /dev/null
+++ b/test/packetimpact/tests/tcp_zero_window_probe_test.go
@@ -0,0 +1,112 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_zero_window_probe_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestZeroWindowProbe tests few cases of zero window probing over the
+// same connection.
+func TestZeroWindowProbe(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFd)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Connect()
+	acceptFd, _ := dut.Accept(listenFd)
+	defer dut.Close(acceptFd)
+
+	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	sampleData := []byte("Sample Data")
+	samplePayload := &testbench.Payload{Bytes: sampleData}
+
+	start := time.Now()
+	// Send and receive sample data to the dut.
+	dut.Send(acceptFd, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	sendTime := time.Now().Sub(start)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a packet with sequence number %s", err)
+	}
+
+	// Test 1: Check for receive of a zero window probe, record the duration for
+	//         probe to be sent.
+	//
+	// Advertize zero window to the dut.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
+
+	// Expected sequence number of the zero window probe.
+	probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1))
+	// Expected ack number of the ACK for the probe.
+	ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum()))
+
+	// Expect there are no zero-window probes sent until there is data to be sent out
+	// from the dut.
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, 2*time.Second); err == nil {
+		t.Fatalf("unexpected a packet with sequence number %v: %s", probeSeq, err)
+	}
+
+	start = time.Now()
+	// Ask the dut to send out data.
+	dut.Send(acceptFd, sampleData, 0)
+	// Expect zero-window probe from the dut.
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil {
+		t.Fatalf("expected a packet with sequence number %v: %s", probeSeq, err)
+	}
+	// Expect the probe to be sent after some time. Compare against the previous
+	// time recorded when the dut immediately sends out data on receiving the
+	// send command.
+	if startProbeDuration := time.Now().Sub(start); startProbeDuration <= sendTime {
+		t.Fatalf("expected the first probe to be sent out after retransmission interval, got %v want > %v\n", startProbeDuration, sendTime)
+	}
+
+	// Test 2: Check if the dut recovers on advertizing non-zero receive window.
+	//         and sends out the sample payload after the send window opens.
+	//
+	// Advertize non-zero window to the dut and ack the zero window probe.
+	conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)})
+	// Expect the dut to recover and transmit data.
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+
+	// Test 3: Sanity check for dut's processing of a similar probe it sent.
+	//         Check if the dut responds as we do for a similar probe sent to it.
+	//         Basically with sequence number to one byte behind the unacknowledged
+	//         sequence number.
+	p := testbench.Uint32(uint32(*conn.LocalSeqNum()))
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), SeqNum: testbench.Uint32(uint32(*conn.LocalSeqNum() - 1))})
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: p}, nil, time.Second); err != nil {
+		t.Fatalf("expected a packet with ack number: %d: %s", p, err)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go b/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go
new file mode 100644
index 000000000..749281d9d
--- /dev/null
+++ b/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go
@@ -0,0 +1,98 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_zero_window_probe_usertimeout_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestZeroWindowProbeUserTimeout sanity tests user timeout when we are
+// retransmitting zero window probes.
+func TestZeroWindowProbeUserTimeout(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFd)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Connect()
+	acceptFd, _ := dut.Accept(listenFd)
+	defer dut.Close(acceptFd)
+
+	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	sampleData := []byte("Sample Data")
+	samplePayload := &testbench.Payload{Bytes: sampleData}
+
+	// Send and receive sample data to the dut.
+	dut.Send(acceptFd, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a packet with sequence number %s", err)
+	}
+
+	// Test 1: Check for receive of a zero window probe, record the duration for
+	//         probe to be sent.
+	//
+	// Advertize zero window to the dut.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
+
+	// Expected sequence number of the zero window probe.
+	probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1))
+	start := time.Now()
+	// Ask the dut to send out data.
+	dut.Send(acceptFd, sampleData, 0)
+	// Expect zero-window probe from the dut.
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil {
+		t.Fatalf("expected a packet with sequence number %v: %s", probeSeq, err)
+	}
+	// Record the duration for first probe, the dut sends the zero window probe after
+	// a retransmission time interval.
+	startProbeDuration := time.Now().Sub(start)
+
+	// Test 2: Check if the dut times out the connection by honoring usertimeout
+	//         when the dut is sending zero-window probes.
+	//
+	// Reduce the retransmit timeout.
+	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_USER_TIMEOUT, int32(startProbeDuration.Milliseconds()))
+	// Advertize zero window again.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
+	// Ask the dut to send out data that would trigger zero window probe retransmissions.
+	dut.Send(acceptFd, sampleData, 0)
+
+	// Wait for the connection to timeout after multiple zero-window probe retransmissions.
+	time.Sleep(8 * startProbeDuration)
+
+	// Expect the connection to have timed out and closed which would cause the dut
+	// to reply with a RST to the ACK we send.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a TCP RST")
+	}
+}
diff --git a/test/packetimpact/tests/udp_icmp_error_propagation_test.go b/test/packetimpact/tests/udp_icmp_error_propagation_test.go
new file mode 100644
index 000000000..aedabf9de
--- /dev/null
+++ b/test/packetimpact/tests/udp_icmp_error_propagation_test.go
@@ -0,0 +1,364 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package udp_icmp_error_propagation_test
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"net"
+	"sync"
+	"syscall"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+type connectionMode bool
+
+func (c connectionMode) String() string {
+	if c {
+		return "Connected"
+	}
+	return "Connectionless"
+}
+
+type icmpError int
+
+const (
+	portUnreachable icmpError = iota
+	timeToLiveExceeded
+)
+
+func (e icmpError) String() string {
+	switch e {
+	case portUnreachable:
+		return "PortUnreachable"
+	case timeToLiveExceeded:
+		return "TimeToLiveExpired"
+	}
+	return "Unknown ICMP error"
+}
+
+func (e icmpError) ToICMPv4() *testbench.ICMPv4 {
+	switch e {
+	case portUnreachable:
+		return &testbench.ICMPv4{Type: testbench.ICMPv4Type(header.ICMPv4DstUnreachable), Code: testbench.Uint8(header.ICMPv4PortUnreachable)}
+	case timeToLiveExceeded:
+		return &testbench.ICMPv4{Type: testbench.ICMPv4Type(header.ICMPv4TimeExceeded), Code: testbench.Uint8(header.ICMPv4TTLExceeded)}
+	}
+	return nil
+}
+
+type errorDetection struct {
+	name         string
+	useValidConn bool
+	f            func(context.Context, testData) error
+}
+
+type testData struct {
+	dut        *testbench.DUT
+	conn       *testbench.UDPIPv4
+	remoteFD   int32
+	remotePort uint16
+	cleanFD    int32
+	cleanPort  uint16
+	wantErrno  syscall.Errno
+}
+
+// wantErrno computes the errno to expect given the connection mode of a UDP
+// socket and the ICMP error it will receive.
+func wantErrno(c connectionMode, icmpErr icmpError) syscall.Errno {
+	if c && icmpErr == portUnreachable {
+		return syscall.Errno(unix.ECONNREFUSED)
+	}
+	return syscall.Errno(0)
+}
+
+// sendICMPError sends an ICMP error message in response to a UDP datagram.
+func sendICMPError(conn *testbench.UDPIPv4, icmpErr icmpError, udp *testbench.UDP) error {
+	if icmpErr == timeToLiveExceeded {
+		ip, ok := udp.Prev().(*testbench.IPv4)
+		if !ok {
+			return fmt.Errorf("expected %s to be IPv4", udp.Prev())
+		}
+		*ip.TTL = 1
+		// Let serialization recalculate the checksum since we set the TTL
+		// to 1.
+		ip.Checksum = nil
+
+		// Note that the ICMP payload is valid in this case because the UDP
+		// payload is empty. If the UDP payload were not empty, the packet
+		// length during serialization may not be calculated correctly,
+		// resulting in a mal-formed packet.
+		conn.SendIP(icmpErr.ToICMPv4(), ip, udp)
+	} else {
+		conn.SendIP(icmpErr.ToICMPv4(), udp.Prev(), udp)
+	}
+	return nil
+}
+
+// testRecv tests observing the ICMP error through the recv syscall. A packet
+// is sent to the DUT, and if wantErrno is non-zero, then the first recv should
+// fail and the second should succeed. Otherwise if wantErrno is zero then the
+// first recv should succeed immediately.
+func testRecv(ctx context.Context, d testData) error {
+	// Check that receiving on the clean socket works.
+	d.conn.Send(testbench.UDP{DstPort: &d.cleanPort})
+	d.dut.Recv(d.cleanFD, 100, 0)
+
+	d.conn.Send(testbench.UDP{})
+
+	if d.wantErrno != syscall.Errno(0) {
+		ctx, cancel := context.WithTimeout(ctx, time.Second)
+		defer cancel()
+		ret, _, err := d.dut.RecvWithErrno(ctx, d.remoteFD, 100, 0)
+		if ret != -1 {
+			return fmt.Errorf("recv after ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", d.wantErrno)
+		}
+		if err != d.wantErrno {
+			return fmt.Errorf("recv after ICMP error resulted in error (%[1]d) %[1]v, expected (%[2]d) %[2]v", err, d.wantErrno)
+		}
+	}
+
+	d.dut.Recv(d.remoteFD, 100, 0)
+	return nil
+}
+
+// testSendTo tests observing the ICMP error through the send syscall. If
+// wantErrno is non-zero, the first send should fail and a subsequent send
+// should suceed; while if wantErrno is zero then the first send should just
+// succeed.
+func testSendTo(ctx context.Context, d testData) error {
+	// Check that sending on the clean socket works.
+	d.dut.SendTo(d.cleanFD, nil, 0, d.conn.LocalAddr())
+	if _, err := d.conn.Expect(testbench.UDP{SrcPort: &d.cleanPort}, time.Second); err != nil {
+		return fmt.Errorf("did not receive UDP packet from clean socket on DUT: %s", err)
+	}
+
+	if d.wantErrno != syscall.Errno(0) {
+		ctx, cancel := context.WithTimeout(ctx, time.Second)
+		defer cancel()
+		ret, err := d.dut.SendToWithErrno(ctx, d.remoteFD, nil, 0, d.conn.LocalAddr())
+
+		if ret != -1 {
+			return fmt.Errorf("sendto after ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", d.wantErrno)
+		}
+		if err != d.wantErrno {
+			return fmt.Errorf("sendto after ICMP error resulted in error (%[1]d) %[1]v, expected (%[2]d) %[2]v", err, d.wantErrno)
+		}
+	}
+
+	d.dut.SendTo(d.remoteFD, nil, 0, d.conn.LocalAddr())
+	if _, err := d.conn.Expect(testbench.UDP{}, time.Second); err != nil {
+		return fmt.Errorf("did not receive UDP packet as expected: %s", err)
+	}
+	return nil
+}
+
+func testSockOpt(_ context.Context, d testData) error {
+	// Check that there's no pending error on the clean socket.
+	if errno := syscall.Errno(d.dut.GetSockOptInt(d.cleanFD, unix.SOL_SOCKET, unix.SO_ERROR)); errno != syscall.Errno(0) {
+		return fmt.Errorf("unexpected error (%[1]d) %[1]v on clean socket", errno)
+	}
+
+	if errno := syscall.Errno(d.dut.GetSockOptInt(d.remoteFD, unix.SOL_SOCKET, unix.SO_ERROR)); errno != d.wantErrno {
+		return fmt.Errorf("SO_ERROR sockopt after ICMP error is (%[1]d) %[1]v, expected (%[2]d) %[2]v", errno, d.wantErrno)
+	}
+
+	// Check that after clearing socket error, sending doesn't fail.
+	d.dut.SendTo(d.remoteFD, nil, 0, d.conn.LocalAddr())
+	if _, err := d.conn.Expect(testbench.UDP{}, time.Second); err != nil {
+		return fmt.Errorf("did not receive UDP packet as expected: %s", err)
+	}
+	return nil
+}
+
+// TestUDPICMPErrorPropagation tests that ICMP error messages in response to
+// UDP datagrams are processed correctly. RFC 1122 section 4.1.3.3 states that:
+// "UDP MUST pass to the application layer all ICMP error messages that it
+// receives from the IP layer."
+//
+// The test cases are parametrized in 3 dimensions: 1. the UDP socket is either
+// put into connection mode or left connectionless, 2. the ICMP message type
+// and code, and 3. the method by which the ICMP error is observed on the
+// socket: sendto, recv, or getsockopt(SO_ERROR).
+//
+// Linux's udp(7) man page states: "All fatal errors will be passed to the user
+// as an error return even when the socket is not connected. This includes
+// asynchronous errors received from the network." In practice, the only
+// combination of parameters to the test that causes an error to be observable
+// on the UDP socket is receiving a port unreachable message on a connected
+// socket.
+func TestUDPICMPErrorPropagation(t *testing.T) {
+	for _, connect := range []connectionMode{true, false} {
+		for _, icmpErr := range []icmpError{portUnreachable, timeToLiveExceeded} {
+			wantErrno := wantErrno(connect, icmpErr)
+
+			for _, errDetect := range []errorDetection{
+				errorDetection{"SendTo", false, testSendTo},
+				// Send to an address that's different from the one that caused an ICMP
+				// error to be returned.
+				errorDetection{"SendToValid", true, testSendTo},
+				errorDetection{"Recv", false, testRecv},
+				errorDetection{"SockOpt", false, testSockOpt},
+			} {
+				t.Run(fmt.Sprintf("%s/%s/%s", connect, icmpErr, errDetect.name), func(t *testing.T) {
+					dut := testbench.NewDUT(t)
+					defer dut.TearDown()
+
+					remoteFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+					defer dut.Close(remoteFD)
+
+					// Create a second, clean socket on the DUT to ensure that the ICMP
+					// error messages only affect the sockets they are intended for.
+					cleanFD, cleanPort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+					defer dut.Close(cleanFD)
+
+					conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
+					defer conn.Close()
+
+					if connect {
+						dut.Connect(remoteFD, conn.LocalAddr())
+						dut.Connect(cleanFD, conn.LocalAddr())
+					}
+
+					dut.SendTo(remoteFD, nil, 0, conn.LocalAddr())
+					udp, err := conn.Expect(testbench.UDP{}, time.Second)
+					if err != nil {
+						t.Fatalf("did not receive message from DUT: %s", err)
+					}
+
+					if err := sendICMPError(&conn, icmpErr, udp); err != nil {
+						t.Fatal(err)
+					}
+
+					errDetectConn := &conn
+					if errDetect.useValidConn {
+						// connClean is a UDP socket on the test runner that was not
+						// involved in the generation of the ICMP error. As such,
+						// interactions between it and the the DUT should be independent of
+						// the ICMP error at least at the port level.
+						connClean := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
+						defer connClean.Close()
+
+						errDetectConn = &connClean
+					}
+
+					if err := errDetect.f(context.Background(), testData{&dut, errDetectConn, remoteFD, remotePort, cleanFD, cleanPort, wantErrno}); err != nil {
+						t.Fatal(err)
+					}
+				})
+			}
+		}
+	}
+}
+
+// TestICMPErrorDuringUDPRecv tests behavior when a UDP socket is in the middle
+// of a blocking recv and receives an ICMP error.
+func TestICMPErrorDuringUDPRecv(t *testing.T) {
+	for _, connect := range []connectionMode{true, false} {
+		for _, icmpErr := range []icmpError{portUnreachable, timeToLiveExceeded} {
+			wantErrno := wantErrno(connect, icmpErr)
+
+			t.Run(fmt.Sprintf("%s/%s", connect, icmpErr), func(t *testing.T) {
+				dut := testbench.NewDUT(t)
+				defer dut.TearDown()
+
+				remoteFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+				defer dut.Close(remoteFD)
+
+				// Create a second, clean socket on the DUT to ensure that the ICMP
+				// error messages only affect the sockets they are intended for.
+				cleanFD, cleanPort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+				defer dut.Close(cleanFD)
+
+				conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
+				defer conn.Close()
+
+				if connect {
+					dut.Connect(remoteFD, conn.LocalAddr())
+					dut.Connect(cleanFD, conn.LocalAddr())
+				}
+
+				dut.SendTo(remoteFD, nil, 0, conn.LocalAddr())
+				udp, err := conn.Expect(testbench.UDP{}, time.Second)
+				if err != nil {
+					t.Fatalf("did not receive message from DUT: %s", err)
+				}
+
+				var wg sync.WaitGroup
+				wg.Add(2)
+				go func() {
+					defer wg.Done()
+
+					if wantErrno != syscall.Errno(0) {
+						ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+						defer cancel()
+
+						ret, _, err := dut.RecvWithErrno(ctx, remoteFD, 100, 0)
+						if ret != -1 {
+							t.Errorf("recv during ICMP error succeeded unexpectedly, expected (%[1]d) %[1]v", wantErrno)
+							return
+						}
+						if err != wantErrno {
+							t.Errorf("recv during ICMP error resulted in error (%[1]d) %[1]v, expected (%[2]d) %[2]v", err, wantErrno)
+							return
+						}
+					}
+
+					ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+					defer cancel()
+
+					if ret, _, err := dut.RecvWithErrno(ctx, remoteFD, 100, 0); ret == -1 {
+						t.Errorf("recv after ICMP error failed with (%[1]d) %[1]", err)
+					}
+				}()
+
+				go func() {
+					defer wg.Done()
+
+					ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+					defer cancel()
+
+					if ret, _, err := dut.RecvWithErrno(ctx, cleanFD, 100, 0); ret == -1 {
+						t.Errorf("recv on clean socket failed with (%[1]d) %[1]", err)
+					}
+				}()
+
+				// TODO(b/155684889) This sleep is to allow time for the DUT to
+				// actually call recv since we want the ICMP error to arrive during the
+				// blocking recv, and should be replaced when a better synchronization
+				// alternative is available.
+				time.Sleep(2 * time.Second)
+
+				if err := sendICMPError(&conn, icmpErr, udp); err != nil {
+					t.Fatal(err)
+				}
+
+				conn.Send(testbench.UDP{DstPort: &cleanPort})
+				conn.Send(testbench.UDP{})
+				wg.Wait()
+			})
+		}
+	}
+}
diff --git a/test/packetimpact/tests/udp_recv_multicast_test.go b/test/packetimpact/tests/udp_recv_multicast_test.go
new file mode 100644
index 000000000..d51a34145
--- /dev/null
+++ b/test/packetimpact/tests/udp_recv_multicast_test.go
@@ -0,0 +1,42 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package udp_recv_multicast_test
+
+import (
+	"flag"
+	"net"
+	"testing"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestUDPRecvMulticast(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+	defer dut.Close(boundFD)
+	conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
+	defer conn.Close()
+	frame := conn.CreateFrame(&testbench.UDP{}, &testbench.Payload{Bytes: []byte("hello world")})
+	frame[1].(*testbench.IPv4).DstAddr = testbench.Address(tcpip.Address(net.ParseIP("224.0.0.1").To4()))
+	conn.SendFrame(frame)
+	dut.Recv(boundFD, 100, 0)
+}
diff --git a/test/packetimpact/tests/udp_send_recv_dgram_test.go b/test/packetimpact/tests/udp_send_recv_dgram_test.go
new file mode 100644
index 000000000..bf64803e2
--- /dev/null
+++ b/test/packetimpact/tests/udp_send_recv_dgram_test.go
@@ -0,0 +1,101 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package udp_send_recv_dgram_test
+
+import (
+	"flag"
+	"math/rand"
+	"net"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func generateRandomPayload(t *testing.T, n int) string {
+	t.Helper()
+	buf := make([]byte, n)
+	if _, err := rand.Read(buf); err != nil {
+		t.Fatalf("rand.Read(buf) failed: %s", err)
+	}
+	return string(buf)
+}
+
+func TestUDPRecv(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+	defer dut.Close(boundFD)
+	conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	testCases := []struct {
+		name    string
+		payload string
+	}{
+		{"emptypayload", ""},
+		{"small payload", "hello world"},
+		{"1kPayload", generateRandomPayload(t, 1<<10)},
+		// Even though UDP allows larger dgrams we don't test it here as
+		// they need to be fragmented and written out as individual
+		// frames.
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			frame := conn.CreateFrame(&testbench.UDP{}, &testbench.Payload{Bytes: []byte(tc.payload)})
+			conn.SendFrame(frame)
+			if got, want := string(dut.Recv(boundFD, int32(len(tc.payload)), 0)), tc.payload; got != want {
+				t.Fatalf("received payload does not match sent payload got: %s, want: %s", got, want)
+			}
+		})
+	}
+}
+
+func TestUDPSend(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
+	defer dut.Close(boundFD)
+	conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	testCases := []struct {
+		name    string
+		payload string
+	}{
+		{"emptypayload", ""},
+		{"small payload", "hello world"},
+		{"1kPayload", generateRandomPayload(t, 1<<10)},
+		// Even though UDP allows larger dgrams we don't test it here as
+		// they need to be fragmented and written out as individual
+		// frames.
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			conn.Drain()
+			if got, want := int(dut.SendTo(boundFD, []byte(tc.payload), 0, conn.LocalAddr())), len(tc.payload); got != want {
+				t.Fatalf("short write got: %d, want: %d", got, want)
+			}
+			if _, err := conn.ExpectData(testbench.UDP{SrcPort: &remotePort}, testbench.Payload{Bytes: []byte(tc.payload)}, 1*time.Second); err != nil {
+				t.Fatal(err)
+			}
+		})
+	}
+}
diff --git a/test/perf/BUILD b/test/perf/BUILD
new file mode 100644
index 000000000..471d8c2ab
--- /dev/null
+++ b/test/perf/BUILD
@@ -0,0 +1,117 @@
+load("//test/runner:defs.bzl", "syscall_test")
+
+package(licenses = ["notice"])
+
+syscall_test(
+    test = "//test/perf/linux:clock_getres_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:clock_gettime_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:death_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:epoll_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:fork_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:futex_benchmark",
+)
+
+syscall_test(
+    size = "enormous",
+    shard_count = 10,
+    tags = ["nogotsan"],
+    test = "//test/perf/linux:getdents_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:getpid_benchmark",
+)
+
+syscall_test(
+    size = "enormous",
+    tags = ["nogotsan"],
+    test = "//test/perf/linux:gettid_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:mapping_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:open_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:pipe_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:randread_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:read_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:sched_yield_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:send_recv_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:seqwrite_benchmark",
+)
+
+syscall_test(
+    size = "enormous",
+    test = "//test/perf/linux:signal_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:sleep_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:stat_benchmark",
+)
+
+syscall_test(
+    size = "enormous",
+    add_overlay = True,
+    test = "//test/perf/linux:unlink_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:write_benchmark",
+)
diff --git a/test/perf/linux/BUILD b/test/perf/linux/BUILD
new file mode 100644
index 000000000..b4e907826
--- /dev/null
+++ b/test/perf/linux/BUILD
@@ -0,0 +1,356 @@
+load("//tools:defs.bzl", "cc_binary", "gbenchmark", "gtest")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+cc_binary(
+    name = "getpid_benchmark",
+    testonly = 1,
+    srcs = [
+        "getpid_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "send_recv_benchmark",
+    testonly = 1,
+    srcs = [
+        "send_recv_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/syscalls/linux:socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_binary(
+    name = "gettid_benchmark",
+    testonly = 1,
+    srcs = [
+        "gettid_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "sched_yield_benchmark",
+    testonly = 1,
+    srcs = [
+        "sched_yield_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "clock_getres_benchmark",
+    testonly = 1,
+    srcs = [
+        "clock_getres_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "clock_gettime_benchmark",
+    testonly = 1,
+    srcs = [
+        "clock_gettime_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "open_benchmark",
+    testonly = 1,
+    srcs = [
+        "open_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "read_benchmark",
+    testonly = 1,
+    srcs = [
+        "read_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "randread_benchmark",
+    testonly = 1,
+    srcs = [
+        "randread_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/random",
+    ],
+)
+
+cc_binary(
+    name = "write_benchmark",
+    testonly = 1,
+    srcs = [
+        "write_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "seqwrite_benchmark",
+    testonly = 1,
+    srcs = [
+        "seqwrite_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/random",
+    ],
+)
+
+cc_binary(
+    name = "pipe_benchmark",
+    testonly = 1,
+    srcs = [
+        "pipe_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
+    name = "fork_benchmark",
+    testonly = 1,
+    srcs = [
+        "fork_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:cleanup",
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_binary(
+    name = "futex_benchmark",
+    testonly = 1,
+    srcs = [
+        "futex_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "epoll_benchmark",
+    testonly = 1,
+    srcs = [
+        "epoll_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:epoll_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "death_benchmark",
+    testonly = 1,
+    srcs = [
+        "death_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "mapping_benchmark",
+    testonly = 1,
+    srcs = [
+        "mapping_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:memory_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "signal_benchmark",
+    testonly = 1,
+    srcs = [
+        "signal_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "getdents_benchmark",
+    testonly = 1,
+    srcs = [
+        "getdents_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "sleep_benchmark",
+    testonly = 1,
+    srcs = [
+        "sleep_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "stat_benchmark",
+    testonly = 1,
+    srcs = [
+        "stat_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_binary(
+    name = "unlink_benchmark",
+    testonly = 1,
+    srcs = [
+        "unlink_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
diff --git a/test/perf/linux/clock_getres_benchmark.cc b/test/perf/linux/clock_getres_benchmark.cc
new file mode 100644
index 000000000..b051293ad
--- /dev/null
+++ b/test/perf/linux/clock_getres_benchmark.cc
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// clock_getres(1) is very nearly a no-op syscall, but it does require copying
+// out to a userspace struct. It thus provides a nice small copy-out benchmark.
+void BM_ClockGetRes(benchmark::State& state) {
+  struct timespec ts;
+  for (auto _ : state) {
+    clock_getres(CLOCK_MONOTONIC, &ts);
+  }
+}
+
+BENCHMARK(BM_ClockGetRes);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/clock_gettime_benchmark.cc b/test/perf/linux/clock_gettime_benchmark.cc
new file mode 100644
index 000000000..6691bebd9
--- /dev/null
+++ b/test/perf/linux/clock_gettime_benchmark.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pthread.h>
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_ClockGettimeThreadCPUTime(benchmark::State& state) {
+  clockid_t clockid;
+  ASSERT_EQ(0, pthread_getcpuclockid(pthread_self(), &clockid));
+  struct timespec tp;
+
+  for (auto _ : state) {
+    clock_gettime(clockid, &tp);
+  }
+}
+
+BENCHMARK(BM_ClockGettimeThreadCPUTime);
+
+void BM_VDSOClockGettime(benchmark::State& state) {
+  const clockid_t clock = state.range(0);
+  struct timespec tp;
+  absl::Time start = absl::Now();
+
+  // Don't benchmark the calibration phase.
+  while (absl::Now() < start + absl::Milliseconds(2100)) {
+    clock_gettime(clock, &tp);
+  }
+
+  for (auto _ : state) {
+    clock_gettime(clock, &tp);
+  }
+}
+
+BENCHMARK(BM_VDSOClockGettime)->Arg(CLOCK_MONOTONIC)->Arg(CLOCK_REALTIME);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/death_benchmark.cc b/test/perf/linux/death_benchmark.cc
new file mode 100644
index 000000000..cb2b6fd07
--- /dev/null
+++ b/test/perf/linux/death_benchmark.cc
@@ -0,0 +1,36 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// DeathTest is not so much a microbenchmark as a macrobenchmark. It is testing
+// the ability of gVisor (on whatever platform) to execute all the related
+// stack-dumping routines associated with EXPECT_EXIT / EXPECT_DEATH.
+TEST(DeathTest, ZeroEqualsOne) {
+  EXPECT_EXIT({ TEST_CHECK(0 == 1); }, ::testing::KilledBySignal(SIGABRT), "");
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/epoll_benchmark.cc b/test/perf/linux/epoll_benchmark.cc
new file mode 100644
index 000000000..0b121338a
--- /dev/null
+++ b/test/perf/linux/epoll_benchmark.cc
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <memory>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+#include "test/util/epoll_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Returns a new eventfd.
+PosixErrorOr<FileDescriptor> NewEventFD() {
+  int fd = eventfd(0, /* flags = */ 0);
+  MaybeSave();
+  if (fd < 0) {
+    return PosixError(errno, "eventfd");
+  }
+  return FileDescriptor(fd);
+}
+
+// Also stolen from epoll.cc unit tests.
+void BM_EpollTimeout(benchmark::State& state) {
+  constexpr int kFDsPerEpoll = 3;
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < kFDsPerEpoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(
+        RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, 0));
+  }
+
+  struct epoll_event result[kFDsPerEpoll];
+  int timeout_ms = state.range(0);
+
+  for (auto _ : state) {
+    EXPECT_EQ(0, epoll_wait(epollfd.get(), result, kFDsPerEpoll, timeout_ms));
+  }
+}
+
+BENCHMARK(BM_EpollTimeout)->Range(0, 8);
+
+// Also stolen from epoll.cc unit tests.
+void BM_EpollAllEvents(benchmark::State& state) {
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+  const int fds_per_epoll = state.range(0);
+  constexpr uint64_t kEventVal = 5;
+
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < fds_per_epoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(
+        RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, 0));
+
+    ASSERT_THAT(WriteFd(eventfds[i].get(), &kEventVal, sizeof(kEventVal)),
+                SyscallSucceedsWithValue(sizeof(kEventVal)));
+  }
+
+  std::vector<struct epoll_event> result(fds_per_epoll);
+
+  for (auto _ : state) {
+    EXPECT_EQ(fds_per_epoll,
+              epoll_wait(epollfd.get(), result.data(), fds_per_epoll, 0));
+  }
+}
+
+BENCHMARK(BM_EpollAllEvents)->Range(2, 1024);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/fork_benchmark.cc b/test/perf/linux/fork_benchmark.cc
new file mode 100644
index 000000000..84fdbc8a0
--- /dev/null
+++ b/test/perf/linux/fork_benchmark.cc
@@ -0,0 +1,350 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/barrier.h"
+#include "benchmark/benchmark.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kBusyMax = 250;
+
+// Do some CPU-bound busy-work.
+int busy(int max) {
+  // Prevent the compiler from optimizing this work away,
+  volatile int count = 0;
+
+  for (int i = 1; i < max; i++) {
+    for (int j = 2; j < i / 2; j++) {
+      if (i % j == 0) {
+        count++;
+      }
+    }
+  }
+
+  return count;
+}
+
+void BM_CPUBoundUniprocess(benchmark::State& state) {
+  for (auto _ : state) {
+    busy(kBusyMax);
+  }
+}
+
+BENCHMARK(BM_CPUBoundUniprocess);
+
+void BM_CPUBoundAsymmetric(benchmark::State& state) {
+  const size_t max = state.max_iterations;
+  pid_t child = fork();
+  if (child == 0) {
+    for (int i = 0; i < max; i++) {
+      busy(kBusyMax);
+    }
+    _exit(0);
+  }
+  ASSERT_THAT(child, SyscallSucceeds());
+  ASSERT_TRUE(state.KeepRunningBatch(max));
+
+  int status;
+  EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+  EXPECT_TRUE(WIFEXITED(status));
+  EXPECT_EQ(0, WEXITSTATUS(status));
+  ASSERT_FALSE(state.KeepRunning());
+}
+
+BENCHMARK(BM_CPUBoundAsymmetric)->UseRealTime();
+
+void BM_CPUBoundSymmetric(benchmark::State& state) {
+  std::vector<pid_t> children;
+  auto child_cleanup = Cleanup([&] {
+    for (const pid_t child : children) {
+      int status;
+      EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+      EXPECT_TRUE(WIFEXITED(status));
+      EXPECT_EQ(0, WEXITSTATUS(status));
+    }
+    ASSERT_FALSE(state.KeepRunning());
+  });
+
+  const int processes = state.range(0);
+  for (int i = 0; i < processes; i++) {
+    size_t cur = (state.max_iterations + (processes - 1)) / processes;
+    if ((state.iterations() + cur) >= state.max_iterations) {
+      cur = state.max_iterations - state.iterations();
+    }
+    pid_t child = fork();
+    if (child == 0) {
+      for (int i = 0; i < cur; i++) {
+        busy(kBusyMax);
+      }
+      _exit(0);
+    }
+    ASSERT_THAT(child, SyscallSucceeds());
+    if (cur > 0) {
+      // We can have a zero cur here, depending.
+      ASSERT_TRUE(state.KeepRunningBatch(cur));
+    }
+    children.push_back(child);
+  }
+}
+
+BENCHMARK(BM_CPUBoundSymmetric)->Range(2, 16)->UseRealTime();
+
+// Child routine for ProcessSwitch/ThreadSwitch.
+// Reads from readfd and writes the result to writefd.
+void SwitchChild(int readfd, int writefd) {
+  while (1) {
+    char buf;
+    int ret = ReadFd(readfd, &buf, 1);
+    if (ret == 0) {
+      break;
+    }
+    TEST_CHECK_MSG(ret == 1, "read failed");
+
+    ret = WriteFd(writefd, &buf, 1);
+    if (ret == -1) {
+      TEST_CHECK_MSG(errno == EPIPE, "unexpected write failure");
+      break;
+    }
+    TEST_CHECK_MSG(ret == 1, "write failed");
+  }
+}
+
+// Send bytes in a loop through a series of pipes, each passing through a
+// different process.
+//
+//  Proc 0        Proc 1
+//    * ----------> *
+//    ^   Pipe 1    |
+//    |             |
+//    | Pipe 0      | Pipe 2
+//    |             |
+//    |             |
+//    |   Pipe 3    v
+//    * <---------- *
+//  Proc 3        Proc 2
+//
+// This exercises context switching through multiple processes.
+void BM_ProcessSwitch(benchmark::State& state) {
+  // Code below assumes there are at least two processes.
+  const int num_processes = state.range(0);
+  ASSERT_GE(num_processes, 2);
+
+  std::vector<pid_t> children;
+  auto child_cleanup = Cleanup([&] {
+    for (const pid_t child : children) {
+      int status;
+      EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+      EXPECT_TRUE(WIFEXITED(status));
+      EXPECT_EQ(0, WEXITSTATUS(status));
+    }
+  });
+
+  // Must come after children, as the FDs must be closed before the children
+  // will exit.
+  std::vector<FileDescriptor> read_fds;
+  std::vector<FileDescriptor> write_fds;
+
+  for (int i = 0; i < num_processes; i++) {
+    int fds[2];
+    ASSERT_THAT(pipe(fds), SyscallSucceeds());
+    read_fds.emplace_back(fds[0]);
+    write_fds.emplace_back(fds[1]);
+  }
+
+  // This process is one of the processes in the loop. It will be considered
+  // index 0.
+  for (int i = 1; i < num_processes; i++) {
+    // Read from current pipe index, write to next.
+    const int read_index = i;
+    const int read_fd = read_fds[read_index].get();
+
+    const int write_index = (i + 1) % num_processes;
+    const int write_fd = write_fds[write_index].get();
+
+    // std::vector isn't safe to use from the fork child.
+    FileDescriptor* read_array = read_fds.data();
+    FileDescriptor* write_array = write_fds.data();
+
+    pid_t child = fork();
+    if (!child) {
+      // Close all other FDs.
+      for (int j = 0; j < num_processes; j++) {
+        if (j != read_index) {
+          read_array[j].reset();
+        }
+        if (j != write_index) {
+          write_array[j].reset();
+        }
+      }
+
+      SwitchChild(read_fd, write_fd);
+      _exit(0);
+    }
+    ASSERT_THAT(child, SyscallSucceeds());
+    children.push_back(child);
+  }
+
+  // Read from current pipe index (0), write to next (1).
+  const int read_index = 0;
+  const int read_fd = read_fds[read_index].get();
+
+  const int write_index = 1;
+  const int write_fd = write_fds[write_index].get();
+
+  // Kick start the loop.
+  char buf = 'a';
+  ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+
+  for (auto _ : state) {
+    ASSERT_THAT(ReadFd(read_fd, &buf, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  }
+}
+
+BENCHMARK(BM_ProcessSwitch)->Range(2, 16)->UseRealTime();
+
+// Equivalent to BM_ThreadSwitch using threads instead of processes.
+void BM_ThreadSwitch(benchmark::State& state) {
+  // Code below assumes there are at least two threads.
+  const int num_threads = state.range(0);
+  ASSERT_GE(num_threads, 2);
+
+  // Must come after threads, as the FDs must be closed before the children
+  // will exit.
+  std::vector<std::unique_ptr<ScopedThread>> threads;
+  std::vector<FileDescriptor> read_fds;
+  std::vector<FileDescriptor> write_fds;
+
+  for (int i = 0; i < num_threads; i++) {
+    int fds[2];
+    ASSERT_THAT(pipe(fds), SyscallSucceeds());
+    read_fds.emplace_back(fds[0]);
+    write_fds.emplace_back(fds[1]);
+  }
+
+  // This thread is one of the threads in the loop. It will be considered
+  // index 0.
+  for (int i = 1; i < num_threads; i++) {
+    // Read from current pipe index, write to next.
+    //
+    // Transfer ownership of the FDs to the thread.
+    const int read_index = i;
+    const int read_fd = read_fds[read_index].release();
+
+    const int write_index = (i + 1) % num_threads;
+    const int write_fd = write_fds[write_index].release();
+
+    threads.emplace_back(std::make_unique<ScopedThread>([read_fd, write_fd] {
+      FileDescriptor read(read_fd);
+      FileDescriptor write(write_fd);
+      SwitchChild(read.get(), write.get());
+    }));
+  }
+
+  // Read from current pipe index (0), write to next (1).
+  const int read_index = 0;
+  const int read_fd = read_fds[read_index].get();
+
+  const int write_index = 1;
+  const int write_fd = write_fds[write_index].get();
+
+  // Kick start the loop.
+  char buf = 'a';
+  ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+
+  for (auto _ : state) {
+    ASSERT_THAT(ReadFd(read_fd, &buf, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  }
+
+  // The two FDs still owned by this thread are closed, causing the next thread
+  // to exit its loop and close its FDs, and so on until all threads exit.
+}
+
+BENCHMARK(BM_ThreadSwitch)->Range(2, 16)->UseRealTime();
+
+void BM_ThreadStart(benchmark::State& state) {
+  const int num_threads = state.range(0);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    auto barrier = new absl::Barrier(num_threads + 1);
+    std::vector<std::unique_ptr<ScopedThread>> threads;
+
+    state.ResumeTiming();
+
+    for (size_t i = 0; i < num_threads; ++i) {
+      threads.emplace_back(std::make_unique<ScopedThread>([barrier] {
+        if (barrier->Block()) {
+          delete barrier;
+        }
+      }));
+    }
+
+    if (barrier->Block()) {
+      delete barrier;
+    }
+
+    state.PauseTiming();
+
+    for (const auto& thread : threads) {
+      thread->Join();
+    }
+
+    state.ResumeTiming();
+  }
+}
+
+BENCHMARK(BM_ThreadStart)->Range(1, 2048)->UseRealTime();
+
+// Benchmark the complete fork + exit + wait.
+void BM_ProcessLifecycle(benchmark::State& state) {
+  const int num_procs = state.range(0);
+
+  std::vector<pid_t> pids(num_procs);
+  for (auto _ : state) {
+    for (size_t i = 0; i < num_procs; ++i) {
+      int pid = fork();
+      if (pid == 0) {
+        _exit(0);
+      }
+      ASSERT_THAT(pid, SyscallSucceeds());
+      pids[i] = pid;
+    }
+
+    for (const int pid : pids) {
+      ASSERT_THAT(RetryEINTR(waitpid)(pid, nullptr, 0),
+                  SyscallSucceedsWithValue(pid));
+    }
+  }
+}
+
+BENCHMARK(BM_ProcessLifecycle)->Range(1, 512)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/futex_benchmark.cc b/test/perf/linux/futex_benchmark.cc
new file mode 100644
index 000000000..241f39896
--- /dev/null
+++ b/test/perf/linux/futex_benchmark.cc
@@ -0,0 +1,198 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/futex.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+inline int FutexWait(std::atomic<int32_t>* v, int32_t val) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, val, nullptr);
+}
+
+inline int FutexWaitMonotonicTimeout(std::atomic<int32_t>* v, int32_t val,
+                                     const struct timespec* timeout) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, val, timeout);
+}
+
+inline int FutexWaitMonotonicDeadline(std::atomic<int32_t>* v, int32_t val,
+                                      const struct timespec* deadline) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE, val, deadline,
+                 nullptr, FUTEX_BITSET_MATCH_ANY);
+}
+
+inline int FutexWaitRealtimeDeadline(std::atomic<int32_t>* v, int32_t val,
+                                     const struct timespec* deadline) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE | FUTEX_CLOCK_REALTIME,
+                 val, deadline, nullptr, FUTEX_BITSET_MATCH_ANY);
+}
+
+inline int FutexWake(std::atomic<int32_t>* v, int32_t count) {
+  return syscall(SYS_futex, v, FUTEX_WAKE_PRIVATE, count);
+}
+
+// This just uses FUTEX_WAKE on an address with nothing waiting, very simple.
+void BM_FutexWakeNop(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+
+  for (auto _ : state) {
+    TEST_PCHECK(FutexWake(&v, 1) == 0);
+  }
+}
+
+BENCHMARK(BM_FutexWakeNop)->MinTime(5);
+
+// This just uses FUTEX_WAIT on an address whose value has changed, i.e., the
+// syscall won't wait.
+void BM_FutexWaitNop(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+
+  for (auto _ : state) {
+    TEST_PCHECK(FutexWait(&v, 1) == -1 && errno == EAGAIN);
+  }
+}
+
+BENCHMARK(BM_FutexWaitNop)->MinTime(5);
+
+// This uses FUTEX_WAIT with a timeout on an address whose value never
+// changes, such that it always times out. Timeout overhead can be estimated by
+// timer overruns for short timeouts.
+void BM_FutexWaitMonotonicTimeout(benchmark::State& state) {
+  const int timeout_ns = state.range(0);
+  std::atomic<int32_t> v(0);
+  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+
+  for (auto _ : state) {
+    TEST_PCHECK(FutexWaitMonotonicTimeout(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
+  }
+}
+
+BENCHMARK(BM_FutexWaitMonotonicTimeout)
+    ->MinTime(5)
+    ->UseRealTime()
+    ->Arg(1)
+    ->Arg(10)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000);
+
+// This uses FUTEX_WAIT_BITSET with a deadline that is in the past. This allows
+// estimation of the overhead of setting up a timer for a deadline (as opposed
+// to a timeout as specified for FUTEX_WAIT).
+void BM_FutexWaitMonotonicDeadline(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+  struct timespec ts = {};
+
+  for (auto _ : state) {
+    TEST_PCHECK(FutexWaitMonotonicDeadline(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
+  }
+}
+
+BENCHMARK(BM_FutexWaitMonotonicDeadline)->MinTime(5);
+
+// This is equivalent to BM_FutexWaitMonotonicDeadline, but uses CLOCK_REALTIME
+// instead of CLOCK_MONOTONIC for the deadline.
+void BM_FutexWaitRealtimeDeadline(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+  struct timespec ts = {};
+
+  for (auto _ : state) {
+    TEST_PCHECK(FutexWaitRealtimeDeadline(&v, 0, &ts) == -1 &&
+                errno == ETIMEDOUT);
+  }
+}
+
+BENCHMARK(BM_FutexWaitRealtimeDeadline)->MinTime(5);
+
+int64_t GetCurrentMonotonicTimeNanos() {
+  struct timespec ts;
+  TEST_CHECK(clock_gettime(CLOCK_MONOTONIC, &ts) != -1);
+  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+void SpinNanos(int64_t delay_ns) {
+  if (delay_ns <= 0) {
+    return;
+  }
+  const int64_t end = GetCurrentMonotonicTimeNanos() + delay_ns;
+  while (GetCurrentMonotonicTimeNanos() < end) {
+    // spin
+  }
+}
+
+// Each iteration of FutexRoundtripDelayed involves a thread sending a futex
+// wakeup to another thread, which spins for delay_us and then sends a futex
+// wakeup back. The time per iteration is 2 * (delay_us + kBeforeWakeDelayNs +
+// futex/scheduling overhead).
+void BM_FutexRoundtripDelayed(benchmark::State& state) {
+  const int delay_us = state.range(0);
+  const int64_t delay_ns = delay_us * 1000;
+  // Spin for an extra kBeforeWakeDelayNs before invoking FUTEX_WAKE to reduce
+  // the probability that the wakeup comes before the wait, preventing the wait
+  // from ever taking effect and causing the benchmark to underestimate the
+  // actual wakeup time.
+  constexpr int64_t kBeforeWakeDelayNs = 500;
+  std::atomic<int32_t> v(0);
+  ScopedThread t([&] {
+    for (int i = 0; i < state.max_iterations; i++) {
+      SpinNanos(delay_ns);
+      while (v.load(std::memory_order_acquire) == 0) {
+        FutexWait(&v, 0);
+      }
+      SpinNanos(kBeforeWakeDelayNs + delay_ns);
+      v.store(0, std::memory_order_release);
+      FutexWake(&v, 1);
+    }
+  });
+  for (auto _ : state) {
+    SpinNanos(kBeforeWakeDelayNs + delay_ns);
+    v.store(1, std::memory_order_release);
+    FutexWake(&v, 1);
+    SpinNanos(delay_ns);
+    while (v.load(std::memory_order_acquire) == 1) {
+      FutexWait(&v, 1);
+    }
+  }
+}
+
+BENCHMARK(BM_FutexRoundtripDelayed)
+    ->MinTime(5)
+    ->UseRealTime()
+    ->Arg(0)
+    ->Arg(10)
+    ->Arg(20)
+    ->Arg(50)
+    ->Arg(100);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/getdents_benchmark.cc b/test/perf/linux/getdents_benchmark.cc
new file mode 100644
index 000000000..d8e81fa8c
--- /dev/null
+++ b/test/perf/linux/getdents_benchmark.cc
@@ -0,0 +1,149 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+#ifndef SYS_getdents64
+#if defined(__x86_64__)
+#define SYS_getdents64 217
+#elif defined(__aarch64__)
+#define SYS_getdents64 217
+#else
+#error "Unknown architecture"
+#endif
+#endif  // SYS_getdents64
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kBufferSize = 65536;
+
+PosixErrorOr<TempPath> CreateDirectory(int count,
+                                       std::vector<std::string>* files) {
+  ASSIGN_OR_RETURN_ERRNO(TempPath dir, TempPath::CreateDir());
+
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor dfd,
+                         Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+  for (int i = 0; i < count; i++) {
+    auto file = NewTempRelPath();
+    auto res = MknodAt(dfd, file, S_IFREG | 0644, 0);
+    RETURN_IF_ERRNO(res);
+    files->push_back(file);
+  }
+
+  return std::move(dir);
+}
+
+PosixError CleanupDirectory(const TempPath& dir,
+                            std::vector<std::string>* files) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor dfd,
+                         Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+  for (auto it = files->begin(); it != files->end(); ++it) {
+    auto res = UnlinkAt(dfd, *it, 0);
+    RETURN_IF_ERRNO(res);
+  }
+  return NoError();
+}
+
+// Creates a directory containing `files` files, and reads all the directory
+// entries from the directory using a single FD.
+void BM_GetdentsSameFD(benchmark::State& state) {
+  // Create directory with given files.
+  const int count = state.range(0);
+
+  // Keep a vector of all of the file TempPaths that is destroyed before dir.
+  //
+  // Normally, we'd simply allow dir to recursively clean up the contained
+  // files, but that recursive cleanup uses getdents, which may be very slow in
+  // extreme benchmarks.
+  TempPath dir;
+  std::vector<std::string> files;
+  dir = ASSERT_NO_ERRNO_AND_VALUE(CreateDirectory(count, &files));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+  char buffer[kBufferSize];
+
+  // We read all directory entries on each iteration, but report this as a
+  // "batch" iteration so that reported times are per file.
+  while (state.KeepRunningBatch(count)) {
+    ASSERT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceeds());
+
+    int ret;
+    do {
+      ASSERT_THAT(ret = syscall(SYS_getdents64, fd.get(), buffer, kBufferSize),
+                  SyscallSucceeds());
+    } while (ret > 0);
+  }
+
+  ASSERT_NO_ERRNO(CleanupDirectory(dir, &files));
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_GetdentsSameFD)->Range(1, 1 << 16)->UseRealTime();
+
+// Creates a directory containing `files` files, and reads all the directory
+// entries from the directory using a new FD each time.
+void BM_GetdentsNewFD(benchmark::State& state) {
+  // Create directory with given files.
+  const int count = state.range(0);
+
+  // Keep a vector of all of the file TempPaths that is destroyed before dir.
+  //
+  // Normally, we'd simply allow dir to recursively clean up the contained
+  // files, but that recursive cleanup uses getdents, which may be very slow in
+  // extreme benchmarks.
+  TempPath dir;
+  std::vector<std::string> files;
+  dir = ASSERT_NO_ERRNO_AND_VALUE(CreateDirectory(count, &files));
+  char buffer[kBufferSize];
+
+  // We read all directory entries on each iteration, but report this as a
+  // "batch" iteration so that reported times are per file.
+  while (state.KeepRunningBatch(count)) {
+    FileDescriptor fd =
+        ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+    int ret;
+    do {
+      ASSERT_THAT(ret = syscall(SYS_getdents64, fd.get(), buffer, kBufferSize),
+                  SyscallSucceeds());
+    } while (ret > 0);
+  }
+
+  ASSERT_NO_ERRNO(CleanupDirectory(dir, &files));
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_GetdentsNewFD)->Range(1, 1 << 12)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/getpid_benchmark.cc b/test/perf/linux/getpid_benchmark.cc
new file mode 100644
index 000000000..db74cb264
--- /dev/null
+++ b/test/perf/linux/getpid_benchmark.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Getpid(benchmark::State& state) {
+  for (auto _ : state) {
+    syscall(SYS_getpid);
+  }
+}
+
+BENCHMARK(BM_Getpid);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/gettid_benchmark.cc b/test/perf/linux/gettid_benchmark.cc
new file mode 100644
index 000000000..8f4961f5e
--- /dev/null
+++ b/test/perf/linux/gettid_benchmark.cc
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Gettid(benchmark::State& state) {
+  for (auto _ : state) {
+    syscall(SYS_gettid);
+  }
+}
+
+BENCHMARK(BM_Gettid)->ThreadRange(1, 4000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/mapping_benchmark.cc b/test/perf/linux/mapping_benchmark.cc
new file mode 100644
index 000000000..39c30fe69
--- /dev/null
+++ b/test/perf/linux/mapping_benchmark.cc
@@ -0,0 +1,163 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Conservative value for /proc/sys/vm/max_map_count, which limits the number of
+// VMAs, minus a safety margin for VMAs that already exist for the test binary.
+// The default value for max_map_count is
+// include/linux/mm.h:DEFAULT_MAX_MAP_COUNT = 65530.
+constexpr size_t kMaxVMAs = 64001;
+
+// Map then unmap pages without touching them.
+void BM_MapUnmap(benchmark::State& state) {
+  // Number of pages to map.
+  const int pages = state.range(0);
+
+  while (state.KeepRunning()) {
+    void* addr = mmap(0, pages * kPageSize, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+    int ret = munmap(addr, pages * kPageSize);
+    TEST_CHECK_MSG(ret == 0, "munmap failed");
+  }
+}
+
+BENCHMARK(BM_MapUnmap)->Range(1, 1 << 17)->UseRealTime();
+
+// Map, touch, then unmap pages.
+void BM_MapTouchUnmap(benchmark::State& state) {
+  // Number of pages to map.
+  const int pages = state.range(0);
+
+  while (state.KeepRunning()) {
+    void* addr = mmap(0, pages * kPageSize, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+    char* c = reinterpret_cast<char*>(addr);
+    char* end = c + pages * kPageSize;
+    while (c < end) {
+      *c = 42;
+      c += kPageSize;
+    }
+
+    int ret = munmap(addr, pages * kPageSize);
+    TEST_CHECK_MSG(ret == 0, "munmap failed");
+  }
+}
+
+BENCHMARK(BM_MapTouchUnmap)->Range(1, 1 << 17)->UseRealTime();
+
+// Map and touch many pages, unmapping all at once.
+//
+// NOTE(b/111429208): This is a regression test to ensure performant mapping and
+// allocation even with tons of mappings.
+void BM_MapTouchMany(benchmark::State& state) {
+  // Number of pages to map.
+  const int page_count = state.range(0);
+
+  while (state.KeepRunning()) {
+    std::vector<void*> pages;
+
+    for (int i = 0; i < page_count; i++) {
+      void* addr = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+      char* c = reinterpret_cast<char*>(addr);
+      *c = 42;
+
+      pages.push_back(addr);
+    }
+
+    for (void* addr : pages) {
+      int ret = munmap(addr, kPageSize);
+      TEST_CHECK_MSG(ret == 0, "munmap failed");
+    }
+  }
+
+  state.SetBytesProcessed(kPageSize * page_count * state.iterations());
+}
+
+BENCHMARK(BM_MapTouchMany)->Range(1, 1 << 12)->UseRealTime();
+
+void BM_PageFault(benchmark::State& state) {
+  // Map the region in which we will take page faults. To ensure that each page
+  // fault maps only a single page, each page we touch must correspond to a
+  // distinct VMA. Thus we need a 1-page gap between each 1-page VMA. However,
+  // each gap consists of a PROT_NONE VMA, instead of an unmapped hole, so that
+  // if there are background threads running, they can't inadvertently creating
+  // mappings in our gaps that are unmapped when the test ends.
+  size_t test_pages = kMaxVMAs;
+  // Ensure that test_pages is odd, since we want the test region to both
+  // begin and end with a mapped page.
+  if (test_pages % 2 == 0) {
+    test_pages--;
+  }
+  const size_t test_region_bytes = test_pages * kPageSize;
+  // Use MAP_SHARED here because madvise(MADV_DONTNEED) on private mappings on
+  // gVisor won't force future sentry page faults (by design). Use MAP_POPULATE
+  // so that Linux pre-allocates the shmem file used to back the mapping.
+  Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+      MmapAnon(test_region_bytes, PROT_READ, MAP_SHARED | MAP_POPULATE));
+  for (size_t i = 0; i < test_pages / 2; i++) {
+    ASSERT_THAT(
+        mprotect(reinterpret_cast<void*>(m.addr() + ((2 * i + 1) * kPageSize)),
+                 kPageSize, PROT_NONE),
+        SyscallSucceeds());
+  }
+
+  const size_t mapped_pages = test_pages / 2 + 1;
+  // "Start" at the end of the mapped region to force the mapped region to be
+  // reset, since we mapped it with MAP_POPULATE.
+  size_t cur_page = mapped_pages;
+  for (auto _ : state) {
+    if (cur_page >= mapped_pages) {
+      // We've reached the end of our mapped region and have to reset it to
+      // incur page faults again.
+      state.PauseTiming();
+      ASSERT_THAT(madvise(m.ptr(), test_region_bytes, MADV_DONTNEED),
+                  SyscallSucceeds());
+      cur_page = 0;
+      state.ResumeTiming();
+    }
+    const uintptr_t addr = m.addr() + (2 * cur_page * kPageSize);
+    const char c = *reinterpret_cast<volatile char*>(addr);
+    benchmark::DoNotOptimize(c);
+    cur_page++;
+  }
+}
+
+BENCHMARK(BM_PageFault)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/open_benchmark.cc b/test/perf/linux/open_benchmark.cc
new file mode 100644
index 000000000..68008f6d5
--- /dev/null
+++ b/test/perf/linux/open_benchmark.cc
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Open(benchmark::State& state) {
+  const int size = state.range(0);
+  std::vector<TempPath> cache;
+  for (int i = 0; i < size; i++) {
+    auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+    cache.emplace_back(std::move(path));
+  }
+
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    const int chosen = rand_r(&seed) % size;
+    int fd = open(cache[chosen].path().c_str(), O_RDONLY);
+    TEST_CHECK(fd != -1);
+    close(fd);
+  }
+}
+
+BENCHMARK(BM_Open)->Range(1, 128)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/pipe_benchmark.cc b/test/perf/linux/pipe_benchmark.cc
new file mode 100644
index 000000000..8f5f6a2a3
--- /dev/null
+++ b/test/perf/linux/pipe_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cerrno>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Pipe(benchmark::State& state) {
+  int fds[2];
+  TEST_CHECK(pipe(fds) == 0);
+
+  const int size = state.range(0);
+  std::vector<char> wbuf(size);
+  std::vector<char> rbuf(size);
+  RandomizeBuffer(wbuf.data(), size);
+
+  ScopedThread t([&] {
+    auto const fd = fds[1];
+    for (int i = 0; i < state.max_iterations; i++) {
+      TEST_CHECK(WriteFd(fd, wbuf.data(), wbuf.size()) == size);
+    }
+  });
+
+  for (auto _ : state) {
+    TEST_CHECK(ReadFd(fds[0], rbuf.data(), rbuf.size()) == size);
+  }
+
+  t.Join();
+
+  close(fds[0]);
+  close(fds[1]);
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Pipe)->Range(1, 1 << 20)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/randread_benchmark.cc b/test/perf/linux/randread_benchmark.cc
new file mode 100644
index 000000000..b0eb8c24e
--- /dev/null
+++ b/test/perf/linux/randread_benchmark.cc
@@ -0,0 +1,100 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Create a 1GB file that will be read from at random positions. This should
+// invalid any performance gains from caching.
+const uint64_t kFileSize = 1ULL << 30;
+
+// How many bytes to write at once to initialize the file used to read from.
+const uint32_t kWriteSize = 65536;
+
+// Largest benchmarked read unit.
+const uint32_t kMaxRead = 1UL << 26;
+
+TempPath CreateFile(uint64_t file_size) {
+  auto path = TempPath::CreateFile().ValueOrDie();
+  FileDescriptor fd = Open(path.path(), O_WRONLY).ValueOrDie();
+
+  // Try to minimize syscalls by using maximum size writev() requests.
+  std::vector<char> buffer(kWriteSize);
+  RandomizeBuffer(buffer.data(), buffer.size());
+  const std::vector<std::vector<struct iovec>> iovecs_list =
+      GenerateIovecs(file_size, buffer.data(), buffer.size());
+  for (const auto& iovecs : iovecs_list) {
+    TEST_CHECK(writev(fd.get(), iovecs.data(), iovecs.size()) >= 0);
+  }
+
+  return path;
+}
+
+// Global test state, initialized once per process lifetime.
+struct GlobalState {
+  const TempPath tmpfile;
+  explicit GlobalState(TempPath tfile) : tmpfile(std::move(tfile)) {}
+};
+
+GlobalState& GetGlobalState() {
+  // This gets created only once throughout the lifetime of the process.
+  // Use a dynamically allocated object (that is never deleted) to avoid order
+  // of destruction of static storage variables issues.
+  static GlobalState* const state =
+      // The actual file size is the maximum random seek range (kFileSize) + the
+      // maximum read size so we can read that number of bytes at the end of the
+      // file.
+      new GlobalState(CreateFile(kFileSize + kMaxRead));
+  return *state;
+}
+
+void BM_RandRead(benchmark::State& state) {
+  const int size = state.range(0);
+
+  GlobalState& global_state = GetGlobalState();
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(global_state.tmpfile.path(), O_RDONLY));
+  std::vector<char> buf(size);
+
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(),
+                       rand_r(&seed) % kFileSize) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_RandRead)->Range(1, kMaxRead)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/read_benchmark.cc b/test/perf/linux/read_benchmark.cc
new file mode 100644
index 000000000..62445867d
--- /dev/null
+++ b/test/perf/linux/read_benchmark.cc
@@ -0,0 +1,53 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Read(benchmark::State& state) {
+  const int size = state.range(0);
+  const std::string contents(size, 0);
+  auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), contents, TempPath::kDefaultFileMode));
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDONLY));
+
+  std::vector<char> buf(size);
+  for (auto _ : state) {
+    TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(), 0) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Read)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/sched_yield_benchmark.cc b/test/perf/linux/sched_yield_benchmark.cc
new file mode 100644
index 000000000..6756b5575
--- /dev/null
+++ b/test/perf/linux/sched_yield_benchmark.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Sched_yield(benchmark::State& state) {
+  for (auto ignored : state) {
+    TEST_CHECK(sched_yield() == 0);
+  }
+}
+
+BENCHMARK(BM_Sched_yield)->ThreadRange(1, 2000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/send_recv_benchmark.cc b/test/perf/linux/send_recv_benchmark.cc
new file mode 100644
index 000000000..d73e49523
--- /dev/null
+++ b/test/perf/linux/send_recv_benchmark.cc
@@ -0,0 +1,372 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "benchmark/benchmark.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr ssize_t kMessageSize = 1024;
+
+class Message {
+ public:
+  explicit Message(int byte = 0) : Message(byte, kMessageSize, 0) {}
+
+  explicit Message(int byte, int sz) : Message(byte, sz, 0) {}
+
+  explicit Message(int byte, int sz, int cmsg_sz)
+      : buffer_(sz, byte), cmsg_buffer_(cmsg_sz, 0) {
+    iov_.iov_base = buffer_.data();
+    iov_.iov_len = sz;
+    hdr_.msg_iov = &iov_;
+    hdr_.msg_iovlen = 1;
+    hdr_.msg_control = cmsg_buffer_.data();
+    hdr_.msg_controllen = cmsg_sz;
+  }
+
+  struct msghdr* header() {
+    return &hdr_;
+  }
+
+ private:
+  std::vector<char> buffer_;
+  std::vector<char> cmsg_buffer_;
+  struct iovec iov_ = {};
+  struct msghdr hdr_ = {};
+};
+
+void BM_Recvmsg(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  Message send_msg('a'), recv_msg;
+
+  ScopedThread t([&send_msg, &send_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendmsg(send_socket.get(), send_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_Recvmsg)->UseRealTime();
+
+void BM_Sendmsg(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  Message send_msg('a'), recv_msg;
+
+  ScopedThread t([&recv_msg, &recv_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  for (auto ignored : state) {
+    int n = sendmsg(send_socket.get(), send_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_sent += n;
+  }
+
+  notification.Notify();
+  send_socket.reset();
+
+  state.SetBytesProcessed(bytes_sent);
+}
+
+BENCHMARK(BM_Sendmsg)->UseRealTime();
+
+void BM_Recvfrom(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  char send_buffer[kMessageSize], recv_buffer[kMessageSize];
+
+  ScopedThread t([&send_socket, &send_buffer, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0);
+    }
+  });
+
+  int bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr,
+                     nullptr);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_Recvfrom)->UseRealTime();
+
+void BM_Sendto(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  char send_buffer[kMessageSize], recv_buffer[kMessageSize];
+
+  ScopedThread t([&recv_socket, &recv_buffer, &notification] {
+    while (!notification.HasBeenNotified()) {
+      recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr,
+               nullptr);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  for (auto ignored : state) {
+    int n = sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0);
+    TEST_CHECK(n > 0);
+    bytes_sent += n;
+  }
+
+  notification.Notify();
+  send_socket.reset();
+
+  state.SetBytesProcessed(bytes_sent);
+}
+
+BENCHMARK(BM_Sendto)->UseRealTime();
+
+PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
+  struct sockaddr_storage addr;
+  memset(&addr, 0, sizeof(addr));
+  addr.ss_family = family;
+  switch (family) {
+    case AF_INET:
+      reinterpret_cast<struct sockaddr_in*>(&addr)->sin_addr.s_addr =
+          htonl(INADDR_LOOPBACK);
+      break;
+    case AF_INET6:
+      reinterpret_cast<struct sockaddr_in6*>(&addr)->sin6_addr =
+          in6addr_loopback;
+      break;
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+  return addr;
+}
+
+// BM_RecvmsgWithControlBuf measures the performance of recvmsg when we allocate
+// space for control messages. Note that we do not expect to receive any.
+void BM_RecvmsgWithControlBuf(benchmark::State& state) {
+  auto listen_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET6));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(listen_socket.get(),
+                   reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the address we're listening on, then connect to it. We need to do this
+  // because we're allowing the stack to pick a port for us.
+  ASSERT_THAT(getsockname(listen_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  auto send_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(
+      RetryEINTR(connect)(send_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+
+  // Accept the connection.
+  auto recv_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+  absl::Notification notification;
+  Message send_msg('a');
+  // Create a msghdr with a buffer allocated for control messages.
+  Message recv_msg(0, kMessageSize, /*cmsg_sz=*/24);
+
+  ScopedThread t([&send_msg, &send_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendmsg(send_socket.get(), send_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_RecvmsgWithControlBuf)->UseRealTime();
+
+// BM_SendmsgTCP measures the sendmsg throughput with varying payload sizes.
+//
+// state.Args[0] indicates whether the underlying socket should be blocking or
+// non-blocking w/ 0 indicating non-blocking and 1 to indicate blocking.
+// state.Args[1] is the size of the payload to be used per sendmsg call.
+void BM_SendmsgTCP(benchmark::State& state) {
+  auto listen_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(listen_socket.get(),
+                   reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the address we're listening on, then connect to it. We need to do this
+  // because we're allowing the stack to pick a port for us.
+  ASSERT_THAT(getsockname(listen_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  auto send_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(
+      RetryEINTR(connect)(send_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+
+  // Accept the connection.
+  auto recv_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+  // Check if we want to run the test w/ a blocking send socket
+  // or non-blocking.
+  const int blocking = state.range(0);
+  if (!blocking) {
+    // Set the send FD to O_NONBLOCK.
+    int opts;
+    ASSERT_THAT(opts = fcntl(send_socket.get(), F_GETFL), SyscallSucceeds());
+    opts |= O_NONBLOCK;
+    ASSERT_THAT(fcntl(send_socket.get(), F_SETFL, opts), SyscallSucceeds());
+  }
+
+  absl::Notification notification;
+
+  // Get the buffer size we should use for this iteration of the test.
+  const int buf_size = state.range(1);
+  Message send_msg('a', buf_size), recv_msg(0, buf_size);
+
+  ScopedThread t([&recv_msg, &recv_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      TEST_CHECK(recvmsg(recv_socket.get(), recv_msg.header(), 0) >= 0);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  int ncalls = 0;
+  for (auto ignored : state) {
+    int sent = 0;
+    while (true) {
+      struct msghdr hdr = {};
+      struct iovec iov = {};
+      struct msghdr* snd_header = send_msg.header();
+      iov.iov_base = static_cast<char*>(snd_header->msg_iov->iov_base) + sent;
+      iov.iov_len = snd_header->msg_iov->iov_len - sent;
+      hdr.msg_iov = &iov;
+      hdr.msg_iovlen = 1;
+      int n = RetryEINTR(sendmsg)(send_socket.get(), &hdr, 0);
+      ncalls++;
+      if (n > 0) {
+        sent += n;
+        if (sent == buf_size) {
+          break;
+        }
+        // n can be > 0 but less than requested size. In which case we don't
+        // poll.
+        continue;
+      }
+      // Poll the fd for it to become writable.
+      struct pollfd poll_fd = {send_socket.get(), POLL_OUT, 0};
+      EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10),
+                  SyscallSucceedsWithValue(0));
+    }
+    bytes_sent += static_cast<int64_t>(sent);
+  }
+
+  notification.Notify();
+  send_socket.reset();
+  state.SetBytesProcessed(bytes_sent);
+}
+
+void Args(benchmark::internal::Benchmark* benchmark) {
+  for (int blocking = 0; blocking < 2; blocking++) {
+    for (int buf_size = 1024; buf_size <= 256 << 20; buf_size *= 2) {
+      benchmark->Args({blocking, buf_size});
+    }
+  }
+}
+
+BENCHMARK(BM_SendmsgTCP)->Apply(&Args)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/seqwrite_benchmark.cc b/test/perf/linux/seqwrite_benchmark.cc
new file mode 100644
index 000000000..af49e4477
--- /dev/null
+++ b/test/perf/linux/seqwrite_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// The maximum file size of the test file, when writes get beyond this point
+// they wrap around. This should be large enough to blow away caches.
+const uint64_t kMaxFile = 1 << 30;
+
+// Perform writes of various sizes sequentially to one file. Wraps around if it
+// goes above a certain maximum file size.
+void BM_SeqWrite(benchmark::State& state) {
+  auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_WRONLY));
+
+  const int size = state.range(0);
+  std::vector<char> buf(size);
+  RandomizeBuffer(buf.data(), buf.size());
+
+  // Start writes at offset 0.
+  uint64_t offset = 0;
+  for (auto _ : state) {
+    TEST_CHECK(PwriteFd(fd.get(), buf.data(), buf.size(), offset) ==
+               buf.size());
+    offset += buf.size();
+    // Wrap around if going above the maximum file size.
+    if (offset >= kMaxFile) {
+      offset = 0;
+    }
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_SeqWrite)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/signal_benchmark.cc b/test/perf/linux/signal_benchmark.cc
new file mode 100644
index 000000000..cec679191
--- /dev/null
+++ b/test/perf/linux/signal_benchmark.cc
@@ -0,0 +1,61 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <string.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void FixupHandler(int sig, siginfo_t* si, void* void_ctx) {
+  static unsigned int dataval = 0;
+
+  // Skip the offending instruction.
+  ucontext_t* ctx = reinterpret_cast<ucontext_t*>(void_ctx);
+  ctx->uc_mcontext.gregs[REG_RAX] = reinterpret_cast<greg_t>(&dataval);
+}
+
+void BM_FaultSignalFixup(benchmark::State& state) {
+  // Set up the signal handler.
+  struct sigaction sa = {};
+  sigemptyset(&sa.sa_mask);
+  sa.sa_sigaction = FixupHandler;
+  sa.sa_flags = SA_SIGINFO;
+  TEST_CHECK(sigaction(SIGSEGV, &sa, nullptr) == 0);
+
+  // Fault, fault, fault.
+  for (auto _ : state) {
+    // Trigger the segfault.
+    asm volatile(
+        "movq $0, %%rax\n"
+        "movq $0x77777777, (%%rax)\n"
+        :
+        :
+        : "rax");
+  }
+}
+
+BENCHMARK(BM_FaultSignalFixup)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/sleep_benchmark.cc b/test/perf/linux/sleep_benchmark.cc
new file mode 100644
index 000000000..99ef05117
--- /dev/null
+++ b/test/perf/linux/sleep_benchmark.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Sleep for 'param' nanoseconds.
+void BM_Sleep(benchmark::State& state) {
+  const int nanoseconds = state.range(0);
+
+  for (auto _ : state) {
+    struct timespec ts;
+    ts.tv_sec = 0;
+    ts.tv_nsec = nanoseconds;
+
+    int ret;
+    do {
+      ret = syscall(SYS_nanosleep, &ts, &ts);
+      if (ret < 0) {
+        TEST_CHECK(errno == EINTR);
+      }
+    } while (ret < 0);
+  }
+}
+
+BENCHMARK(BM_Sleep)
+    ->Arg(0)
+    ->Arg(1)
+    ->Arg(1000)              // 1us
+    ->Arg(1000 * 1000)       // 1ms
+    ->Arg(10 * 1000 * 1000)  // 10ms
+    ->Arg(50 * 1000 * 1000)  // 50ms
+    ->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/stat_benchmark.cc b/test/perf/linux/stat_benchmark.cc
new file mode 100644
index 000000000..f15424482
--- /dev/null
+++ b/test/perf/linux/stat_benchmark.cc
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Creates a file in a nested directory hierarchy at least `depth` directories
+// deep, and stats that file multiple times.
+void BM_Stat(benchmark::State& state) {
+  // Create nested directories with given depth.
+  int depth = state.range(0);
+  const TempPath top_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  std::string dir_path = top_dir.path();
+
+  while (depth-- > 0) {
+    // Don't use TempPath because it will make paths too long to use.
+    //
+    // The top_dir destructor will clean up this whole tree.
+    dir_path = JoinPath(dir_path, absl::StrCat(depth));
+    ASSERT_NO_ERRNO(Mkdir(dir_path, 0755));
+  }
+
+  // Create the file that will be stat'd.
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir_path));
+
+  struct stat st;
+  for (auto _ : state) {
+    ASSERT_THAT(stat(file.path().c_str(), &st), SyscallSucceeds());
+  }
+}
+
+BENCHMARK(BM_Stat)->Range(1, 100)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/unlink_benchmark.cc b/test/perf/linux/unlink_benchmark.cc
new file mode 100644
index 000000000..92243a042
--- /dev/null
+++ b/test/perf/linux/unlink_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Creates a directory containing `files` files, and unlinks all the files.
+void BM_Unlink(benchmark::State& state) {
+  // Create directory with given files.
+  const int file_count = state.range(0);
+
+  // We unlink all files on each iteration, but report this as a "batch"
+  // iteration so that reported times are per file.
+  TempPath dir;
+  while (state.KeepRunningBatch(file_count)) {
+    state.PauseTiming();
+    // N.B. dir is declared outside the loop so that destruction of the previous
+    // iteration's directory occurs here, inside of PauseTiming.
+    dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+    std::vector<TempPath> files;
+    for (int i = 0; i < file_count; i++) {
+      TempPath file =
+          ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+      files.push_back(std::move(file));
+    }
+    state.ResumeTiming();
+
+    while (!files.empty()) {
+      // Destructor unlinks.
+      files.pop_back();
+    }
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_Unlink)->Range(1, 100 * 1000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/write_benchmark.cc b/test/perf/linux/write_benchmark.cc
new file mode 100644
index 000000000..7b060c70e
--- /dev/null
+++ b/test/perf/linux/write_benchmark.cc
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Write(benchmark::State& state) {
+  auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_WRONLY));
+
+  const int size = state.range(0);
+  std::vector<char> buf(size);
+  RandomizeBuffer(buf.data(), size);
+
+  for (auto _ : state) {
+    TEST_CHECK(PwriteFd(fd.get(), buf.data(), size, 0) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Write)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/root/BUILD b/test/root/BUILD
index 23ce2a70f..a9e91ccd6 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -1,4 +1,5 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/vm:defs.bzl", "vm_test")
 
 package(licenses = ["notice"])
 
@@ -16,6 +17,7 @@ go_test(
         "crictl_test.go",
         "main_test.go",
         "oom_score_adj_test.go",
+        "runsc_test.go",
     ],
     data = [
         "//runsc",
@@ -23,21 +25,34 @@ go_test(
     library = ":root",
     tags = [
         # Requires docker and runsc to be configured before the test runs.
-        # Also test only runs as root.
+        # Also, the test needs to be run as root. Note that below, the
+        # root_vm_test relies on the default runtime 'runsc' being installed by
+        # the default installer.
         "manual",
         "local",
     ],
     visibility = ["//:sandbox"],
     deps = [
-        "//runsc/boot",
+        "//pkg/cleanup",
+        "//pkg/test/criutil",
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
         "//runsc/cgroup",
         "//runsc/container",
-        "//runsc/criutil",
-        "//runsc/dockerutil",
         "//runsc/specutils",
-        "//runsc/testutil",
-        "//test/root/testdata",
+        "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
         "@com_github_syndtr_gocapability//capability:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+vm_test(
+    name = "root_vm_test",
+    size = "large",
+    shard_count = 1,
+    targets = [
+        "//tools/installers:shim",
+        ":root_test",
     ],
 )
diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go
index 4038661cb..d0634b5c3 100644
--- a/test/root/cgroup_test.go
+++ b/test/root/cgroup_test.go
@@ -26,9 +26,9 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/cgroup"
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func verifyPid(pid int, path string) error {
@@ -53,68 +53,82 @@ func verifyPid(pid int, path string) error {
 	if scanner.Err() != nil {
 		return scanner.Err()
 	}
-	return fmt.Errorf("got: %s, want: %d", gots, pid)
+	return fmt.Errorf("got: %v, want: %d", gots, pid)
 }
 
-// TestCgroup sets cgroup options and checks that cgroup was properly configured.
 func TestMemCGroup(t *testing.T) {
-	allocMemSize := 128 << 20
-	if err := dockerutil.Pull("python"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("memusage-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// Start a new container and allocate the specified about of memory.
-	args := []string{
-		"--memory=256MB",
-		"python",
-		"python",
-		"-c",
-		fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize),
-	}
-	if err := d.Run(args...); err != nil {
-		t.Fatal("docker create failed:", err)
+	allocMemSize := 128 << 20
+	allocMemLimit := 2 * allocMemSize
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image:  "basic/python",
+		Memory: allocMemLimit / 1024, // Must be in Kb.
+	}, "python", "-c", fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize)); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Extract the ID to lookup the cgroup.
 	gid, err := d.ID()
 	if err != nil {
 		t.Fatalf("Docker.ID() failed: %v", err)
 	}
 	t.Logf("cgroup ID: %s", gid)
 
-	path := filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.usage_in_bytes")
-	memUsage := 0
-
 	// Wait when the container will allocate memory.
+	memUsage := 0
 	start := time.Now()
-	for time.Now().Sub(start) < 30*time.Second {
+	for time.Since(start) < 30*time.Second {
+		// Sleep for a brief period of time after spawning the
+		// container (so that Docker can create the cgroup etc.
+		// or after looping below (so the application can start).
+		time.Sleep(100 * time.Millisecond)
+
+		// Read the cgroup memory limit.
+		path := filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.limit_in_bytes")
 		outRaw, err := ioutil.ReadFile(path)
 		if err != nil {
-			t.Fatalf("failed to read %q: %v", path, err)
+			// It's possible that the container does not exist yet.
+			continue
 		}
 		out := strings.TrimSpace(string(outRaw))
+		memLimit, err := strconv.Atoi(out)
+		if err != nil {
+			t.Fatalf("Atoi(%v): %v", out, err)
+		}
+		if memLimit != allocMemLimit {
+			// The group may not have had the correct limit set yet.
+			continue
+		}
+
+		// Read the cgroup memory usage.
+		path = filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.max_usage_in_bytes")
+		outRaw, err = ioutil.ReadFile(path)
+		if err != nil {
+			t.Fatalf("error reading usage: %v", err)
+		}
+		out = strings.TrimSpace(string(outRaw))
 		memUsage, err = strconv.Atoi(out)
 		if err != nil {
 			t.Fatalf("Atoi(%v): %v", out, err)
 		}
+		t.Logf("read usage: %v, wanted: %v", memUsage, allocMemSize)
 
-		if memUsage > allocMemSize {
+		// Are we done?
+		if memUsage >= allocMemSize {
 			return
 		}
-
-		time.Sleep(100 * time.Millisecond)
 	}
 
-	t.Fatalf("%vMB is less than %vMB: %v", memUsage>>20, allocMemSize>>20)
+	t.Fatalf("%vMB is less than %vMB", memUsage>>20, allocMemSize>>20)
 }
 
 // TestCgroup sets cgroup options and checks that cgroup was properly configured.
 func TestCgroup(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("cgroup-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
 	// This is not a comprehensive list of attributes.
 	//
@@ -179,10 +193,17 @@ func TestCgroup(t *testing.T) {
 			want: "5",
 		},
 		{
-			arg:  "--blkio-weight=750",
-			ctrl: "blkio",
-			file: "blkio.weight",
-			want: "750",
+			arg:            "--blkio-weight=750",
+			ctrl:           "blkio",
+			file:           "blkio.weight",
+			want:           "750",
+			skipIfNotFound: true, // blkio groups may not be available.
+		},
+		{
+			arg:  "--pids-limit=1000",
+			ctrl: "pids",
+			file: "pids.max",
+			want: "1000",
 		},
 	}
 
@@ -191,12 +212,15 @@ func TestCgroup(t *testing.T) {
 		args = append(args, attr.arg)
 	}
 
-	args = append(args, "alpine", "sleep", "10000")
-	if err := d.Run(args...); err != nil {
-		t.Fatal("docker create failed:", err)
+	// Start the container.
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Extra: args, // Cgroup arguments.
+	}, "sleep", "10000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
+	// Lookup the relevant cgroup ID.
 	gid, err := d.ID()
 	if err != nil {
 		t.Fatalf("Docker.ID() failed: %v", err)
@@ -245,17 +269,21 @@ func TestCgroup(t *testing.T) {
 	}
 }
 
+// TestCgroup sets cgroup options and checks that cgroup was properly configured.
 func TestCgroupParent(t *testing.T) {
-	if err := dockerutil.Pull("alpine"); err != nil {
-		t.Fatal("docker pull failed:", err)
-	}
-	d := dockerutil.MakeDocker("cgroup-test")
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
 
-	parent := testutil.RandomName("runsc")
-	if err := d.Run("--cgroup-parent", parent, "alpine", "sleep", "10000"); err != nil {
-		t.Fatal("docker create failed:", err)
+	// Construct a known cgroup name.
+	parent := testutil.RandomID("runsc-")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+		Extra: []string{fmt.Sprintf("--cgroup-parent=%s", parent)},
+	}, "sleep", "10000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
+
+	// Extract the ID to look up the cgroup.
 	gid, err := d.ID()
 	if err != nil {
 		t.Fatalf("Docker.ID() failed: %v", err)
diff --git a/test/root/chroot_test.go b/test/root/chroot_test.go
index be0f63d18..a306132a4 100644
--- a/test/root/chroot_test.go
+++ b/test/root/chroot_test.go
@@ -24,17 +24,20 @@ import (
 	"strings"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
 )
 
 // TestChroot verifies that the sandbox is chroot'd and that mounts are cleaned
 // up after the sandbox is destroyed.
 func TestChroot(t *testing.T) {
-	d := dockerutil.MakeDocker("chroot-test")
-	if err := d.Run("alpine", "sleep", "10000"); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "10000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	pid, err := d.SandboxPid()
 	if err != nil {
@@ -76,11 +79,14 @@ func TestChroot(t *testing.T) {
 }
 
 func TestChrootGofer(t *testing.T) {
-	d := dockerutil.MakeDocker("chroot-test")
-	if err := d.Run("alpine", "sleep", "10000"); err != nil {
+	d := dockerutil.MakeDocker(t)
+	defer d.CleanUp()
+
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: "basic/alpine",
+	}, "sleep", "10000"); err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-	defer d.CleanUp()
 
 	// It's tricky to find gofers. Get sandbox PID first, then find parent. From
 	// parent get all immediate children, remove the sandbox, and everything else
diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go
index 3f90c4c6a..c138e02dc 100644
--- a/test/root/crictl_test.go
+++ b/test/root/crictl_test.go
@@ -16,6 +16,7 @@ package root
 
 import (
 	"bytes"
+	"encoding/json"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -29,16 +30,58 @@ import (
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/criutil"
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
-	"gvisor.dev/gvisor/test/root/testdata"
+	"gvisor.dev/gvisor/pkg/cleanup"
+	"gvisor.dev/gvisor/pkg/test/criutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 // Tests for crictl have to be run as root (rather than in a user namespace)
 // because crictl creates named network namespaces in /var/run/netns/.
 
+// SimpleSpec returns a JSON config for a simple container that runs the
+// specified command in the specified image.
+func SimpleSpec(name, image string, cmd []string, extra map[string]interface{}) string {
+	s := map[string]interface{}{
+		"metadata": map[string]string{
+			"name": name,
+		},
+		"image": map[string]string{
+			"image": testutil.ImageByName(image),
+		},
+		"log_path": fmt.Sprintf("%s.log", name),
+	}
+	if len(cmd) > 0 { // Omit if empty.
+		s["command"] = cmd
+	}
+	for k, v := range extra {
+		s[k] = v // Extra settings.
+	}
+	v, err := json.Marshal(s)
+	if err != nil {
+		// This shouldn't happen.
+		panic(err)
+	}
+	return string(v)
+}
+
+// Sandbox is a default JSON config for a sandbox.
+var Sandbox = `{
+    "metadata": {
+        "name": "default-sandbox",
+        "namespace": "default",
+        "attempt": 1,
+        "uid": "hdishd83djaidwnduwk28bcsb"
+    },
+    "linux": {
+    },
+    "log_directory": "/tmp"
+}
+`
+
+// Httpd is a JSON config for an httpd container.
+var Httpd = SimpleSpec("httpd", "basic/httpd", nil, nil)
+
 // TestCrictlSanity refers to b/112433158.
 func TestCrictlSanity(t *testing.T) {
 	// Setup containerd and crictl.
@@ -47,9 +90,9 @@ func TestCrictlSanity(t *testing.T) {
 		t.Fatalf("failed to setup crictl: %v", err)
 	}
 	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.Httpd)
+	podID, contID, err := crictl.StartPodAndContainer("basic/httpd", Sandbox, Httpd)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("start failed: %v", err)
 	}
 
 	// Look for the httpd page.
@@ -59,10 +102,38 @@ func TestCrictlSanity(t *testing.T) {
 
 	// Stop everything.
 	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
+		t.Fatalf("stop failed: %v", err)
 	}
 }
 
+// HttpdMountPaths is a JSON config for an httpd container with additional
+// mounts.
+var HttpdMountPaths = SimpleSpec("httpd", "basic/httpd", nil, map[string]interface{}{
+	"mounts": []map[string]interface{}{
+		map[string]interface{}{
+			"container_path": "/var/run/secrets/kubernetes.io/serviceaccount",
+			"host_path":      "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx",
+			"readonly":       true,
+		},
+		map[string]interface{}{
+			"container_path": "/etc/hosts",
+			"host_path":      "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts",
+			"readonly":       false,
+		},
+		map[string]interface{}{
+			"container_path": "/dev/termination-log",
+			"host_path":      "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580",
+			"readonly":       false,
+		},
+		map[string]interface{}{
+			"container_path": "/usr/local/apache2/htdocs/test",
+			"host_path":      "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064",
+			"readonly":       true,
+		},
+	},
+	"linux": map[string]interface{}{},
+})
+
 // TestMountPaths refers to b/117635704.
 func TestMountPaths(t *testing.T) {
 	// Setup containerd and crictl.
@@ -71,9 +142,9 @@ func TestMountPaths(t *testing.T) {
 		t.Fatalf("failed to setup crictl: %v", err)
 	}
 	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.HttpdMountPaths)
+	podID, contID, err := crictl.StartPodAndContainer("basic/httpd", Sandbox, HttpdMountPaths)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("start failed: %v", err)
 	}
 
 	// Look for the directory available at /test.
@@ -83,7 +154,7 @@ func TestMountPaths(t *testing.T) {
 
 	// Stop everything.
 	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
+		t.Fatalf("stop failed: %v", err)
 	}
 }
 
@@ -95,14 +166,16 @@ func TestMountOverSymlinks(t *testing.T) {
 		t.Fatalf("failed to setup crictl: %v", err)
 	}
 	defer cleanup()
-	podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, testdata.MountOverSymlink)
+
+	spec := SimpleSpec("busybox", "basic/resolv", []string{"sleep", "1000"}, nil)
+	podID, contID, err := crictl.StartPodAndContainer("basic/resolv", Sandbox, spec)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("start failed: %v", err)
 	}
 
 	out, err := crictl.Exec(contID, "readlink", "/etc/resolv.conf")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("readlink failed: %v, out: %s", err, out)
 	}
 	if want := "/tmp/resolv.conf"; !strings.Contains(string(out), want) {
 		t.Fatalf("/etc/resolv.conf is not pointing to %q: %q", want, string(out))
@@ -110,11 +183,11 @@ func TestMountOverSymlinks(t *testing.T) {
 
 	etc, err := crictl.Exec(contID, "cat", "/etc/resolv.conf")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("cat failed: %v, out: %s", err, etc)
 	}
 	tmp, err := crictl.Exec(contID, "cat", "/tmp/resolv.conf")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("cat failed: %v, out: %s", err, out)
 	}
 	if tmp != etc {
 		t.Fatalf("file content doesn't match:\n\t/etc/resolv.conf: %s\n\t/tmp/resolv.conf: %s", string(etc), string(tmp))
@@ -122,7 +195,7 @@ func TestMountOverSymlinks(t *testing.T) {
 
 	// Stop everything.
 	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
+		t.Fatalf("stop failed: %v", err)
 	}
 }
 
@@ -135,16 +208,16 @@ func TestHomeDir(t *testing.T) {
 		t.Fatalf("failed to setup crictl: %v", err)
 	}
 	defer cleanup()
-	contSpec := testdata.SimpleSpec("root", "k8s.gcr.io/busybox", []string{"sleep", "1000"})
-	podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, contSpec)
+	contSpec := SimpleSpec("root", "basic/busybox", []string{"sleep", "1000"}, nil)
+	podID, contID, err := crictl.StartPodAndContainer("basic/busybox", Sandbox, contSpec)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("start failed: %v", err)
 	}
 
 	t.Run("root container", func(t *testing.T) {
 		out, err := crictl.Exec(contID, "sh", "-c", "echo $HOME")
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("exec failed: %v, out: %s", err, out)
 		}
 		if got, want := strings.TrimSpace(string(out)), "/root"; got != want {
 			t.Fatalf("Home directory invalid. Got %q, Want : %q", got, want)
@@ -153,58 +226,66 @@ func TestHomeDir(t *testing.T) {
 
 	t.Run("sub-container", func(t *testing.T) {
 		// Create a sub container in the same pod.
-		subContSpec := testdata.SimpleSpec("subcontainer", "k8s.gcr.io/busybox", []string{"sleep", "1000"})
-		subContID, err := crictl.StartContainer(podID, "k8s.gcr.io/busybox", testdata.Sandbox, subContSpec)
+		subContSpec := SimpleSpec("subcontainer", "basic/busybox", []string{"sleep", "1000"}, nil)
+		subContID, err := crictl.StartContainer(podID, "basic/busybox", Sandbox, subContSpec)
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("start failed: %v", err)
 		}
 
 		out, err := crictl.Exec(subContID, "sh", "-c", "echo $HOME")
 		if err != nil {
-			t.Fatal(err)
+			t.Fatalf("exec failed: %v, out: %s", err, out)
 		}
 		if got, want := strings.TrimSpace(string(out)), "/root"; got != want {
 			t.Fatalf("Home directory invalid. Got %q, Want: %q", got, want)
 		}
 
 		if err := crictl.StopContainer(subContID); err != nil {
-			t.Fatal(err)
+			t.Fatalf("stop failed: %v", err)
 		}
 	})
 
 	// Stop everything.
 	if err := crictl.StopPodAndContainer(podID, contID); err != nil {
-		t.Fatal(err)
+		t.Fatalf("stop failed: %v", err)
 	}
 
 }
 
+// containerdConfigTemplate is a .toml config for containerd. It contains a
+// formatting verb so the runtime field can be set via fmt.Sprintf.
+const containerdConfigTemplate = `
+disabled_plugins = ["restart"]
+[plugins.linux]
+  runtime = "%s"
+  runtime_root = "/tmp/test-containerd/runsc"
+  shim = "/usr/local/bin/gvisor-containerd-shim"
+  shim_debug = true
+
+[plugins.cri.containerd.runtimes.runsc]
+  runtime_type = "io.containerd.runtime.v1.linux"
+  runtime_engine = "%s"
+`
+
 // setup sets up before a test. Specifically it:
 // * Creates directories and a socket for containerd to utilize.
 // * Runs containerd and waits for it to reach a "ready" state for testing.
 // * Returns a cleanup function that should be called at the end of the test.
 func setup(t *testing.T) (*criutil.Crictl, func(), error) {
-	var cleanups []func()
-	cleanupFunc := func() {
-		for i := len(cleanups) - 1; i >= 0; i-- {
-			cleanups[i]()
-		}
-	}
-	cleanup := specutils.MakeCleanup(cleanupFunc)
-	defer cleanup.Clean()
-
 	// Create temporary containerd root and state directories, and a socket
 	// via which crictl and containerd communicate.
 	containerdRoot, err := ioutil.TempDir(testutil.TmpDir(), "containerd-root")
 	if err != nil {
 		t.Fatalf("failed to create containerd root: %v", err)
 	}
-	cleanups = append(cleanups, func() { os.RemoveAll(containerdRoot) })
+	cu := cleanup.Make(func() { os.RemoveAll(containerdRoot) })
+	defer cu.Clean()
+
 	containerdState, err := ioutil.TempDir(testutil.TmpDir(), "containerd-state")
 	if err != nil {
 		t.Fatalf("failed to create containerd state: %v", err)
 	}
-	cleanups = append(cleanups, func() { os.RemoveAll(containerdState) })
+	cu.Add(func() { os.RemoveAll(containerdState) })
 	sockAddr := filepath.Join(testutil.TmpDir(), "containerd-test.sock")
 
 	// We rewrite a configuration. This is based on the current docker
@@ -213,50 +294,52 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) {
 	if err != nil {
 		t.Fatalf("error discovering runtime path: %v", err)
 	}
-	config, err := testutil.WriteTmpFile("containerd-config", testdata.ContainerdConfig(runtime))
+	config, configCleanup, err := testutil.WriteTmpFile("containerd-config", fmt.Sprintf(containerdConfigTemplate, runtime, runtime))
 	if err != nil {
 		t.Fatalf("failed to write containerd config")
 	}
-	cleanups = append(cleanups, func() { os.RemoveAll(config) })
+	cu.Add(configCleanup)
 
 	// Start containerd.
-	containerd := exec.Command(getContainerd(),
+	cmd := exec.Command(getContainerd(),
 		"--config", config,
 		"--log-level", "debug",
 		"--root", containerdRoot,
 		"--state", containerdState,
 		"--address", sockAddr)
-	cleanups = append(cleanups, func() {
-		if err := testutil.KillCommand(containerd); err != nil {
-			log.Printf("error killing containerd: %v", err)
-		}
+	startupR, startupW := io.Pipe()
+	defer startupR.Close()
+	defer startupW.Close()
+	stderr := &bytes.Buffer{}
+	stdout := &bytes.Buffer{}
+	cmd.Stderr = io.MultiWriter(startupW, stderr)
+	cmd.Stdout = io.MultiWriter(startupW, stdout)
+	cu.Add(func() {
+		// Log output in case of failure.
+		t.Logf("containerd stdout: %s", stdout.String())
+		t.Logf("containerd stderr: %s", stderr.String())
 	})
-	containerdStderr, err := containerd.StderrPipe()
-	if err != nil {
-		t.Fatalf("failed to get containerd stderr: %v", err)
-	}
-	containerdStdout, err := containerd.StdoutPipe()
-	if err != nil {
-		t.Fatalf("failed to get containerd stdout: %v", err)
-	}
-	if err := containerd.Start(); err != nil {
+
+	// Start the process.
+	if err := cmd.Start(); err != nil {
 		t.Fatalf("failed running containerd: %v", err)
 	}
 
-	// Wait for containerd to boot. Then put all containerd output into a
-	// buffer to be logged at the end of the test.
-	testutil.WaitUntilRead(containerdStderr, "Start streaming server", nil, 10*time.Second)
-	stdoutBuf := &bytes.Buffer{}
-	stderrBuf := &bytes.Buffer{}
-	go func() { io.Copy(stdoutBuf, containerdStdout) }()
-	go func() { io.Copy(stderrBuf, containerdStderr) }()
-	cleanups = append(cleanups, func() {
-		t.Logf("containerd stdout: %s", string(stdoutBuf.Bytes()))
-		t.Logf("containerd stderr: %s", string(stderrBuf.Bytes()))
+	// Wait for containerd to boot.
+	if err := testutil.WaitUntilRead(startupR, "Start streaming server", nil, 10*time.Second); err != nil {
+		t.Fatalf("failed to start containerd: %v", err)
+	}
+
+	// Kill must be the last cleanup (as it will be executed first).
+	cc := criutil.NewCrictl(t, sockAddr)
+	cu.Add(func() {
+		cc.CleanUp() // Remove tmp files, etc.
+		if err := testutil.KillCommand(cmd); err != nil {
+			log.Printf("error killing containerd: %v", err)
+		}
 	})
 
-	cleanup.Release()
-	return criutil.NewCrictl(20*time.Second, sockAddr), cleanupFunc, nil
+	return cc, cu.Release(), nil
 }
 
 // httpGet GETs the contents of a file served from a pod on port 80.
diff --git a/test/root/main_test.go b/test/root/main_test.go
index d74dec85f..9fb17e0dd 100644
--- a/test/root/main_test.go
+++ b/test/root/main_test.go
@@ -21,7 +21,7 @@ import (
 	"testing"
 
 	"github.com/syndtr/gocapability/capability"
-	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
diff --git a/test/root/oom_score_adj_test.go b/test/root/oom_score_adj_test.go
index 126f0975a..4243eb59e 100644
--- a/test/root/oom_score_adj_test.go
+++ b/test/root/oom_score_adj_test.go
@@ -20,10 +20,10 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/pkg/cleanup"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 var (
@@ -40,15 +40,6 @@ var (
 // TestOOMScoreAdjSingle tests that oom_score_adj is set properly in a
 // single container sandbox.
 func TestOOMScoreAdjSingle(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	conf := testutil.TestConfig()
-	conf.RootDir = rootDir
-
 	ppid, err := specutils.GetParentPid(os.Getpid())
 	if err != nil {
 		t.Fatalf("getting parent pid: %v", err)
@@ -89,11 +80,11 @@ func TestOOMScoreAdjSingle(t *testing.T) {
 
 	for _, testCase := range testCases {
 		t.Run(testCase.Name, func(t *testing.T) {
-			id := testutil.UniqueContainerID()
+			id := testutil.RandomContainerID()
 			s := testutil.NewSpecWithArgs("sleep", "1000")
 			s.Process.OOMScoreAdj = testCase.OOMScoreAdj
 
-			containers, cleanup, err := startContainers(conf, []*specs.Spec{s}, []string{id})
+			containers, cleanup, err := startContainers(t, []*specs.Spec{s}, []string{id})
 			if err != nil {
 				t.Fatalf("error starting containers: %v", err)
 			}
@@ -131,15 +122,6 @@ func TestOOMScoreAdjSingle(t *testing.T) {
 // TestOOMScoreAdjMulti tests that oom_score_adj is set properly in a
 // multi-container sandbox.
 func TestOOMScoreAdjMulti(t *testing.T) {
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	conf := testutil.TestConfig()
-	conf.RootDir = rootDir
-
 	ppid, err := specutils.GetParentPid(os.Getpid())
 	if err != nil {
 		t.Fatalf("getting parent pid: %v", err)
@@ -257,7 +239,7 @@ func TestOOMScoreAdjMulti(t *testing.T) {
 				}
 			}
 
-			containers, cleanup, err := startContainers(conf, specs, ids)
+			containers, cleanup, err := startContainers(t, specs, ids)
 			if err != nil {
 				t.Fatalf("error starting containers: %v", err)
 			}
@@ -321,7 +303,7 @@ func TestOOMScoreAdjMulti(t *testing.T) {
 func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 	var specs []*specs.Spec
 	var ids []string
-	rootID := testutil.UniqueContainerID()
+	rootID := testutil.RandomContainerID()
 
 	for i, cmd := range cmds {
 		spec := testutil.NewSpecWithArgs(cmd...)
@@ -335,35 +317,34 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 				specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer,
 				specutils.ContainerdSandboxIDAnnotation:     rootID,
 			}
-			ids = append(ids, testutil.UniqueContainerID())
+			ids = append(ids, testutil.RandomContainerID())
 		}
 		specs = append(specs, spec)
 	}
 	return specs, ids
 }
 
-func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) {
-	if len(conf.RootDir) == 0 {
-		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
-	}
-
+func startContainers(t *testing.T, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) {
 	var containers []*container.Container
-	var bundles []string
-	cleanup := func() {
-		for _, c := range containers {
-			c.Destroy()
-		}
-		for _, b := range bundles {
-			os.RemoveAll(b)
-		}
+
+	// All containers must share the same root.
+	rootDir, clean, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
 	}
+	cu := cleanup.Make(clean)
+	defer cu.Clean()
+
+	// Point this to from the configuration.
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
 	for i, spec := range specs {
-		bundleDir, err := testutil.SetupBundleDir(spec)
+		bundleDir, clean, err := testutil.SetupBundleDir(spec)
 		if err != nil {
-			cleanup()
-			return nil, nil, fmt.Errorf("error setting up container: %v", err)
+			return nil, nil, fmt.Errorf("error setting up bundle: %v", err)
 		}
-		bundles = append(bundles, bundleDir)
+		cu.Add(clean)
 
 		args := container.Args{
 			ID:        ids[i],
@@ -372,15 +353,14 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*c
 		}
 		cont, err := container.New(conf, args)
 		if err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error creating container: %v", err)
 		}
 		containers = append(containers, cont)
 
 		if err := cont.Start(conf); err != nil {
-			cleanup()
 			return nil, nil, fmt.Errorf("error starting container: %v", err)
 		}
 	}
-	return containers, cleanup, nil
+
+	return containers, cu.Release(), nil
 }
diff --git a/test/root/runsc_test.go b/test/root/runsc_test.go
new file mode 100644
index 000000000..25204bebb
--- /dev/null
+++ b/test/root/runsc_test.go
@@ -0,0 +1,151 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package root
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// TestDoKill checks that when "runsc do..." is killed, the sandbox process is
+// also terminated. This ensures that parent death signal is propagate to the
+// sandbox process correctly.
+func TestDoKill(t *testing.T) {
+	// Make the sandbox process be reparented here when it's killed, so we can
+	// wait for it.
+	if err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); err != nil {
+		t.Fatalf("prctl(PR_SET_CHILD_SUBREAPER): %v", err)
+	}
+
+	cmd := exec.Command(specutils.ExePath, "do", "sleep", "10000")
+	buf := &bytes.Buffer{}
+	cmd.Stdout = buf
+	cmd.Stderr = buf
+	cmd.Start()
+
+	var pid int
+	findSandbox := func() error {
+		var err error
+		pid, err = sandboxPid(cmd.Process.Pid)
+		if err != nil {
+			return &backoff.PermanentError{Err: err}
+		}
+		if pid == 0 {
+			return fmt.Errorf("sandbox process not found")
+		}
+		return nil
+	}
+	if err := testutil.Poll(findSandbox, 10*time.Second); err != nil {
+		t.Fatalf("failed to find sandbox: %v", err)
+	}
+	t.Logf("Found sandbox, pid: %d", pid)
+
+	if err := cmd.Process.Kill(); err != nil {
+		t.Fatalf("failed to kill run process: %v", err)
+	}
+	cmd.Wait()
+	t.Logf("Parent process killed (%d). Output: %s", cmd.Process.Pid, buf.String())
+
+	ch := make(chan struct{})
+	go func() {
+		defer func() { ch <- struct{}{} }()
+		t.Logf("Waiting for sandbox process (%d) termination", pid)
+		if _, err := unix.Wait4(pid, nil, 0, nil); err != nil {
+			t.Errorf("error waiting for sandbox process (%d): %v", pid, err)
+		}
+	}()
+	select {
+	case <-ch:
+		// Done
+	case <-time.After(5 * time.Second):
+		t.Fatalf("timeout waiting for sandbox process (%d) to exit", pid)
+	}
+}
+
+// sandboxPid looks for the sandbox process inside the process tree starting
+// from "pid". It returns 0 and no error if no sandbox process is found. It
+// returns error if anything failed.
+func sandboxPid(pid int) (int, error) {
+	cmd := exec.Command("pgrep", "-P", strconv.Itoa(pid))
+	buf := &bytes.Buffer{}
+	cmd.Stdout = buf
+	if err := cmd.Start(); err != nil {
+		return 0, err
+	}
+	ps, err := cmd.Process.Wait()
+	if err != nil {
+		return 0, err
+	}
+	if ps.ExitCode() == 1 {
+		// pgrep returns 1 when no process is found.
+		return 0, nil
+	}
+
+	var children []int
+	for _, line := range strings.Split(buf.String(), "\n") {
+		if len(line) == 0 {
+			continue
+		}
+		child, err := strconv.Atoi(line)
+		if err != nil {
+			return 0, err
+		}
+
+		cmdline, err := ioutil.ReadFile(filepath.Join("/proc", line, "cmdline"))
+		if err != nil {
+			if os.IsNotExist(err) {
+				// Raced with process exit.
+				continue
+			}
+			return 0, err
+		}
+		args := strings.SplitN(string(cmdline), "\x00", 2)
+		if len(args) == 0 {
+			return 0, fmt.Errorf("malformed cmdline file: %q", cmdline)
+		}
+		// The sandbox process has the first argument set to "runsc-sandbox".
+		if args[0] == "runsc-sandbox" {
+			return child, nil
+		}
+
+		children = append(children, child)
+	}
+
+	// Sandbox process wasn't found, try another level down.
+	for _, pid := range children {
+		sand, err := sandboxPid(pid)
+		if err != nil {
+			return 0, err
+		}
+		if sand != 0 {
+			return sand, nil
+		}
+		// Not found, continue the search.
+	}
+	return 0, nil
+}
diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD
deleted file mode 100644
index 6859541ad..000000000
--- a/test/root/testdata/BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "testdata",
-    srcs = [
-        "busybox.go",
-        "containerd_config.go",
-        "httpd.go",
-        "httpd_mount_paths.go",
-        "sandbox.go",
-        "simple.go",
-    ],
-    visibility = [
-        "//:sandbox",
-    ],
-)
diff --git a/test/root/testdata/containerd_config.go b/test/root/testdata/containerd_config.go
deleted file mode 100644
index e12f1ec88..000000000
--- a/test/root/testdata/containerd_config.go
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package testdata contains data required for root tests.
-package testdata
-
-import "fmt"
-
-// containerdConfigTemplate is a .toml config for containerd. It contains a
-// formatting verb so the runtime field can be set via fmt.Sprintf.
-const containerdConfigTemplate = `
-disabled_plugins = ["restart"]
-[plugins.linux]
-  runtime = "%s"
-  runtime_root = "/tmp/test-containerd/runsc"
-  shim = "/usr/local/bin/gvisor-containerd-shim"
-  shim_debug = true
-
-[plugins.cri.containerd.runtimes.runsc]
-  runtime_type = "io.containerd.runtime.v1.linux"
-  runtime_engine = "%s"
-`
-
-// ContainerdConfig returns a containerd config file with the specified
-// runtime.
-func ContainerdConfig(runtime string) string {
-	return fmt.Sprintf(containerdConfigTemplate, runtime, runtime)
-}
diff --git a/test/root/testdata/httpd_mount_paths.go b/test/root/testdata/httpd_mount_paths.go
deleted file mode 100644
index ac3f4446a..000000000
--- a/test/root/testdata/httpd_mount_paths.go
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// HttpdMountPaths is a JSON config for an httpd container with additional
-// mounts.
-const HttpdMountPaths = `
-{
-  "metadata": {
-    "name": "httpd"
-  },
-  "image":{
-    "image": "httpd"
-  },
-  "mounts": [
-      {
-        "container_path": "/var/run/secrets/kubernetes.io/serviceaccount",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx",
-        "readonly": true
-      },
-      {
-        "container_path": "/etc/hosts",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts",
-        "readonly": false
-      },
-      {
-        "container_path": "/dev/termination-log",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580",
-        "readonly": false
-      },
-      {
-        "container_path": "/usr/local/apache2/htdocs/test",
-        "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064",
-        "readonly": true
-      }
-  ],
-  "linux": {
-  },
-  "log_path": "httpd.log"
-}
-`
diff --git a/test/root/testdata/sandbox.go b/test/root/testdata/sandbox.go
deleted file mode 100644
index 0db210370..000000000
--- a/test/root/testdata/sandbox.go
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-// Sandbox is a default JSON config for a sandbox.
-const Sandbox = `
-{
-    "metadata": {
-        "name": "default-sandbox",
-        "namespace": "default",
-        "attempt": 1,
-        "uid": "hdishd83djaidwnduwk28bcsb"
-    },
-    "linux": {
-    },
-    "log_directory": "/tmp"
-}
-`
diff --git a/test/root/testdata/simple.go b/test/root/testdata/simple.go
deleted file mode 100644
index 1cca53f0c..000000000
--- a/test/root/testdata/simple.go
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package testdata
-
-import (
-	"encoding/json"
-	"fmt"
-)
-
-// SimpleSpec returns a JSON config for a simple container that runs the
-// specified command in the specified image.
-func SimpleSpec(name, image string, cmd []string) string {
-	cmds, err := json.Marshal(cmd)
-	if err != nil {
-		// This shouldn't happen.
-		panic(err)
-	}
-	return fmt.Sprintf(`
-{
-        "metadata": {
-                "name": %q
-        },
-        "image": {
-                "image": %q
-        },
-        "command": %s
-	}
-`, name, image, cmds)
-}
diff --git a/test/runner/BUILD b/test/runner/BUILD
new file mode 100644
index 000000000..6833c9986
--- /dev/null
+++ b/test/runner/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "runner",
+    testonly = 1,
+    srcs = ["runner.go"],
+    data = [
+        "//runsc",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/log",
+        "//pkg/test/testutil",
+        "//runsc/specutils",
+        "//test/runner/gtest",
+        "//test/uds",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl
new file mode 100644
index 000000000..5a83f8060
--- /dev/null
+++ b/test/runner/defs.bzl
@@ -0,0 +1,238 @@
+"""Defines a rule for syscall test targets."""
+
+load("//tools:defs.bzl", "default_platform", "loopback", "platforms")
+
+def _runner_test_impl(ctx):
+    # Generate a runner binary.
+    runner = ctx.actions.declare_file("%s-runner" % ctx.label.name)
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        "set -euf -x -o pipefail",
+        "if [[ -n \"${TEST_UNDECLARED_OUTPUTS_DIR}\" ]]; then",
+        "  mkdir -p \"${TEST_UNDECLARED_OUTPUTS_DIR}\"",
+        "  chmod a+rwx \"${TEST_UNDECLARED_OUTPUTS_DIR}\"",
+        "fi",
+        "exec %s %s %s\n" % (
+            ctx.files.runner[0].short_path,
+            " ".join(ctx.attr.runner_args),
+            ctx.files.test[0].short_path,
+        ),
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+
+    # Return with all transitive files.
+    runfiles = ctx.runfiles(
+        transitive_files = depset(transitive = [
+            target.data_runfiles.files
+            for target in (ctx.attr.runner, ctx.attr.test)
+            if hasattr(target, "data_runfiles")
+        ]),
+        files = ctx.files.runner + ctx.files.test,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = runner, runfiles = runfiles)]
+
+_runner_test = rule(
+    attrs = {
+        "runner": attr.label(
+            default = "//test/runner:runner",
+        ),
+        "test": attr.label(
+            mandatory = True,
+        ),
+        "runner_args": attr.string_list(),
+        "data": attr.label_list(
+            allow_files = True,
+        ),
+    },
+    test = True,
+    implementation = _runner_test_impl,
+)
+
+def _syscall_test(
+        test,
+        shard_count,
+        size,
+        platform,
+        use_tmpfs,
+        tags,
+        network = "none",
+        file_access = "exclusive",
+        overlay = False,
+        add_uds_tree = False,
+        vfs2 = False):
+    # Prepend "runsc" to non-native platform names.
+    full_platform = platform if platform == "native" else "runsc_" + platform
+
+    # Name the test appropriately.
+    name = test.split(":")[1] + "_" + full_platform
+    if file_access == "shared":
+        name += "_shared"
+    if overlay:
+        name += "_overlay"
+    if vfs2:
+        name += "_vfs2"
+    if network != "none":
+        name += "_" + network + "net"
+
+    # Apply all tags.
+    if tags == None:
+        tags = []
+
+    # Add the full_platform and file access in a tag to make it easier to run
+    # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
+    tags += [full_platform, "file_" + file_access]
+
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+        tags.append("block-network")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
+    runner_args = [
+        # Arguments are passed directly to runner binary.
+        "--platform=" + platform,
+        "--network=" + network,
+        "--use-tmpfs=" + str(use_tmpfs),
+        "--file-access=" + file_access,
+        "--overlay=" + str(overlay),
+        "--add-uds-tree=" + str(add_uds_tree),
+        "--vfs2=" + str(vfs2),
+    ]
+
+    # Call the rule above.
+    _runner_test(
+        name = name,
+        test = test,
+        runner_args = runner_args,
+        data = [loopback],
+        size = size,
+        tags = tags,
+        shard_count = shard_count,
+    )
+
+def syscall_test(
+        test,
+        shard_count = 5,
+        size = "small",
+        use_tmpfs = False,
+        add_overlay = False,
+        add_uds_tree = False,
+        add_hostinet = False,
+        vfs2 = False,
+        tags = None):
+    """syscall_test is a macro that will create targets for all platforms.
+
+    Args:
+      test: the test target.
+      shard_count: shards for defined tests.
+      size: the defined test size.
+      use_tmpfs: use tmpfs in the defined tests.
+      add_overlay: add an overlay test.
+      add_uds_tree: add a UDS test.
+      add_hostinet: add a hostinet test.
+      tags: starting test tags.
+    """
+    if not tags:
+        tags = []
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "native",
+        use_tmpfs = False,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    for (platform, platform_tags) in platforms.items():
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = platform,
+            use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
+            tags = platform_tags + tags,
+        )
+
+    vfs2_tags = list(tags)
+    if vfs2:
+        # Add tag to easily run VFS2 tests with --test_tag_filters=vfs2
+        vfs2_tags.append("vfs2")
+
+    else:
+        # Don't automatically run tests tests not yet passing.
+        vfs2_tags.append("manual")
+        vfs2_tags.append("noguitar")
+        vfs2_tags.append("notap")
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = default_platform,
+        use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
+        tags = platforms[default_platform] + vfs2_tags,
+        vfs2 = True,
+    )
+
+    # TODO(gvisor.dev/issue/1487): Enable VFS2 overlay tests.
+    if add_overlay:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = default_platform,
+            use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
+            add_uds_tree = add_uds_tree,
+            tags = platforms[default_platform] + tags,
+            overlay = True,
+        )
+
+    if add_hostinet:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = default_platform,
+            use_tmpfs = use_tmpfs,
+            network = "host",
+            add_uds_tree = add_uds_tree,
+            tags = platforms[default_platform] + tags,
+        )
+
+    if not use_tmpfs:
+        # Also test shared gofer access.
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = default_platform,
+            use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
+            tags = platforms[default_platform] + tags,
+            file_access = "shared",
+        )
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = default_platform,
+            use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
+            tags = platforms[default_platform] + vfs2_tags,
+            file_access = "shared",
+            vfs2 = True,
+        )
diff --git a/test/syscalls/gtest/BUILD b/test/runner/gtest/BUILD
index de4b2727c..de4b2727c 100644
--- a/test/syscalls/gtest/BUILD
+++ b/test/runner/gtest/BUILD
diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go
new file mode 100644
index 000000000..869169ad5
--- /dev/null
+++ b/test/runner/gtest/gtest.go
@@ -0,0 +1,168 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gtest contains helpers for running google-test tests from Go.
+package gtest
+
+import (
+	"fmt"
+	"os/exec"
+	"strings"
+)
+
+var (
+	// listTestFlag is the flag that will list tests in gtest binaries.
+	listTestFlag = "--gtest_list_tests"
+
+	// filterTestFlag is the flag that will filter tests in gtest binaries.
+	filterTestFlag = "--gtest_filter"
+
+	// listBechmarkFlag is the flag that will list benchmarks in gtest binaries.
+	listBenchmarkFlag = "--benchmark_list_tests"
+
+	// filterBenchmarkFlag is the flag that will run specified benchmarks.
+	filterBenchmarkFlag = "--benchmark_filter"
+)
+
+// TestCase is a single gtest test case.
+type TestCase struct {
+	// Suite is the suite for this test.
+	Suite string
+
+	// Name is the name of this individual test.
+	Name string
+
+	// all indicates that this will run without flags. This takes
+	// precendence over benchmark below.
+	all bool
+
+	// benchmark indicates that this is a benchmark. In this case, the
+	// suite will be empty, and we will use the appropriate test and
+	// benchmark flags.
+	benchmark bool
+}
+
+// FullName returns the name of the test including the suite. It is suitable to
+// pass to "-gtest_filter".
+func (tc TestCase) FullName() string {
+	return fmt.Sprintf("%s.%s", tc.Suite, tc.Name)
+}
+
+// Args returns arguments to be passed when invoking the test.
+func (tc TestCase) Args() []string {
+	if tc.all {
+		return []string{} // No arguments.
+	}
+	if tc.benchmark {
+		return []string{
+			fmt.Sprintf("%s=^%s$", filterBenchmarkFlag, tc.Name),
+			fmt.Sprintf("%s=", filterTestFlag),
+		}
+	}
+	return []string{
+		fmt.Sprintf("%s=%s", filterTestFlag, tc.FullName()),
+	}
+}
+
+// ParseTestCases calls a gtest test binary to list its test and returns a
+// slice with the name and suite of each test.
+//
+// If benchmarks is true, then benchmarks will be included in the list of test
+// cases provided. Note that this requires the binary to support the
+// benchmarks_list_tests flag.
+func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]TestCase, error) {
+	// Run to extract test cases.
+	args := append([]string{listTestFlag}, extraArgs...)
+	cmd := exec.Command(testBin, args...)
+	out, err := cmd.Output()
+	if err != nil {
+		// We failed to list tests with the given flags. Just
+		// return something that will run the binary with no
+		// flags, which should execute all tests.
+		return []TestCase{
+			TestCase{
+				Suite: "Default",
+				Name:  "All",
+				all:   true,
+			},
+		}, nil
+	}
+
+	// Parse test output.
+	var t []TestCase
+	var suite string
+	for _, line := range strings.Split(string(out), "\n") {
+		// Strip comments.
+		line = strings.Split(line, "#")[0]
+
+		// New suite?
+		if !strings.HasPrefix(line, " ") {
+			suite = strings.TrimSuffix(strings.TrimSpace(line), ".")
+			continue
+		}
+
+		// Individual test.
+		name := strings.TrimSpace(line)
+
+		// Do we have a suite yet?
+		if suite == "" {
+			return nil, fmt.Errorf("test without a suite: %v", name)
+		}
+
+		// Add this individual test.
+		t = append(t, TestCase{
+			Suite: suite,
+			Name:  name,
+		})
+	}
+
+	// Finished?
+	if !benchmarks {
+		return t, nil
+	}
+
+	// Run again to extract benchmarks.
+	args = append([]string{listBenchmarkFlag}, extraArgs...)
+	cmd = exec.Command(testBin, args...)
+	out, err = cmd.Output()
+	if err != nil {
+		// We were able to enumerate tests above, but not benchmarks?
+		// We requested them, so we return an error in this case.
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v", err)
+		}
+		return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v\nstderr\n%s", err, exitErr.Stderr)
+	}
+
+	out = []byte(strings.Trim(string(out), "\n"))
+
+	// Parse benchmark output.
+	for _, line := range strings.Split(string(out), "\n") {
+		// Strip comments.
+		line = strings.Split(line, "#")[0]
+
+		// Single benchmark.
+		name := strings.TrimSpace(line)
+
+		// Add the single benchmark.
+		t = append(t, TestCase{
+			Suite:     "Benchmarks",
+			Name:      name,
+			benchmark: true,
+		})
+	}
+
+	return t, nil
+}
diff --git a/test/syscalls/syscall_test_runner.go b/test/runner/runner.go
index ae342b68c..948e3a8ef 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/runner/runner.go
@@ -32,17 +32,13 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
-	"gvisor.dev/gvisor/test/syscalls/gtest"
+	"gvisor.dev/gvisor/test/runner/gtest"
 	"gvisor.dev/gvisor/test/uds"
 )
 
-// Location of syscall tests, relative to the repo root.
-const testDir = "test/syscalls/linux"
-
 var (
-	testName   = flag.String("test-name", "", "name of test binary to run")
 	debug      = flag.Bool("debug", false, "enable debug logs")
 	strace     = flag.Bool("strace", false, "enable strace logs")
 	platform   = flag.String("platform", "ptrace", "platform to run on")
@@ -50,6 +46,7 @@ var (
 	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
 	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
 	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
+	vfs2       = flag.Bool("vfs2", false, "enable VFS2")
 	parallel   = flag.Bool("parallel", false, "run tests in parallel")
 	runscPath  = flag.String("runsc", "", "path to runsc binary")
 
@@ -103,7 +100,7 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
 		env = append(env, "TEST_UDS_ATTACH_TREE="+socketDir)
 	}
 
-	cmd := exec.Command(testBin, gtest.FilterTestFlag+"="+tc.FullName())
+	cmd := exec.Command(testBin, tc.Args()...)
 	cmd.Env = env
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -119,20 +116,20 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
 //
 // Returns an error if the sandboxed application exits non-zero.
 func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
-	bundleDir, err := testutil.SetupBundleDir(spec)
+	bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 	if err != nil {
 		return fmt.Errorf("SetupBundleDir failed: %v", err)
 	}
-	defer os.RemoveAll(bundleDir)
+	defer cleanup()
 
-	rootDir, err := testutil.SetupRootDir()
+	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
 		return fmt.Errorf("SetupRootDir failed: %v", err)
 	}
-	defer os.RemoveAll(rootDir)
+	defer cleanup()
 
 	name := tc.FullName()
-	id := testutil.UniqueContainerID()
+	id := testutil.RandomContainerID()
 	log.Infof("Running test %q in container %q", name, id)
 	specutils.LogSpec(spec)
 
@@ -150,6 +147,9 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 	if *overlay {
 		args = append(args, "-overlay")
 	}
+	if *vfs2 {
+		args = append(args, "-vfs2")
+	}
 	if *debug {
 		args = append(args, "-debug", "-log-packets=true")
 	}
@@ -208,7 +208,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 			return
 		}
 		log.Warningf("%s: Got signal: %v", name, s)
-		done := make(chan bool)
+		done := make(chan bool, 1)
 		dArgs := append([]string{}, args...)
 		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
 		go func(dArgs []string) {
@@ -296,7 +296,7 @@ func setupUDSTree(spec *specs.Spec) (cleanup func(), err error) {
 func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 	// Run a new container with the test executable and filter for the
 	// given test suite and name.
-	spec := testutil.NewSpecWithArgs(testBin, gtest.FilterTestFlag+"="+tc.FullName())
+	spec := testutil.NewSpecWithArgs(append([]string{testBin}, tc.Args()...)...)
 
 	// Mark the root as writeable, as some tests attempt to
 	// write to the rootfs, and expect EACCES, not EROFS.
@@ -304,6 +304,7 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 
 	// Test spec comes with pre-defined mounts that we don't want. Reset it.
 	spec.Mounts = nil
+	testTmpDir := "/tmp"
 	if *useTmpfs {
 		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
 		// features only available in gVisor's internal tmpfs may fail.
@@ -329,18 +330,28 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 			t.Fatalf("could not chmod temp dir: %v", err)
 		}
 
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: "/tmp",
-			Source:      tmpDir,
-		})
+		// "/tmp" is not replaced with a tmpfs mount inside the sandbox
+		// when it's not empty. This ensures that testTmpDir uses gofer
+		// in exclusive mode.
+		testTmpDir = tmpDir
+		if *fileAccess == "shared" {
+			// All external mounts except the root mount are shared.
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Type:        "bind",
+				Destination: "/tmp",
+				Source:      tmpDir,
+			})
+			testTmpDir = "/tmp"
+		}
 	}
 
-	// Set environment variables that indicate we are
-	// running in gVisor with the given platform and network.
+	// Set environment variables that indicate we are running in gVisor with
+	// the given platform, network, and filesystem stack.
+	// TODO(gvisor.dev/issue/1487): Update this when the runner supports VFS2.
 	platformVar := "TEST_ON_GVISOR"
 	networkVar := "GVISOR_NETWORK"
-	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
+	vfsVar := "GVISOR_VFS"
+	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network, vfsVar+"=VFS1")
 
 	// Remove env variables that cause the gunit binary to write output
 	// files, since they will stomp on eachother, and on the output files
@@ -353,12 +364,8 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 
 	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
 	// be backed by tmpfs.
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = "TEST_TMPDIR=/tmp"
-			break
-		}
-	}
+	env = filterEnv(env, []string{"TEST_TMPDIR"})
+	env = append(env, fmt.Sprintf("TEST_TMPDIR=%s", testTmpDir))
 
 	spec.Process.Env = env
 
@@ -404,9 +411,10 @@ func matchString(a, b string) (bool, error) {
 
 func main() {
 	flag.Parse()
-	if *testName == "" {
-		fatalf("test-name flag must be provided")
+	if flag.NArg() != 1 {
+		fatalf("test must be provided")
 	}
+	testBin := flag.Args()[0] // Only argument.
 
 	log.SetLevel(log.Info)
 	if *debug {
@@ -436,15 +444,8 @@ func main() {
 		}
 	}
 
-	// Get path to test binary.
-	fullTestName := filepath.Join(testDir, *testName)
-	testBin, err := testutil.FindFile(fullTestName)
-	if err != nil {
-		fatalf("FindFile(%q) failed: %v", fullTestName, err)
-	}
-
 	// Get all test cases in each binary.
-	testCases, err := gtest.ParseTestCases(testBin)
+	testCases, err := gtest.ParseTestCases(testBin, true)
 	if err != nil {
 		fatalf("ParseTestCases(%q) failed: %v", testBin, err)
 	}
@@ -455,14 +456,19 @@ func main() {
 		fatalf("TestsForShard() failed: %v", err)
 	}
 
+	// Resolve the absolute path for the binary.
+	testBin, err = filepath.Abs(testBin)
+	if err != nil {
+		fatalf("Abs() failed: %v", err)
+	}
+
 	// Run the tests.
 	var tests []testing.InternalTest
 	for _, tci := range indices {
 		// Capture tc.
 		tc := testCases[tci]
-		testName := fmt.Sprintf("%s_%s", tc.Suite, tc.Name)
 		tests = append(tests, testing.InternalTest{
-			Name: testName,
+			Name: fmt.Sprintf("%s_%s", tc.Suite, tc.Name),
 			F: func(t *testing.T) {
 				if *parallel {
 					t.Parallel()
diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD
index 2c472bf8d..4cd627222 100644
--- a/test/runtimes/BUILD
+++ b/test/runtimes/BUILD
@@ -1,20 +1,7 @@
-# These packages are used to run language runtime tests inside gVisor sandboxes.
-
-load("//tools:defs.bzl", "go_binary", "go_test")
-load("//test/runtimes:build_defs.bzl", "runtime_test")
+load("//test/runtimes:defs.bzl", "runtime_test")
 
 package(licenses = ["notice"])
 
-go_binary(
-    name = "runner",
-    testonly = 1,
-    srcs = ["runner.go"],
-    deps = [
-        "//runsc/dockerutil",
-        "//runsc/testutil",
-    ],
-)
-
 runtime_test(
     name = "go1.12",
     blacklist_file = "blacklist_go1.12.csv",
@@ -44,10 +31,3 @@ runtime_test(
     blacklist_file = "blacklist_python3.7.3.csv",
     lang = "python",
 )
-
-go_test(
-    name = "blacklist_test",
-    size = "small",
-    srcs = ["blacklist_test.go"],
-    library = ":runner",
-)
diff --git a/test/runtimes/README.md b/test/runtimes/README.md
deleted file mode 100644
index 42d722553..000000000
--- a/test/runtimes/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Runtimes Tests Dockerfiles
-
-The Dockerfiles defined under this path are configured to host the execution of
-the runtimes language tests. Each Dockerfile can support the language indicated
-by its directory.
-
-The following runtimes are currently supported:
-
--   Go 1.12
--   Java 11
--   Node.js 12
--   PHP 7.3
--   Python 3.7
-
-### Building and pushing the images:
-
-The canonical source of images is the
-[gvisor-presubmit container registry](https://gcr.io/gvisor-presubmit/). You can
-build new images with the following command:
-
-```bash
-$ cd images
-$ docker build -f Dockerfile_$LANG [-t $NAME] .
-```
-
-To push them to our container registry, set the tag in the command above to
-`gcr.io/gvisor-presubmit/$LANG`, then push them. (Note that you will need
-appropriate permissions to the `gvisor-presubmit` GCP project.)
-
-```bash
-gcloud docker -- push gcr.io/gvisor-presubmit/$LANG
-```
-
-#### Running in Docker locally:
-
-1) [Install and configure Docker](https://docs.docker.com/install/)
-
-2) Pull the image you want to run:
-
-```bash
-$ docker pull gcr.io/gvisor-presubmit/$LANG
-```
-
-3) Run docker with the image.
-
-```bash
-$ docker run [--runtime=runsc] --rm -it $NAME [FLAG]
-```
-
-Running the command with no flags will cause all the available tests to execute.
-
-Flags can be added for additional functionality:
-
--   --list: Print a list of all available tests
--   --test &lt;name&gt;: Run a single test from the list of available tests
--   --v: Print the language version
diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl
deleted file mode 100644
index 92e275a76..000000000
--- a/test/runtimes/build_defs.bzl
+++ /dev/null
@@ -1,75 +0,0 @@
-"""Defines a rule for runtime test targets."""
-
-load("//tools:defs.bzl", "go_test", "loopback")
-
-def runtime_test(
-        name,
-        lang,
-        image_repo = "gcr.io/gvisor-presubmit",
-        image_name = None,
-        blacklist_file = None,
-        shard_count = 50,
-        size = "enormous"):
-    """Generates sh_test and blacklist test targets for a given runtime.
-
-    Args:
-      name: The name of the runtime being tested. Typically, the lang + version.
-          This is used in the names of the generated test targets.
-      lang: The language being tested.
-      image_repo: The docker repository containing the proctor image to run.
-          i.e., the prefix to the fully qualified docker image id.
-      image_name: The name of the image in the image_repo.
-          Defaults to the test name.
-      blacklist_file: A test blacklist to pass to the runtime test's runner.
-      shard_count: See Bazel common test attributes.
-      size: See Bazel common test attributes.
-    """
-    if image_name == None:
-        image_name = name
-    args = [
-        "--lang",
-        lang,
-        "--image",
-        "/".join([image_repo, image_name]),
-    ]
-    data = [
-        ":runner",
-        loopback,
-    ]
-    if blacklist_file:
-        args += ["--blacklist_file", "test/runtimes/" + blacklist_file]
-        data += [blacklist_file]
-
-        # Add a test that the blacklist parses correctly.
-        blacklist_test(name, blacklist_file)
-
-    sh_test(
-        name = name + "_test",
-        srcs = ["runner.sh"],
-        args = args,
-        data = data,
-        size = size,
-        shard_count = shard_count,
-        tags = [
-            # Requires docker and runsc to be configured before the test runs.
-            "local",
-            # Don't include test target in wildcard target patterns.
-            "manual",
-        ],
-    )
-
-def blacklist_test(name, blacklist_file):
-    """Test that a blacklist parses correctly."""
-    go_test(
-        name = name + "_blacklist_test",
-        library = ":runner",
-        srcs = ["blacklist_test.go"],
-        args = ["--blacklist_file", "test/runtimes/" + blacklist_file],
-        data = [blacklist_file],
-    )
-
-def sh_test(**kwargs):
-    """Wraps the standard sh_test."""
-    native.sh_test(
-        **kwargs
-    )
diff --git a/test/runtimes/defs.bzl b/test/runtimes/defs.bzl
new file mode 100644
index 000000000..f836dd952
--- /dev/null
+++ b/test/runtimes/defs.bzl
@@ -0,0 +1,79 @@
+"""Defines a rule for runtime test targets."""
+
+load("//tools:defs.bzl", "go_test")
+
+def _runtime_test_impl(ctx):
+    # Construct arguments.
+    args = [
+        "--lang",
+        ctx.attr.lang,
+        "--image",
+        ctx.attr.image,
+    ]
+    if ctx.attr.blacklist_file:
+        args += [
+            "--blacklist_file",
+            ctx.files.blacklist_file[0].short_path,
+        ]
+
+    # Build a runner.
+    runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        "%s %s\n" % (ctx.files._runner[0].short_path, " ".join(args)),
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+
+    # Return the runner.
+    return [DefaultInfo(
+        executable = runner,
+        runfiles = ctx.runfiles(
+            files = ctx.files._runner + ctx.files.blacklist_file + ctx.files._proctor,
+            collect_default = True,
+            collect_data = True,
+        ),
+    )]
+
+_runtime_test = rule(
+    implementation = _runtime_test_impl,
+    attrs = {
+        "image": attr.string(
+            mandatory = False,
+        ),
+        "lang": attr.string(
+            mandatory = True,
+        ),
+        "blacklist_file": attr.label(
+            mandatory = False,
+            allow_single_file = True,
+        ),
+        "_runner": attr.label(
+            default = "//test/runtimes/runner:runner",
+        ),
+        "_proctor": attr.label(
+            default = "//test/runtimes/proctor:proctor",
+        ),
+    },
+    test = True,
+)
+
+def runtime_test(name, **kwargs):
+    _runtime_test(
+        name = name,
+        image = name,  # Resolved as images/runtimes/%s.
+        tags = [
+            "local",
+            "manual",
+        ],
+        **kwargs
+    )
+
+def blacklist_test(name, blacklist_file):
+    """Test that a blacklist parses correctly."""
+    go_test(
+        name = name + "_blacklist_test",
+        library = ":runner",
+        srcs = ["blacklist_test.go"],
+        args = ["--blacklist_file", "test/runtimes/" + blacklist_file],
+        data = [blacklist_file],
+    )
diff --git a/test/runtimes/images/Dockerfile_go1.12 b/test/runtimes/images/Dockerfile_go1.12
deleted file mode 100644
index ab9d6abf3..000000000
--- a/test/runtimes/images/Dockerfile_go1.12
+++ /dev/null
@@ -1,10 +0,0 @@
-# Go is easy, since we already have everything we need to compile the proctor
-# binary and run the tests in the golang Docker image.
-FROM golang:1.12
-ADD ["proctor/", "/go/src/proctor/"]
-RUN ["go", "build", "-o", "/proctor", "/go/src/proctor"]
-
-# Pre-compile the tests so we don't need to do so in each test run.
-RUN ["go", "tool", "dist", "test", "-compile-only"]
-
-ENTRYPOINT ["/proctor", "--runtime=go"]
diff --git a/test/runtimes/images/proctor/BUILD b/test/runtimes/proctor/BUILD
index 85e004c45..da1e331e1 100644
--- a/test/runtimes/images/proctor/BUILD
+++ b/test/runtimes/proctor/BUILD
@@ -12,7 +12,8 @@ go_binary(
         "proctor.go",
         "python.go",
     ],
-    visibility = ["//test/runtimes/images:__subpackages__"],
+    pure = True,
+    visibility = ["//test/runtimes:__pkg__"],
 )
 
 go_test(
@@ -20,7 +21,8 @@ go_test(
     size = "small",
     srcs = ["proctor_test.go"],
     library = ":proctor",
+    nocgo = 1,
     deps = [
-        "//runsc/testutil",
+        "//pkg/test/testutil",
     ],
 )
diff --git a/test/runtimes/images/proctor/go.go b/test/runtimes/proctor/go.go
index 3e2d5d8db..3e2d5d8db 100644
--- a/test/runtimes/images/proctor/go.go
+++ b/test/runtimes/proctor/go.go
diff --git a/test/runtimes/images/proctor/java.go b/test/runtimes/proctor/java.go
index 8b362029d..8b362029d 100644
--- a/test/runtimes/images/proctor/java.go
+++ b/test/runtimes/proctor/java.go
diff --git a/test/runtimes/images/proctor/nodejs.go b/test/runtimes/proctor/nodejs.go
index bd57db444..bd57db444 100644
--- a/test/runtimes/images/proctor/nodejs.go
+++ b/test/runtimes/proctor/nodejs.go
diff --git a/test/runtimes/images/proctor/php.go b/test/runtimes/proctor/php.go
index 9115040e1..9115040e1 100644
--- a/test/runtimes/images/proctor/php.go
+++ b/test/runtimes/proctor/php.go
diff --git a/test/runtimes/images/proctor/proctor.go b/test/runtimes/proctor/proctor.go
index b54abe434..b54abe434 100644
--- a/test/runtimes/images/proctor/proctor.go
+++ b/test/runtimes/proctor/proctor.go
diff --git a/test/runtimes/images/proctor/proctor_test.go b/test/runtimes/proctor/proctor_test.go
index 6bb61d142..6ef2de085 100644
--- a/test/runtimes/images/proctor/proctor_test.go
+++ b/test/runtimes/proctor/proctor_test.go
@@ -23,24 +23,24 @@ import (
 	"strings"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 func touch(t *testing.T, name string) {
 	t.Helper()
 	f, err := os.Create(name)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error creating file %q: %v", name, err)
 	}
 	if err := f.Close(); err != nil {
-		t.Fatal(err)
+		t.Fatalf("error closing file %q: %v", name, err)
 	}
 }
 
 func TestSearchEmptyDir(t *testing.T) {
 	td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error creating searchtest: %v", err)
 	}
 	defer os.RemoveAll(td)
 
@@ -60,7 +60,7 @@ func TestSearchEmptyDir(t *testing.T) {
 func TestSearch(t *testing.T) {
 	td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest")
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("error creating searchtest: %v", err)
 	}
 	defer os.RemoveAll(td)
 
@@ -101,14 +101,14 @@ func TestSearch(t *testing.T) {
 		if strings.HasSuffix(item, "/") {
 			// This item is a directory, create it.
 			if err := os.MkdirAll(filepath.Join(td, item), 0755); err != nil {
-				t.Fatal(err)
+				t.Fatalf("error making directory: %v", err)
 			}
 		} else {
 			// This item is a file, create the directory and touch file.
 			// Create directory in which file should be created
 			fullDirPath := filepath.Join(td, filepath.Dir(item))
 			if err := os.MkdirAll(fullDirPath, 0755); err != nil {
-				t.Fatal(err)
+				t.Fatalf("error making directory: %v", err)
 			}
 			// Create file with full path to file.
 			touch(t, filepath.Join(td, item))
diff --git a/test/runtimes/images/proctor/python.go b/test/runtimes/proctor/python.go
index b9e0fbe6f..b9e0fbe6f 100644
--- a/test/runtimes/images/proctor/python.go
+++ b/test/runtimes/proctor/python.go
diff --git a/test/runtimes/runner/BUILD b/test/runtimes/runner/BUILD
new file mode 100644
index 000000000..63924b9c5
--- /dev/null
+++ b/test/runtimes/runner/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_binary", "go_test")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "runner",
+    testonly = 1,
+    srcs = ["main.go"],
+    visibility = ["//test/runtimes:__pkg__"],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//pkg/test/testutil",
+    ],
+)
+
+go_test(
+    name = "blacklist_test",
+    size = "small",
+    srcs = ["blacklist_test.go"],
+    library = ":runner",
+)
diff --git a/test/runtimes/blacklist_test.go b/test/runtimes/runner/blacklist_test.go
index 52f49b984..0ff69ab18 100644
--- a/test/runtimes/blacklist_test.go
+++ b/test/runtimes/runner/blacklist_test.go
@@ -32,6 +32,6 @@ func TestBlacklists(t *testing.T) {
 		t.Fatalf("error parsing blacklist: %v", err)
 	}
 	if *blacklistFile != "" && len(bl) == 0 {
-		t.Errorf("got empty blacklist for file %q", blacklistFile)
+		t.Errorf("got empty blacklist for file %q", *blacklistFile)
 	}
 }
diff --git a/test/runtimes/runner.go b/test/runtimes/runner/main.go
index ddb890dbc..57540e00e 100644
--- a/test/runtimes/runner.go
+++ b/test/runtimes/runner/main.go
@@ -26,8 +26,8 @@ import (
 	"testing"
 	"time"
 
-	"gvisor.dev/gvisor/runsc/dockerutil"
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 var (
@@ -45,7 +45,6 @@ func main() {
 		fmt.Fprintf(os.Stderr, "lang and image flags must not be empty\n")
 		os.Exit(1)
 	}
-
 	os.Exit(runTests())
 }
 
@@ -60,8 +59,8 @@ func runTests() int {
 		return 1
 	}
 
-	// Create a single docker container that will be used for all tests.
-	d := dockerutil.MakeDocker("gvisor-" + *lang)
+	// Construct the shared docker instance.
+	d := dockerutil.MakeDocker(testutil.DefaultLogger(*lang))
 	defer d.CleanUp()
 
 	// Get a slice of tests to run. This will also start a single Docker
@@ -77,21 +76,18 @@ func runTests() int {
 	return m.Run()
 }
 
-// getTests returns a slice of tests to run, subject to the shard size and
-// index.
-func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.InternalTest, error) {
-	// Pull the image.
-	if err := dockerutil.Pull(*image); err != nil {
-		return nil, fmt.Errorf("docker pull %q failed: %v", *image, err)
-	}
-
-	// Run proctor with --pause flag to keep container alive forever.
-	if err := d.Run(*image, "--pause"); err != nil {
+// getTests executes all tests as table tests.
+func getTests(d *dockerutil.Docker, blacklist map[string]struct{}) ([]testing.InternalTest, error) {
+	// Start the container.
+	d.CopyFiles("/proctor", "test/runtimes/proctor/proctor")
+	if err := d.Spawn(dockerutil.RunOpts{
+		Image: fmt.Sprintf("runtimes/%s", *image),
+	}, "/proctor/proctor", "--pause"); err != nil {
 		return nil, fmt.Errorf("docker run failed: %v", err)
 	}
 
 	// Get a list of all tests in the image.
-	list, err := d.Exec("/proctor", "--runtime", *lang, "--list")
+	list, err := d.Exec(dockerutil.RunOpts{}, "/proctor/proctor", "--runtime", *lang, "--list")
 	if err != nil {
 		return nil, fmt.Errorf("docker exec failed: %v", err)
 	}
@@ -114,7 +110,7 @@ func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.Int
 			F: func(t *testing.T) {
 				// Is the test blacklisted?
 				if _, ok := blacklist[tc]; ok {
-					t.Skip("SKIP: blacklisted test %q", tc)
+					t.Skipf("SKIP: blacklisted test %q", tc)
 				}
 
 				var (
@@ -126,7 +122,7 @@ func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.Int
 
 				go func() {
 					fmt.Printf("RUNNING %s...\n", tc)
-					output, err = d.Exec("/proctor", "--runtime", *lang, "--test", tc)
+					output, err = d.Exec(dockerutil.RunOpts{}, "/proctor/proctor", "--runtime", *lang, "--test", tc)
 					close(done)
 				}()
 
@@ -143,6 +139,7 @@ func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.Int
 			},
 		})
 	}
+
 	return itests, nil
 }
 
@@ -153,11 +150,7 @@ func getBlacklist() (map[string]struct{}, error) {
 	if *blacklistFile == "" {
 		return blacklist, nil
 	}
-	file, err := testutil.FindFile(*blacklistFile)
-	if err != nil {
-		return nil, err
-	}
-	f, err := os.Open(file)
+	f, err := os.Open(*blacklistFile)
 	if err != nil {
 		return nil, err
 	}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 31d239c0e..d68afbe44 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -1,24 +1,34 @@
-load("//tools:defs.bzl", "go_binary")
-load("//test/syscalls:build_defs.bzl", "syscall_test")
+load("//test/runner:defs.bzl", "syscall_test")
 
 package(licenses = ["notice"])
 
-syscall_test(test = "//test/syscalls/linux:32bit_test")
+syscall_test(
+    test = "//test/syscalls/linux:32bit_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:accept_bind_stream_test")
+syscall_test(
+    test = "//test/syscalls/linux:accept_bind_stream_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:accept_bind_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:access_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:affinity_test")
+syscall_test(
+    test = "//test/syscalls/linux:affinity_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
@@ -29,11 +39,18 @@ syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:alarm_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:arch_prctl_test")
+syscall_test(
+    test = "//test/syscalls/linux:arch_prctl_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:bad_test")
+syscall_test(
+    test = "//test/syscalls/linux:bad_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
@@ -41,9 +58,15 @@ syscall_test(
     test = "//test/syscalls/linux:bind_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:brk_test")
+syscall_test(
+    test = "//test/syscalls/linux:brk_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
@@ -52,16 +75,19 @@ syscall_test(
     # involve much concurrency, TSAN's usefulness here is limited anyway.
     tags = ["nogotsan"],
     test = "//test/syscalls/linux:socket_stress_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chdir_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chmod_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -69,6 +95,7 @@ syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chown_test",
     use_tmpfs = True,  # chwon tests require gofer to be running as root.
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -76,45 +103,70 @@ syscall_test(
     test = "//test/syscalls/linux:chroot_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:clock_getres_test")
+syscall_test(
+    test = "//test/syscalls/linux:clock_getres_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:clock_gettime_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:clock_nanosleep_test")
+syscall_test(
+    test = "//test/syscalls/linux:clock_nanosleep_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:concurrency_test")
+syscall_test(
+    test = "//test/syscalls/linux:concurrency_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_uds_tree = True,
     test = "//test/syscalls/linux:connect_external_test",
     use_tmpfs = True,
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:creat_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:dev_test")
+syscall_test(
+    test = "//test/syscalls/linux:dev_test",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:dup_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:epoll_test")
+syscall_test(
+    test = "//test/syscalls/linux:epoll_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:eventfd_test")
+syscall_test(
+    test = "//test/syscalls/linux:eventfd_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:exceptions_test")
+syscall_test(
+    test = "//test/syscalls/linux:exceptions_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     add_overlay = True,
     test = "//test/syscalls/linux:exec_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -123,7 +175,10 @@ syscall_test(
     test = "//test/syscalls/linux:exec_binary_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:exit_test")
+syscall_test(
+    test = "//test/syscalls/linux:exit_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
@@ -135,11 +190,15 @@ syscall_test(
     test = "//test/syscalls/linux:fallocate_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:fault_test")
+syscall_test(
+    test = "//test/syscalls/linux:fault_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:fchdir_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -153,11 +212,20 @@ syscall_test(
     test = "//test/syscalls/linux:flock_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:fork_test")
+syscall_test(
+    test = "//test/syscalls/linux:fork_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:fpsig_fork_test")
+syscall_test(
+    test = "//test/syscalls/linux:fpsig_fork_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:fpsig_nested_test")
+syscall_test(
+    test = "//test/syscalls/linux:fpsig_nested_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
@@ -168,20 +236,33 @@ syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:futex_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:getcpu_host_test")
+syscall_test(
+    test = "//test/syscalls/linux:getcpu_host_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:getcpu_test")
+syscall_test(
+    test = "//test/syscalls/linux:getcpu_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:getdents_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:getrandom_test")
+syscall_test(
+    test = "//test/syscalls/linux:getrandom_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:getrusage_test")
+syscall_test(
+    test = "//test/syscalls/linux:getrusage_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
@@ -197,15 +278,20 @@ syscall_test(
 
 syscall_test(
     test = "//test/syscalls/linux:iptables_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 5,
     test = "//test/syscalls/linux:itimer_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:kill_test")
+syscall_test(
+    test = "//test/syscalls/linux:kill_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
@@ -216,19 +302,33 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:lseek_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:madvise_test")
+syscall_test(
+    test = "//test/syscalls/linux:madvise_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:memory_accounting_test")
+syscall_test(
+    test = "//test/syscalls/linux:memory_accounting_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:mempolicy_test")
+syscall_test(
+    test = "//test/syscalls/linux:mempolicy_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:mincore_test")
+syscall_test(
+    test = "//test/syscalls/linux:mincore_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:mkdir_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -250,18 +350,29 @@ syscall_test(
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:mremap_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:msync_test",
+    vfs2 = "True",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:munmap_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:munmap_test")
+syscall_test(
+    test = "//test/syscalls/linux:network_namespace_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:open_create_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -269,40 +380,73 @@ syscall_test(
     test = "//test/syscalls/linux:open_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:packet_socket_raw_test")
+syscall_test(
+    test = "//test/syscalls/linux:packet_socket_raw_test",
+    vfs2 = "True",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:packet_socket_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:packet_socket_test")
+syscall_test(
+    test = "//test/syscalls/linux:partial_bad_buffer_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:partial_bad_buffer_test")
+syscall_test(
+    test = "//test/syscalls/linux:pause_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:pause_test")
+syscall_test(
+    size = "medium",
+    # Takes too long under gotsan to run.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:ping_socket_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
     add_overlay = True,
     shard_count = 5,
     test = "//test/syscalls/linux:pipe_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:poll_test")
+syscall_test(
+    test = "//test/syscalls/linux:poll_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:ppoll_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:prctl_setuid_test")
+syscall_test(
+    test = "//test/syscalls/linux:prctl_setuid_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:prctl_test")
+syscall_test(
+    test = "//test/syscalls/linux:prctl_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:pread64_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:preadv_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -310,32 +454,56 @@ syscall_test(
     test = "//test/syscalls/linux:preadv2_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:priority_test")
+syscall_test(
+    test = "//test/syscalls/linux:priority_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:proc_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_net_test",
+    vfs2 = "True",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:proc_pid_oomscore_test",
+    vfs2 = "True",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:proc_pid_smaps_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:proc_net_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_pid_uid_gid_map_test",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:pselect_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:ptrace_test")
+syscall_test(
+    test = "//test/syscalls/linux:ptrace_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:pty_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     test = "//test/syscalls/linux:pty_root_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -346,17 +514,28 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:pwrite64_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:raw_socket_hdrincl_test")
+syscall_test(
+    test = "//test/syscalls/linux:raw_socket_hdrincl_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:raw_socket_icmp_test")
+syscall_test(
+    test = "//test/syscalls/linux:raw_socket_icmp_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:raw_socket_ipv4_test")
+syscall_test(
+    test = "//test/syscalls/linux:raw_socket_ipv4_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:read_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -368,12 +547,14 @@ syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:readv_socket_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     add_overlay = True,
     test = "//test/syscalls/linux:readv_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -382,25 +563,50 @@ syscall_test(
     test = "//test/syscalls/linux:rename_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:rlimits_test")
+syscall_test(
+    test = "//test/syscalls/linux:rlimits_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:rseq_test")
+syscall_test(
+    test = "//test/syscalls/linux:rseq_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:rtsignal_test")
+syscall_test(
+    test = "//test/syscalls/linux:rtsignal_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:signalfd_test")
+syscall_test(
+    test = "//test/syscalls/linux:signalfd_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:sched_test")
+syscall_test(
+    test = "//test/syscalls/linux:sched_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:sched_yield_test")
+syscall_test(
+    test = "//test/syscalls/linux:sched_yield_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:seccomp_test")
+syscall_test(
+    test = "//test/syscalls/linux:seccomp_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:select_test")
+syscall_test(
+    test = "//test/syscalls/linux:select_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     shard_count = 20,
     test = "//test/syscalls/linux:semaphore_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -416,49 +622,68 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:splice_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:sigaction_test")
+syscall_test(
+    test = "//test/syscalls/linux:sigaction_test",
+    vfs2 = "True",
+)
 
 # TODO(b/119826902): Enable once the test passes in runsc.
-# syscall_test(test = "//test/syscalls/linux:sigaltstack_test")
+# syscall_test(vfs2="True",test = "//test/syscalls/linux:sigaltstack_test")
 
-syscall_test(test = "//test/syscalls/linux:sigiret_test")
+syscall_test(
+    test = "//test/syscalls/linux:sigiret_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:sigprocmask_test")
+syscall_test(
+    test = "//test/syscalls/linux:sigprocmask_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:sigstop_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:sigtimedwait_test")
+syscall_test(
+    test = "//test/syscalls/linux:sigtimedwait_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:shm_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_abstract_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_abstract_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_domain_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_domain_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -483,59 +708,99 @@ syscall_test(
 syscall_test(
     size = "large",
     shard_count = 50,
+    # Takes too long for TSAN. Creates a lot of TCP sockets.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:socket_inet_loopback_nogotsan_test",
+)
+
+syscall_test(
+    size = "large",
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_generic_loopback_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_ip_tcp_loopback_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_loopback_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_udp_generic_loopback_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_ip_udp_loopback_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_udp_loopback_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:socket_ip_unbound_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_ip_unbound_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_netdevice_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_netdevice_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_netlink_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_netlink_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_netlink_route_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_netlink_route_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_netlink_uevent_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_netlink_uevent_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_blocking_local_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_blocking_local_test",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_blocking_ip_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_blocking_ip_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_non_stream_blocking_local_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_non_stream_blocking_local_test",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_non_stream_blocking_udp_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_non_stream_blocking_udp_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
@@ -545,6 +810,7 @@ syscall_test(
 syscall_test(
     size = "large",
     test = "//test/syscalls/linux:socket_stream_blocking_tcp_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -567,6 +833,7 @@ syscall_test(
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_unix_dgram_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -574,6 +841,7 @@ syscall_test(
     add_overlay = True,
     shard_count = 50,
     test = "//test/syscalls/linux:socket_unix_pair_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -591,11 +859,13 @@ syscall_test(
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_unix_unbound_abstract_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_unix_unbound_dgram_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -607,6 +877,7 @@ syscall_test(
     size = "medium",
     shard_count = 10,
     test = "//test/syscalls/linux:socket_unix_unbound_seqpacket_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -618,6 +889,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:statfs_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -628,6 +900,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:stat_times_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -643,6 +916,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:sync_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -650,97 +924,151 @@ syscall_test(
     test = "//test/syscalls/linux:sync_file_range_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:sysinfo_test")
+syscall_test(
+    test = "//test/syscalls/linux:sysinfo_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:syslog_test")
+syscall_test(
+    test = "//test/syscalls/linux:syslog_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:sysret_test")
+syscall_test(
+    test = "//test/syscalls/linux:sysret_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     shard_count = 10,
     test = "//test/syscalls/linux:tcp_socket_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:tgkill_test")
+syscall_test(
+    test = "//test/syscalls/linux:tgkill_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:timerfd_test")
+syscall_test(
+    test = "//test/syscalls/linux:timerfd_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:timers_test")
+syscall_test(
+    test = "//test/syscalls/linux:timers_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:time_test")
+syscall_test(
+    test = "//test/syscalls/linux:time_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:tkill_test")
+syscall_test(
+    test = "//test/syscalls/linux:tkill_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:truncate_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:udp_bind_test")
+syscall_test(
+    test = "//test/syscalls/linux:tuntap_test",
+)
+
+syscall_test(
+    add_hostinet = True,
+    test = "//test/syscalls/linux:tuntap_hostinet_test",
+    vfs2 = "True",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:udp_bind_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     add_hostinet = True,
     shard_count = 10,
     test = "//test/syscalls/linux:udp_socket_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:uidgid_test")
+syscall_test(
+    test = "//test/syscalls/linux:uidgid_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:uname_test")
+syscall_test(
+    test = "//test/syscalls/linux:uname_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:unlink_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:unshare_test")
+syscall_test(
+    test = "//test/syscalls/linux:unshare_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:utimes_test")
+syscall_test(
+    test = "//test/syscalls/linux:utimes_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:vdso_clock_gettime_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:vdso_test")
+syscall_test(
+    test = "//test/syscalls/linux:vdso_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:vsyscall_test")
+syscall_test(
+    test = "//test/syscalls/linux:vsyscall_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:vfork_test")
+syscall_test(
+    test = "//test/syscalls/linux:vfork_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:wait_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:write_test",
+    vfs2 = "True",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:proc_net_unix_test",
+)
+
+syscall_test(
+    test = "//test/syscalls/linux:proc_net_tcp_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:proc_net_unix_test")
-
-syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
-
-syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
-
-go_binary(
-    name = "syscall_test_runner",
-    testonly = 1,
-    srcs = ["syscall_test_runner.go"],
-    data = [
-        "//runsc",
-    ],
-    deps = [
-        "//pkg/log",
-        "//runsc/specutils",
-        "//runsc/testutil",
-        "//test/syscalls/gtest",
-        "//test/uds",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
-        "@org_golang_x_sys//unix:go_default_library",
-    ],
+syscall_test(
+    test = "//test/syscalls/linux:proc_net_udp_test",
+    vfs2 = "True",
 )
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
deleted file mode 100644
index cbab85ef7..000000000
--- a/test/syscalls/build_defs.bzl
+++ /dev/null
@@ -1,180 +0,0 @@
-"""Defines a rule for syscall test targets."""
-
-load("//tools:defs.bzl", "loopback")
-
-def syscall_test(
-        test,
-        shard_count = 5,
-        size = "small",
-        use_tmpfs = False,
-        add_overlay = False,
-        add_uds_tree = False,
-        add_hostinet = False,
-        tags = None):
-    """syscall_test is a macro that will create targets for all platforms.
-
-    Args:
-      test: the test target.
-      shard_count: shards for defined tests.
-      size: the defined test size.
-      use_tmpfs: use tmpfs in the defined tests.
-      add_overlay: add an overlay test.
-      add_uds_tree: add a UDS test.
-      add_hostinet: add a hostinet test.
-      tags: starting test tags.
-    """
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "native",
-        use_tmpfs = False,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "kvm",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "ptrace",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    if add_overlay:
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-            overlay = True,
-        )
-
-    if not use_tmpfs:
-        # Also test shared gofer access.
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = use_tmpfs,
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-            file_access = "shared",
-        )
-
-    if add_hostinet:
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = use_tmpfs,
-            network = "host",
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-        )
-
-def _syscall_test(
-        test,
-        shard_count,
-        size,
-        platform,
-        use_tmpfs,
-        tags,
-        network = "none",
-        file_access = "exclusive",
-        overlay = False,
-        add_uds_tree = False):
-    test_name = test.split(":")[1]
-
-    # Prepend "runsc" to non-native platform names.
-    full_platform = platform if platform == "native" else "runsc_" + platform
-
-    name = test_name + "_" + full_platform
-    if file_access == "shared":
-        name += "_shared"
-    if overlay:
-        name += "_overlay"
-    if network != "none":
-        name += "_" + network + "net"
-
-    if tags == None:
-        tags = []
-
-    # Add the full_platform and file access in a tag to make it easier to run
-    # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
-    tags += [full_platform, "file_" + file_access]
-
-    # Hash this target into one of 15 buckets. This can be used to
-    # randomly split targets between different workflows.
-    hash15 = hash(native.package_name() + name) % 15
-    tags.append("hash15:" + str(hash15))
-
-    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
-    # we figure out how to request ipv4 sockets on Guitar machines.
-    if network == "host":
-        tags.append("noguitar")
-
-    # Disable off-host networking.
-    tags.append("requires-net:loopback")
-
-    # Add tag to prevent the tests from running in a Bazel sandbox.
-    # TODO(b/120560048): Make the tests run without this tag.
-    tags.append("no-sandbox")
-
-    # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
-    # more stable.
-    if platform == "kvm":
-        tags.append("manual")
-        tags.append("requires-kvm")
-
-        # TODO(b/112165693): Remove when tests pass reliably.
-        tags.append("notap")
-
-    args = [
-        # Arguments are passed directly to syscall_test_runner binary.
-        "--test-name=" + test_name,
-        "--platform=" + platform,
-        "--network=" + network,
-        "--use-tmpfs=" + str(use_tmpfs),
-        "--file-access=" + file_access,
-        "--overlay=" + str(overlay),
-        "--add-uds-tree=" + str(add_uds_tree),
-    ]
-
-    sh_test(
-        srcs = ["syscall_test_runner.sh"],
-        name = name,
-        data = [
-            ":syscall_test_runner",
-            loopback,
-            test,
-        ],
-        args = args,
-        size = size,
-        tags = tags,
-        shard_count = shard_count,
-    )
-
-def sh_test(**kwargs):
-    """Wraps the standard sh_test."""
-    native.sh_test(
-        **kwargs
-    )
diff --git a/test/syscalls/gtest/gtest.go b/test/syscalls/gtest/gtest.go
deleted file mode 100644
index bdec8eb07..000000000
--- a/test/syscalls/gtest/gtest.go
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package gtest contains helpers for running google-test tests from Go.
-package gtest
-
-import (
-	"fmt"
-	"os/exec"
-	"strings"
-)
-
-var (
-	// ListTestFlag is the flag that will list tests in gtest binaries.
-	ListTestFlag = "--gtest_list_tests"
-
-	// FilterTestFlag is the flag that will filter tests in gtest binaries.
-	FilterTestFlag = "--gtest_filter"
-)
-
-// TestCase is a single gtest test case.
-type TestCase struct {
-	// Suite is the suite for this test.
-	Suite string
-
-	// Name is the name of this individual test.
-	Name string
-}
-
-// FullName returns the name of the test including the suite. It is suitable to
-// pass to "-gtest_filter".
-func (tc TestCase) FullName() string {
-	return fmt.Sprintf("%s.%s", tc.Suite, tc.Name)
-}
-
-// ParseTestCases calls a gtest test binary to list its test and returns a
-// slice with the name and suite of each test.
-func ParseTestCases(testBin string, extraArgs ...string) ([]TestCase, error) {
-	args := append([]string{ListTestFlag}, extraArgs...)
-	cmd := exec.Command(testBin, args...)
-	out, err := cmd.Output()
-	if err != nil {
-		exitErr, ok := err.(*exec.ExitError)
-		if !ok {
-			return nil, fmt.Errorf("could not enumerate gtest tests: %v", err)
-		}
-		return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr)
-	}
-
-	var t []TestCase
-	var suite string
-	for _, line := range strings.Split(string(out), "\n") {
-		// Strip comments.
-		line = strings.Split(line, "#")[0]
-
-		// New suite?
-		if !strings.HasPrefix(line, " ") {
-			suite = strings.TrimSuffix(strings.TrimSpace(line), ".")
-			continue
-		}
-
-		// Individual test.
-		name := strings.TrimSpace(line)
-
-		// Do we have a suite yet?
-		if suite == "" {
-			return nil, fmt.Errorf("test without a suite: %v", name)
-		}
-
-		// Add this individual test.
-		t = append(t, TestCase{
-			Suite: suite,
-			Name:  name,
-		})
-
-	}
-
-	if len(t) == 0 {
-		return nil, fmt.Errorf("no tests parsed from %v", testBin)
-	}
-	return t, nil
-}
diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index c47a05181..3c825477c 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -74,7 +74,7 @@ void ExitGroup32(const char instruction[2], int code) {
       "int $3\n"
       :
       : [ code ] "m"(code), [ ip ] "d"(m.ptr())
-      : "rax", "rbx", "rsp");
+      : "rax", "rbx");
 }
 
 constexpr int kExitCode = 42;
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e7c82adfc..ae2aa44dc 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -12,8 +12,12 @@ exports_files(
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_generic_loopback.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_tcp_loopback_blocking.cc",
+        "socket_ip_tcp_loopback_nonblock.cc",
         "socket_ip_tcp_udp_generic.cc",
         "socket_ip_udp_loopback.cc",
+        "socket_ip_udp_loopback_blocking.cc",
+        "socket_ip_udp_loopback_nonblock.cc",
         "socket_ip_unbound.cc",
         "socket_ipv4_tcp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_external_networking_test.cc",
@@ -128,6 +132,16 @@ cc_library(
 )
 
 cc_library(
+    name = "socket_netlink_route_util",
+    testonly = 1,
+    srcs = ["socket_netlink_route_util.cc"],
+    hdrs = ["socket_netlink_route_util.h"],
+    deps = [
+        ":socket_netlink_util",
+    ],
+)
+
+cc_library(
     name = "socket_test_util",
     testonly = 1,
     srcs = [
@@ -152,11 +166,6 @@ cc_library(
 )
 
 cc_library(
-    name = "temp_umask",
-    hdrs = ["temp_umask.h"],
-)
-
-cc_library(
     name = "unix_domain_socket_test_util",
     testonly = 1,
     srcs = ["unix_domain_socket_test_util.cc"],
@@ -356,6 +365,7 @@ cc_binary(
         ":socket_test_util",
         "//test/util:file_descriptor",
         gtest,
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
@@ -593,7 +603,10 @@ cc_binary(
 cc_binary(
     name = "exceptions_test",
     testonly = 1,
-    srcs = ["exceptions.cc"],
+    srcs = select_arch(
+        amd64 = ["exceptions.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         gtest,
@@ -650,10 +663,7 @@ cc_binary(
 cc_binary(
     name = "exec_binary_test",
     testonly = 1,
-    srcs = select_arch(
-        amd64 = ["exec_binary.cc"],
-        arm64 = [],
-    ),
+    srcs = ["exec_binary.cc"],
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
@@ -825,10 +835,7 @@ cc_binary(
 cc_binary(
     name = "fpsig_fork_test",
     testonly = 1,
-    srcs = select_arch(
-        amd64 = ["fpsig_fork.cc"],
-        arm64 = [],
-    ),
+    srcs = ["fpsig_fork.cc"],
     linkstatic = 1,
     deps = [
         gtest,
@@ -944,6 +951,7 @@ cc_binary(
         "//test/util:epoll_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
@@ -1125,11 +1133,11 @@ cc_binary(
     srcs = ["mkdir.cc"],
     linkstatic = 1,
     deps = [
-        ":temp_umask",
         "//test/util:capability_util",
         "//test/util:fs_util",
         gtest,
         "//test/util:temp_path",
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
@@ -1284,12 +1292,12 @@ cc_binary(
     srcs = ["open_create.cc"],
     linkstatic = 1,
     deps = [
-        ":temp_umask",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
         gtest,
         "//test/util:temp_path",
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
@@ -1375,7 +1383,7 @@ cc_binary(
     srcs = ["partial_bad_buffer.cc"],
     linkstatic = 1,
     deps = [
-        "//test/syscalls/linux:socket_test_util",
+        ":socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
         "@com_google_absl//absl/time",
@@ -1404,6 +1412,21 @@ cc_binary(
 )
 
 cc_binary(
+    name = "ping_socket_test",
+    testonly = 1,
+    srcs = ["ping_socket.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        gtest,
+        "//test/util:save_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
     name = "pipe_test",
     testonly = 1,
     srcs = ["pipe.cc"],
@@ -1460,7 +1483,10 @@ cc_binary(
 cc_binary(
     name = "arch_prctl_test",
     testonly = 1,
-    srcs = ["arch_prctl.cc"],
+    srcs = select_arch(
+        amd64 = ["arch_prctl.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
@@ -1616,6 +1642,19 @@ cc_binary(
 )
 
 cc_binary(
+    name = "proc_pid_oomscore_test",
+    testonly = 1,
+    srcs = ["proc_pid_oomscore.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:fs_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_binary(
     name = "proc_pid_smaps_test",
     testonly = 1,
     srcs = ["proc_pid_smaps.cc"],
@@ -1997,6 +2036,8 @@ cc_binary(
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings",
         gtest,
+        ":ip_socket_test_util",
+        ":unix_domain_socket_test_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
@@ -2592,6 +2633,7 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        "@com_google_absl//absl/container:node_hash_map",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
@@ -2754,6 +2796,26 @@ cc_binary(
 )
 
 cc_binary(
+    name = "socket_inet_loopback_nogotsan_test",
+    testonly = 1,
+    srcs = ["socket_inet_loopback_nogotsan.cc"],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        gtest,
+        "//test/util:posix_error",
+        "//test/util:save_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
     name = "socket_netlink_test",
     testonly = 1,
     srcs = ["socket_netlink.cc"],
@@ -2773,13 +2835,13 @@ cc_binary(
     srcs = ["socket_netlink_route.cc"],
     linkstatic = 1,
     deps = [
+        ":socket_netlink_route_util",
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:optional",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
@@ -3259,6 +3321,7 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/time",
         gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
@@ -3426,6 +3489,37 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "tuntap_test",
+    testonly = 1,
+    srcs = ["tuntap.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        ":socket_netlink_route_util",
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_binary(
+    name = "tuntap_hostinet_test",
+    testonly = 1,
+    srcs = ["tuntap_hostinet.cc"],
+    linkstatic = 1,
+    deps = [
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
@@ -3636,6 +3730,22 @@ cc_binary(
 )
 
 cc_binary(
+    name = "network_namespace_test",
+    testonly = 1,
+    srcs = ["network_namespace.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/util:capability_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
     name = "semaphore_test",
     testonly = 1,
     srcs = ["semaphore.cc"],
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index e08c578f0..f65a14fb8 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <stdio.h>
+#include <sys/socket.h>
 #include <sys/un.h>
 
 #include <algorithm>
@@ -141,6 +142,47 @@ TEST_P(AllSocketPairTest, Connect) {
               SyscallSucceeds());
 }
 
+TEST_P(AllSocketPairTest, ConnectWithWrongType) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int type;
+  socklen_t typelen = sizeof(type);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_TYPE, &type, &typelen),
+      SyscallSucceeds());
+  switch (type) {
+    case SOCK_STREAM:
+      type = SOCK_SEQPACKET;
+      break;
+    case SOCK_SEQPACKET:
+      type = SOCK_STREAM;
+      break;
+  }
+
+  const FileDescriptor another_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, type, 0));
+
+  ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+                   sockets->first_addr_size()),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+  if (sockets->first_addr()->sa_data[0] != 0) {
+    ASSERT_THAT(connect(another_socket.get(), sockets->first_addr(),
+                        sockets->first_addr_size()),
+                SyscallFailsWithErrno(EPROTOTYPE));
+  } else {
+    ASSERT_THAT(connect(another_socket.get(), sockets->first_addr(),
+                        sockets->first_addr_size()),
+                SyscallFailsWithErrno(ECONNREFUSED));
+  }
+
+  ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+                      sockets->first_addr_size()),
+              SyscallSucceeds());
+}
+
 TEST_P(AllSocketPairTest, ConnectNonListening) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
index a33daff17..806d5729e 100644
--- a/test/syscalls/linux/aio.cc
+++ b/test/syscalls/linux/aio.cc
@@ -89,6 +89,7 @@ class AIOTest : public FileTest {
     FileTest::TearDown();
     if (ctx_ != 0) {
       ASSERT_THAT(DestroyContext(), SyscallSucceeds());
+      ctx_ = 0;
     }
   }
 
@@ -188,14 +189,19 @@ TEST_F(AIOTest, BadWrite) {
 }
 
 TEST_F(AIOTest, ExitWithPendingIo) {
-  // Setup a context that is 5 entries deep.
-  ASSERT_THAT(SetupContext(5), SyscallSucceeds());
+  // Setup a context that is 100 entries deep.
+  ASSERT_THAT(SetupContext(100), SyscallSucceeds());
 
   struct iocb cb = CreateCallback();
   struct iocb* cbs[] = {&cb};
 
   // Submit a request but don't complete it to make it pending.
-  EXPECT_THAT(Submit(1, cbs), SyscallSucceeds());
+  for (int i = 0; i < 100; ++i) {
+    EXPECT_THAT(Submit(1, cbs), SyscallSucceeds());
+  }
+
+  ASSERT_THAT(DestroyContext(), SyscallSucceeds());
+  ctx_ = 0;
 }
 
 int Submitter(void* arg) {
diff --git a/test/syscalls/linux/alarm.cc b/test/syscalls/linux/alarm.cc
index d89269985..940c97285 100644
--- a/test/syscalls/linux/alarm.cc
+++ b/test/syscalls/linux/alarm.cc
@@ -188,6 +188,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc
index adfb149df..a26fc6af3 100644
--- a/test/syscalls/linux/bad.cc
+++ b/test/syscalls/linux/bad.cc
@@ -28,7 +28,7 @@ namespace {
 constexpr uint32_t kNotImplementedSyscall = SYS_get_kernel_syms;
 #elif __aarch64__
 // Use the last of arch_specific_syscalls which are not implemented on arm64.
-constexpr uint32_t kNotImplementedSyscall = SYS_arch_specific_syscall + 15;
+constexpr uint32_t kNotImplementedSyscall = __NR_arch_specific_syscall + 15;
 #endif
 
 TEST(BadSyscallTest, NotImplemented) {
diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc
index a4f8f3cec..f57d38dc7 100644
--- a/test/syscalls/linux/epoll.cc
+++ b/test/syscalls/linux/epoll.cc
@@ -56,10 +56,6 @@ TEST(EpollTest, AllWritable) {
   struct epoll_event result[kFDsPerEpoll];
   ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
               SyscallSucceedsWithValue(kFDsPerEpoll));
-  // TODO(edahlgren): Why do some tests check epoll_event::data, and others
-  // don't? Does Linux actually guarantee that, in any of these test cases,
-  // epoll_wait will necessarily write out the epoll_events in the order that
-  // they were registered?
   for (int i = 0; i < kFDsPerEpoll; i++) {
     ASSERT_EQ(result[i].events, EPOLLOUT);
   }
diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc
index 927001eee..548b05a64 100644
--- a/test/syscalls/linux/eventfd.cc
+++ b/test/syscalls/linux/eventfd.cc
@@ -100,6 +100,22 @@ TEST(EventfdTest, SmallRead) {
   ASSERT_THAT(read(efd.get(), &l, 4), SyscallFailsWithErrno(EINVAL));
 }
 
+TEST(EventfdTest, PreadIllegalSeek) {
+  FileDescriptor efd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+
+  uint64_t l = 0;
+  ASSERT_THAT(pread(efd.get(), &l, 4, 0), SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST(EventfdTest, PwriteIllegalSeek) {
+  FileDescriptor efd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+
+  uint64_t l = 0;
+  ASSERT_THAT(pwrite(efd.get(), &l, 4, 0), SyscallFailsWithErrno(ESPIPE));
+}
+
 TEST(EventfdTest, BigWrite) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index b5e0a512b..e09afafe9 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -673,6 +673,33 @@ TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
   EXPECT_EQ(execve_errno, ELOOP);
 }
 
+TEST(ExecveatTest, UnshareFiles) {
+  TempPath tempFile = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "bar", 0755));
+  const FileDescriptor fd_closed_on_exec =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
+
+  pid_t child;
+  EXPECT_THAT(child = syscall(__NR_clone, SIGCHLD | CLONE_VFORK | CLONE_FILES,
+                              0, 0, 0, 0),
+              SyscallSucceeds());
+  if (child == 0) {
+    ExecveArray argv = {"test"};
+    ExecveArray envp;
+    ASSERT_THAT(
+        execve(RunfilePath(kBasicWorkload).c_str(), argv.get(), envp.get()),
+        SyscallSucceeds());
+    _exit(1);
+  }
+
+  int status;
+  ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+  EXPECT_EQ(status, 0);
+
+  struct stat st;
+  EXPECT_THAT(fstat(fd_closed_on_exec.get(), &st), SyscallSucceeds());
+}
+
 TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
   std::string parent_dir = "/tmp";
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
@@ -812,26 +839,28 @@ void ExecFromThread() {
 bool ValidateProcCmdlineVsArgv(const int argc, const char* const* argv) {
   auto contents_or = GetContents("/proc/self/cmdline");
   if (!contents_or.ok()) {
-    std::cerr << "Unable to get /proc/self/cmdline: " << contents_or.error();
+    std::cerr << "Unable to get /proc/self/cmdline: " << contents_or.error()
+              << std::endl;
     return false;
   }
   auto contents = contents_or.ValueOrDie();
   if (contents.back() != '\0') {
-    std::cerr << "Non-null terminated /proc/self/cmdline!";
+    std::cerr << "Non-null terminated /proc/self/cmdline!" << std::endl;
     return false;
   }
   contents.pop_back();
   std::vector<std::string> procfs_cmdline = absl::StrSplit(contents, '\0');
 
   if (static_cast<int>(procfs_cmdline.size()) != argc) {
-    std::cerr << "argc = " << argc << " != " << procfs_cmdline.size();
+    std::cerr << "argc = " << argc << " != " << procfs_cmdline.size()
+              << std::endl;
     return false;
   }
 
   for (int i = 0; i < argc; ++i) {
     if (procfs_cmdline[i] != argv[i]) {
       std::cerr << "Procfs command line argument " << i << " mismatch "
-                << procfs_cmdline[i] << " != " << argv[i];
+                << procfs_cmdline[i] << " != " << argv[i] << std::endl;
       return false;
     }
   }
@@ -868,6 +897,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 736452b0c..18d2f22c1 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -48,10 +48,17 @@ namespace {
 using ::testing::AnyOf;
 using ::testing::Eq;
 
-#ifndef __x86_64__
+#if !defined(__x86_64__) && !defined(__aarch64__)
 // The assembly stub and ELF internal details must be ported to other arches.
-#error "Test only supported on x86-64"
-#endif  // __x86_64__
+#error "Test only supported on x86-64/arm64"
+#endif  // __x86_64__ || __aarch64__
+
+#if defined(__x86_64__)
+#define EM_TYPE EM_X86_64
+#define IP_REG(p) ((p).rip)
+#define RAX_REG(p) ((p).rax)
+#define RDI_REG(p) ((p).rdi)
+#define RETURN_REG(p) ((p).rax)
 
 // amd64 stub that calls PTRACE_TRACEME and sends itself SIGSTOP.
 const char kPtraceCode[] = {
@@ -139,6 +146,76 @@ const char kPtraceCode[] = {
 // Size of a syscall instruction.
 constexpr int kSyscallSize = 2;
 
+#elif defined(__aarch64__)
+#define EM_TYPE EM_AARCH64
+#define IP_REG(p) ((p).pc)
+#define RAX_REG(p) ((p).regs[8])
+#define RDI_REG(p) ((p).regs[0])
+#define RETURN_REG(p) ((p).regs[0])
+
+const char kPtraceCode[] = {
+    // MOVD $117, R8 /* ptrace */
+    '\xa8',
+    '\x0e',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R0 /* PTRACE_TRACEME */
+    '\x00',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R1 /* pid */
+    '\x01',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R2 /* addr */
+    '\x02',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R3 /* data */
+    '\x03',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+    // MOVD $172, R8 /* getpid */
+    '\x88',
+    '\x15',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+    // MOVD $129, R8 /* kill, R0=pid */
+    '\x28',
+    '\x10',
+    '\x80',
+    '\xd2',
+    // MOVD $19, R1  /* SIGSTOP */
+    '\x61',
+    '\x02',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+};
+// Size of a syscall instruction.
+constexpr int kSyscallSize = 4;
+#else
+#error "Unknown architecture"
+#endif
+
 // This test suite tests executable loading in the kernel (ELF and interpreter
 // scripts).
 
@@ -281,7 +358,7 @@ ElfBinary<64> StandardElf() {
   elf.header.e_ident[EI_DATA] = ELFDATA2LSB;
   elf.header.e_ident[EI_VERSION] = EV_CURRENT;
   elf.header.e_type = ET_EXEC;
-  elf.header.e_machine = EM_X86_64;
+  elf.header.e_machine = EM_TYPE;
   elf.header.e_version = EV_CURRENT;
   elf.header.e_phoff = sizeof(elf.header);
   elf.header.e_phentsize = sizeof(decltype(elf)::ElfPhdr);
@@ -327,9 +404,15 @@ TEST(ElfTest, Execute) {
   ASSERT_NO_ERRNO(WaitStopped(child));
 
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
-  // RIP is just beyond the final syscall instruction.
-  EXPECT_EQ(regs.rip, elf.header.e_entry + sizeof(kPtraceCode));
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
+  // RIP/PC is just beyond the final syscall instruction.
+  EXPECT_EQ(IP_REG(regs), elf.header.e_entry + sizeof(kPtraceCode));
 
   EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
                          {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
@@ -355,7 +438,12 @@ TEST(ElfTest, MissingText) {
   ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
               SyscallSucceedsWithValue(child));
   // It runs off the end of the zeroes filling the end of the page.
+#if defined(__x86_64__)
   EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) << status;
+#elif defined(__aarch64__)
+  // 0 is an invalid instruction opcode on arm64.
+  EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGILL) << status;
+#endif
 }
 
 // Typical ELF with a data + bss segment
@@ -718,9 +806,16 @@ TEST(ElfTest, PIE) {
 
   // RIP tells us which page the first segment was loaded into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
                          // text page.
@@ -787,9 +882,15 @@ TEST(ElfTest, PIENonZeroStart) {
 
   // RIP tells us which page the first segment was loaded into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   // The ELF is loaded at an arbitrary address, not the first PT_LOAD vaddr.
   //
@@ -910,9 +1011,15 @@ TEST(ElfTest, ELFInterpreter) {
   // RIP tells us which page the first segment of the interpreter was loaded
   // into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1084,9 +1191,15 @@ TEST(ElfTest, ELFInterpreterRelative) {
   // RIP tells us which page the first segment of the interpreter was loaded
   // into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1480,14 +1593,21 @@ TEST(ExecveTest, BrkAfterBinary) {
   ASSERT_NO_ERRNO(WaitStopped(child));
 
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
   // RIP is just beyond the final syscall instruction. Rewind to execute a brk
   // syscall.
-  regs.rip -= kSyscallSize;
-  regs.rax = __NR_brk;
-  regs.rdi = 0;
-  ASSERT_THAT(ptrace(PTRACE_SETREGS, child, 0, &regs), SyscallSucceeds());
+  IP_REG(regs) -= kSyscallSize;
+  RAX_REG(regs) = __NR_brk;
+  RDI_REG(regs) = 0;
+  ASSERT_THAT(ptrace(PTRACE_SETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
 
   // Resume the child, waiting for syscall entry.
   ASSERT_THAT(ptrace(PTRACE_SYSCALL, child, 0, 0), SyscallSucceeds());
@@ -1504,7 +1624,12 @@ TEST(ExecveTest, BrkAfterBinary) {
   ASSERT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
       << "status = " << status;
 
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
   // brk is after the text page.
   //
@@ -1512,7 +1637,7 @@ TEST(ExecveTest, BrkAfterBinary) {
   // address will be, but it is always beyond the final page in the binary.
   // i.e., it does not start immediately after memsz in the middle of a page.
   // Userspace may expect to use that space.
-  EXPECT_GE(regs.rax, 0x41000);
+  EXPECT_GE(RETURN_REG(regs), 0x41000);
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc
index 1c3d00287..7819f4ac3 100644
--- a/test/syscalls/linux/fallocate.cc
+++ b/test/syscalls/linux/fallocate.cc
@@ -33,7 +33,7 @@ namespace testing {
 namespace {
 
 int fallocate(int fd, int mode, off_t offset, off_t len) {
-  return syscall(__NR_fallocate, fd, mode, offset, len);
+  return RetryEINTR(syscall)(__NR_fallocate, fd, mode, offset, len);
 }
 
 class AllocateTest : public FileTest {
@@ -47,27 +47,27 @@ TEST_F(AllocateTest, Fallocate) {
   EXPECT_EQ(buf.st_size, 0);
 
   // Grow to ten bytes.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 10);
 
   // Allocate to a smaller size should be noop.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 10);
 
   // Grow again.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 20);
 
   // Grow with offset.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 30);
 
   // Grow with offset beyond EOF.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 40);
 }
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 421c15b87..c7cc5816e 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -1128,5 +1128,5 @@ int main(int argc, char** argv) {
     exit(err);
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 6f80bc97c..fb418e052 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -52,17 +52,6 @@ class FileTest : public ::testing::Test {
     test_file_fd_ = ASSERT_NO_ERRNO_AND_VALUE(
         Open(test_file_name_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR));
 
-    // FIXME(edahlgren): enable when mknod syscall is supported.
-    // test_fifo_name_ = NewTempAbsPath();
-    // ASSERT_THAT(mknod(test_fifo_name_.c_str()), S_IFIFO|0644, 0,
-    //             SyscallSucceeds());
-    // ASSERT_THAT(test_fifo_[1] = open(test_fifo_name_.c_str(),
-    //                                             O_WRONLY),
-    //             SyscallSucceeds());
-    // ASSERT_THAT(test_fifo_[0] = open(test_fifo_name_.c_str(),
-    //                                             O_RDONLY),
-    //             SyscallSucceeds());
-
     ASSERT_THAT(pipe(test_pipe_), SyscallSucceeds());
     ASSERT_THAT(fcntl(test_pipe_[0], F_SETFL, O_NONBLOCK), SyscallSucceeds());
   }
@@ -96,18 +85,12 @@ class FileTest : public ::testing::Test {
     CloseFile();
     UnlinkFile();
     ClosePipes();
-
-    // FIXME(edahlgren): enable when mknod syscall is supported.
-    // close(test_fifo_[0]);
-    // close(test_fifo_[1]);
-    // unlink(test_fifo_name_.c_str());
   }
 
+ protected:
   std::string test_file_name_;
-  std::string test_fifo_name_;
   FileDescriptor test_file_fd_;
 
-  int test_fifo_[2];
   int test_pipe_[2];
 };
 
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index ff8bdfeb0..853f6231a 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -431,7 +431,6 @@ TEST(CloneTest, NewUserNamespacePermitsAllOtherNamespaces) {
       << "status = " << status;
 }
 
-#ifdef __x86_64__
 // Clone with CLONE_SETTLS and a non-canonical TLS address is rejected.
 TEST(CloneTest, NonCanonicalTLS) {
   constexpr uintptr_t kNonCanonical = 1ull << 48;
@@ -440,11 +439,25 @@ TEST(CloneTest, NonCanonicalTLS) {
   // on this.
   char stack;
 
+  // The raw system call interface on x86-64 is:
+  // long clone(unsigned long flags, void *stack,
+  //            int *parent_tid, int *child_tid,
+  //            unsigned long tls);
+  //
+  // While on arm64, the order of the last two arguments is reversed:
+  // long clone(unsigned long flags, void *stack,
+  //            int *parent_tid, unsigned long tls,
+  //            int *child_tid);
+#if defined(__x86_64__)
   EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr,
                       nullptr, kNonCanonical),
               SyscallFailsWithErrno(EPERM));
-}
+#elif defined(__aarch64__)
+  EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr,
+                      kNonCanonical, nullptr),
+              SyscallFailsWithErrno(EPERM));
 #endif
+}
 
 }  // namespace
 }  // namespace testing
diff --git a/test/syscalls/linux/fpsig_fork.cc b/test/syscalls/linux/fpsig_fork.cc
index a346f1f00..d08111cd3 100644
--- a/test/syscalls/linux/fpsig_fork.cc
+++ b/test/syscalls/linux/fpsig_fork.cc
@@ -27,9 +27,22 @@ namespace testing {
 
 namespace {
 
+#ifdef __x86_64__
 #define GET_XMM(__var, __xmm) \
   asm volatile("movq %%" #__xmm ", %0" : "=r"(__var))
 #define SET_XMM(__var, __xmm) asm volatile("movq %0, %%" #__xmm : : "r"(__var))
+#define GET_FP0(__var) GET_XMM(__var, xmm0)
+#define SET_FP0(__var) SET_XMM(__var, xmm0)
+#elif __aarch64__
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+#define GET_FPREG(var, regname) \
+  asm volatile("str "__stringify(regname) ", %0" : "=m"(var))
+#define SET_FPREG(var, regname) \
+  asm volatile("ldr "__stringify(regname) ", %0" : "=m"(var))
+#define GET_FP0(var) GET_FPREG(var, d0)
+#define SET_FP0(var) GET_FPREG(var, d0)
+#endif
 
 int parent, child;
 
@@ -40,7 +53,10 @@ void sigusr1(int s, siginfo_t* siginfo, void* _uc) {
   TEST_CHECK_MSG(child >= 0, "fork failed");
 
   uint64_t val = SIGUSR1;
-  SET_XMM(val, xmm0);
+  SET_FP0(val);
+  uint64_t got;
+  GET_FP0(got);
+  TEST_CHECK_MSG(val == got, "Basic FP check failed in sigusr1()");
 }
 
 TEST(FPSigTest, Fork) {
@@ -67,8 +83,9 @@ TEST(FPSigTest, Fork) {
   // be the one clobbered.
 
   uint64_t expected = 0xdeadbeeffacefeed;
-  SET_XMM(expected, xmm0);
+  SET_FP0(expected);
 
+#ifdef __x86_64__
   asm volatile(
       "movl %[killnr], %%eax;"
       "movl %[parent], %%edi;"
@@ -81,9 +98,18 @@ TEST(FPSigTest, Fork) {
       : "rax", "rdi", "rsi", "rdx",
         // Clobbered by syscall.
         "rcx", "r11");
+#elif __aarch64__
+  asm volatile(
+      "mov x8, %0\n"
+      "mov x0, %1\n"
+      "mov x1, %2\n"
+      "mov x2, %3\n"
+      "svc #0\n" ::"r"(__NR_tgkill),
+      "r"(parent), "r"(parent_tid), "r"(SIGUSR1));
+#endif
 
   uint64_t got;
-  GET_XMM(got, xmm0);
+  GET_FP0(got);
 
   if (getpid() == parent) {  // Parent.
     int status;
diff --git a/test/syscalls/linux/getrandom.cc b/test/syscalls/linux/getrandom.cc
index f97f60029..f87cdd7a1 100644
--- a/test/syscalls/linux/getrandom.cc
+++ b/test/syscalls/linux/getrandom.cc
@@ -29,6 +29,8 @@ namespace {
 #define SYS_getrandom 318
 #elif defined(__i386__)
 #define SYS_getrandom 355
+#elif defined(__aarch64__)
+#define SYS_getrandom 278
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 0e13ad190..1d1a7171d 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -19,6 +19,7 @@
 #include <sys/inotify.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <sys/xattr.h>
 
 #include <atomic>
 #include <list>
@@ -33,6 +34,7 @@
 #include "test/util/epoll_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -335,6 +337,11 @@ TEST(Inotify, InotifyFdNotWritable) {
   EXPECT_THAT(write(fd.get(), "x", 1), SyscallFailsWithErrno(EBADF));
 }
 
+TEST(Inotify, InitFlags) {
+  EXPECT_THAT(inotify_init1(IN_NONBLOCK | IN_CLOEXEC), SyscallSucceeds());
+  EXPECT_THAT(inotify_init1(12345), SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(Inotify, NonBlockingReadReturnsEagain) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
@@ -395,7 +402,7 @@ TEST(Inotify, CanDeleteFileAfterRemovingWatch) {
   file1.reset();
 }
 
-TEST(Inotify, CanRemoveWatchAfterDeletingFile) {
+TEST(Inotify, RemoveWatchAfterDeletingFileFails) {
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath file1 =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
@@ -491,17 +498,23 @@ TEST(Inotify, DeletingChildGeneratesEvents) {
                     Event(IN_DELETE, root_wd, Basename(file1_path))}));
 }
 
+// Creating a file in "parent/child" should generate events for child, but not
+// parent.
 TEST(Inotify, CreatingFileGeneratesEvents) {
-  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath child =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
 
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS));
   const int wd = ASSERT_NO_ERRNO_AND_VALUE(
-      InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+      InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS));
 
   // Create a new file in the directory.
   const TempPath file1 =
-      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(child.path()));
   const std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
 
@@ -554,6 +567,47 @@ TEST(Inotify, WritingFileGeneratesModifyEvent) {
   ASSERT_THAT(events, Are({Event(IN_MODIFY, wd, Basename(file1.path()))}));
 }
 
+TEST(Inotify, SizeZeroReadWriteGeneratesNothing) {
+  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const TempPath file1 =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+  const FileDescriptor file1_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+  ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+  // Read from the empty file.
+  int val;
+  ASSERT_THAT(read(file1_fd.get(), &val, sizeof(val)),
+              SyscallSucceedsWithValue(0));
+
+  // Write zero bytes.
+  ASSERT_THAT(write(file1_fd.get(), "", 0), SyscallSucceedsWithValue(0));
+
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  ASSERT_THAT(events, Are({}));
+}
+
+TEST(Inotify, FailedFileCreationGeneratesNoEvents) {
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const std::string dir_path = dir.path();
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(fd.get(), dir_path, IN_ALL_EVENTS));
+
+  const char* p = dir_path.c_str();
+  ASSERT_THAT(mkdir(p, 0777), SyscallFails());
+  ASSERT_THAT(mknod(p, S_IFIFO, 0777), SyscallFails());
+  ASSERT_THAT(symlink(p, p), SyscallFails());
+  ASSERT_THAT(link(p, p), SyscallFails());
+  std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  ASSERT_THAT(events, Are({}));
+}
+
 TEST(Inotify, WatchSetAfterOpenReportsCloseFdEvent) {
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const FileDescriptor fd =
@@ -602,7 +656,7 @@ TEST(Inotify, ChildrenDeletionInWatchedDirGeneratesEvent) {
                    Event(IN_DELETE | IN_ISDIR, wd, Basename(dir1_path))}));
 }
 
-TEST(Inotify, WatchTargetDeletionGeneratesEvent) {
+TEST(Inotify, RmdirOnWatchedTargetGeneratesEvent) {
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
@@ -1228,7 +1282,7 @@ TEST(Inotify, LinkGeneratesAttribAndCreateEvents) {
       InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
 
   const int rc = link(file1.path().c_str(), link1.path().c_str());
-  // link(2) is only supported on tmpfs in the sandbox.
+  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
   SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
           (errno == EPERM || errno == ENOENT));
   ASSERT_THAT(rc, SyscallSucceeds());
@@ -1322,21 +1376,27 @@ TEST(Inotify, HardlinksReuseSameWatch) {
                     Event(IN_DELETE, root_wd, Basename(file1_path))}));
 }
 
+// Calling mkdir within "parent/child" should generate an event for child, but
+// not parent.
 TEST(Inotify, MkdirGeneratesCreateEventWithDirFlag) {
-  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath child =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
-  const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
-      InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+  ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS));
+  const int child_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS));
 
-  const TempPath dir1(NewTempAbsPathInDir(root.path()));
+  const TempPath dir1(NewTempAbsPathInDir(child.path()));
   ASSERT_THAT(mkdir(dir1.path().c_str(), 0777), SyscallSucceeds());
 
   const std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
   ASSERT_THAT(
       events,
-      Are({Event(IN_CREATE | IN_ISDIR, root_wd, Basename(dir1.path()))}));
+      Are({Event(IN_CREATE | IN_ISDIR, child_wd, Basename(dir1.path()))}));
 }
 
 TEST(Inotify, MultipleInotifyInstancesAndWatchesAllGetEvents) {
@@ -1596,7 +1656,44 @@ TEST(Inotify, EpollNoDeadlock) {
   }
 }
 
-TEST(Inotify, SpliceEvent) {
+// On Linux, inotify behavior is not very consistent with splice(2). We try our
+// best to emulate Linux for very basic calls to splice.
+TEST(Inotify, SpliceOnWatchTarget) {
+  int pipes[2];
+  ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      dir.path(), "some content", TempPath::kDefaultFileMode));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+  const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS));
+  const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS));
+
+  EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr, 1, /*flags=*/0),
+              SyscallSucceedsWithValue(1));
+
+  // Surprisingly, events are not generated in Linux if we read from a file.
+  std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  ASSERT_THAT(events, Are({}));
+
+  EXPECT_THAT(splice(pipes[0], nullptr, fd.get(), nullptr, 1, /*flags=*/0),
+              SyscallSucceedsWithValue(1));
+
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  ASSERT_THAT(events, Are({
+                          Event(IN_MODIFY, dir_wd, Basename(file.path())),
+                          Event(IN_MODIFY, file_wd),
+                      }));
+}
+
+TEST(Inotify, SpliceOnInotifyFD) {
   int pipes[2];
   ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
 
@@ -1624,6 +1721,315 @@ TEST(Inotify, SpliceEvent) {
   ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)}));
 }
 
+// Watches on a parent should not be triggered by actions on a hard link to one
+// of its children that has a different parent.
+TEST(Inotify, LinkOnOtherParent) {
+  const TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+  std::string link_path = NewTempAbsPathInDir(dir2.path());
+
+  const int rc = link(file.path().c_str(), link_path.c_str());
+  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+  SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+          (errno == EPERM || errno == ENOENT));
+  ASSERT_THAT(rc, SyscallSucceeds());
+
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), dir1.path(), IN_ALL_EVENTS));
+
+  // Perform various actions on the link outside of dir1, which should trigger
+  // no inotify events.
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(link_path.c_str(), O_RDWR));
+  int val = 0;
+  ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+  ASSERT_THAT(unlink(link_path.c_str()), SyscallSucceeds());
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+}
+
+TEST(Inotify, Xattr) {
+  // TODO(gvisor.dev/issue/1636): Support extended attributes in runsc gofer.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const std::string path = file.path();
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_RDWR));
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), path, IN_ALL_EVENTS));
+
+  const char* cpath = path.c_str();
+  const char* name = "user.test";
+  int val = 123;
+  ASSERT_THAT(setxattr(cpath, name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+  std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+  ASSERT_THAT(getxattr(cpath, name, &val, sizeof(val)), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+
+  char list[100];
+  ASSERT_THAT(listxattr(cpath, list, sizeof(list)), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+
+  ASSERT_THAT(removexattr(cpath, name), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+  ASSERT_THAT(fsetxattr(fd.get(), name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+  ASSERT_THAT(fgetxattr(fd.get(), name, &val, sizeof(val)), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+
+  ASSERT_THAT(flistxattr(fd.get(), list, sizeof(list)), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+
+  ASSERT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+}
+
+TEST(Inotify, Exec) {
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath bin = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(dir.path(), "/bin/true"));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), bin.path(), IN_ALL_EVENTS));
+
+  // Perform exec.
+  ScopedThread t([&bin]() {
+    ASSERT_THAT(execl(bin.path().c_str(), bin.path().c_str(), (char*)nullptr),
+                SyscallSucceeds());
+  });
+  t.Join();
+
+  std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd)}));
+}
+
+// Watches without IN_EXCL_UNLINK, should continue to emit events for file
+// descriptors after their corresponding files have been unlinked.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) {
+  const DisableSave ds;
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateFileWith(dir.path(), "123", TempPath::kDefaultFileMode));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS));
+  const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS));
+
+  ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+  int val = 0;
+  ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({
+                          Event(IN_ATTRIB, file_wd),
+                          Event(IN_DELETE, dir_wd, Basename(file.path())),
+                          Event(IN_ACCESS, dir_wd, Basename(file.path())),
+                          Event(IN_ACCESS, file_wd),
+                          Event(IN_MODIFY, dir_wd, Basename(file.path())),
+                          Event(IN_MODIFY, file_wd),
+                      }));
+}
+
+// Watches created with IN_EXCL_UNLINK will stop emitting events on fds for
+// children that have already been unlinked.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlink_NoRandomSave) {
+  const DisableSave ds;
+  // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+      inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+  // Unlink the child, which should cause further operations on the open file
+  // descriptor to be ignored.
+  ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+  int val = 0;
+  ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_DELETE, wd, Basename(file.path()))}));
+}
+
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) {
+  const DisableSave ds;
+
+  const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  TempPath dir =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
+  std::string dirPath = dir.path();
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dirPath.c_str(), O_RDONLY | O_DIRECTORY));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+      inotify_fd.get(), parent.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+  // Unlink the dir, and then close the open fd.
+  ASSERT_THAT(rmdir(dirPath.c_str()), SyscallSucceeds());
+  dir.reset();
+
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  // No close event should appear.
+  ASSERT_THAT(events,
+              Are({Event(IN_DELETE | IN_ISDIR, wd, Basename(dirPath))}));
+}
+
+// If "dir/child" and "dir/child2" are links to the same file, and "dir/child"
+// is unlinked, a watch on "dir" with IN_EXCL_UNLINK will exclude future events
+// for fds on "dir/child" but not "dir/child2".
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkMultipleChildren_NoRandomSave) {
+  const DisableSave ds;
+  // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+  std::string path1 = file.path();
+  std::string path2 = NewTempAbsPathInDir(dir.path());
+
+  const int rc = link(path1.c_str(), path2.c_str());
+  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+  SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+          (errno == EPERM || errno == ENOENT));
+  ASSERT_THAT(rc, SyscallSucceeds());
+  const FileDescriptor fd1 =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(path1.c_str(), O_RDWR));
+  const FileDescriptor fd2 =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(path2.c_str(), O_RDWR));
+
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+      inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+  // After unlinking path1, only events on the fd for path2 should be generated.
+  ASSERT_THAT(unlink(path1.c_str()), SyscallSucceeds());
+  ASSERT_THAT(write(fd1.get(), "x", 1), SyscallSucceeds());
+  ASSERT_THAT(write(fd2.get(), "x", 1), SyscallSucceeds());
+
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({
+                          Event(IN_DELETE, wd, Basename(path1)),
+                          Event(IN_MODIFY, wd, Basename(path2)),
+                      }));
+}
+
+// On native Linux, actions of data type FSNOTIFY_EVENT_INODE are not affected
+// by IN_EXCL_UNLINK (see
+// fs/notify/inotify/inotify_fsnotify.c:inotify_handle_event). Inode-level
+// events include changes to metadata and extended attributes.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) {
+  const DisableSave ds;
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR));
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+      inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+  // NOTE(b/157163751): Create another link before unlinking. This is needed for
+  // the gofer filesystem in gVisor, where open fds will not work once the link
+  // count hits zero. In VFS2, we end up skipping the gofer test anyway, because
+  // hard links are not supported for gofer fs.
+  if (IsRunningOnGvisor()) {
+    std::string link_path = NewTempAbsPath();
+    const int rc = link(file.path().c_str(), link_path.c_str());
+    // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+    SKIP_IF(rc != 0 && (errno == EPERM || errno == ENOENT));
+    ASSERT_THAT(rc, SyscallSucceeds());
+  }
+
+  // Even after unlinking, inode-level operations will trigger events regardless
+  // of IN_EXCL_UNLINK.
+  ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+
+  // Perform various actions on fd.
+  ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+  std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({
+                          Event(IN_DELETE, wd, Basename(file.path())),
+                          Event(IN_MODIFY, wd, Basename(file.path())),
+                      }));
+
+  struct timeval times[2] = {{1, 0}, {2, 0}};
+  ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))}));
+
+  // S/R is disabled on this entire test due to behavior with unlink; it must
+  // also be disabled after this point because of fchmod.
+  ASSERT_THAT(fchmod(fd.get(), 0777), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))}));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index bba022a41..98d07ae85 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -16,7 +16,6 @@
 
 #include <net/if.h>
 #include <netinet/in.h>
-#include <sys/ioctl.h>
 #include <sys/socket.h>
 
 #include <cstring>
@@ -35,12 +34,11 @@ uint16_t PortFromInetSockaddr(const struct sockaddr* addr) {
 }
 
 PosixErrorOr<int> InterfaceIndex(std::string name) {
-  // TODO(igudger): Consider using netlink.
-  ifreq req = {};
-  memcpy(req.ifr_name, name.c_str(), name.size());
-  ASSIGN_OR_RETURN_ERRNO(auto sock, Socket(AF_INET, SOCK_DGRAM, 0));
-  RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(sock.get(), SIOCGIFINDEX, &req));
-  return req.ifr_ifindex;
+  int index = if_nametoindex(name.c_str());
+  if (index) {
+    return index;
+  }
+  return PosixError(errno);
 }
 
 namespace {
@@ -177,17 +175,17 @@ SocketKind IPv6TCPUnboundSocket(int type) {
 PosixError IfAddrHelper::Load() {
   Release();
   RETURN_ERROR_IF_SYSCALL_FAIL(getifaddrs(&ifaddr_));
-  return PosixError(0);
+  return NoError();
 }
 
 void IfAddrHelper::Release() {
   if (ifaddr_) {
     freeifaddrs(ifaddr_);
+    ifaddr_ = nullptr;
   }
-  ifaddr_ = nullptr;
 }
 
-std::vector<std::string> IfAddrHelper::InterfaceList(int family) {
+std::vector<std::string> IfAddrHelper::InterfaceList(int family) const {
   std::vector<std::string> names;
   for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) {
     if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) {
@@ -198,7 +196,7 @@ std::vector<std::string> IfAddrHelper::InterfaceList(int family) {
   return names;
 }
 
-sockaddr* IfAddrHelper::GetAddr(int family, std::string name) {
+const sockaddr* IfAddrHelper::GetAddr(int family, std::string name) const {
   for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) {
     if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) {
       continue;
@@ -210,7 +208,7 @@ sockaddr* IfAddrHelper::GetAddr(int family, std::string name) {
   return nullptr;
 }
 
-PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) {
+PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) const {
   return InterfaceIndex(name);
 }
 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 083ebbcf0..9c3859fcd 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -84,20 +84,20 @@ SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type);
 // SocketPairs created with AF_INET and the given type.
 SocketPairKind IPv4UDPUnboundSocketPair(int type);
 
-// IPv4UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_DGRAM, and the given type.
+// IPv4UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_DGRAM, and the given type.
 SocketKind IPv4UDPUnboundSocket(int type);
 
-// IPv6UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_DGRAM, and the given type.
+// IPv6UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_DGRAM, and the given type.
 SocketKind IPv6UDPUnboundSocket(int type);
 
-// IPv4TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_STREAM and the given type.
+// IPv4TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_STREAM and the given type.
 SocketKind IPv4TCPUnboundSocket(int type);
 
-// IPv6TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_STREAM and the given type.
+// IPv6TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_STREAM and the given type.
 SocketKind IPv6TCPUnboundSocket(int type);
 
 // IfAddrHelper is a helper class that determines the local interfaces present
@@ -110,10 +110,10 @@ class IfAddrHelper {
   PosixError Load();
   void Release();
 
-  std::vector<std::string> InterfaceList(int family);
+  std::vector<std::string> InterfaceList(int family) const;
 
-  struct sockaddr* GetAddr(int family, std::string name);
-  PosixErrorOr<int> GetIndex(std::string name);
+  const sockaddr* GetAddr(int family, std::string name) const;
+  PosixErrorOr<int> GetIndex(std::string name) const;
 
  private:
   struct ifaddrs* ifaddr_;
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index b77e4cbd1..e397d5f57 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -246,7 +246,7 @@ int TestSIGPROFFairness(absl::Duration sleep) {
 
   // The number of samples on the main thread should be very low as it did
   // nothing.
-  TEST_CHECK(result.main_thread_samples < 60);
+  TEST_CHECK(result.main_thread_samples < 80);
 
   // Both workers should get roughly equal number of samples.
   TEST_CHECK(result.worker_samples.size() == 2);
@@ -267,8 +267,19 @@ int TestSIGPROFFairness(absl::Duration sleep) {
 // Random save/restore is disabled as it introduces additional latency and
 // unpredictable distribution patterns.
 TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
-  // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform.
-  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+  // On the KVM and ptrace platforms, switches between sentry and application
+  // context are sometimes extremely slow, causing the itimer to send SIGPROF to
+  // a thread that either already has one pending or has had SIGPROF delivered,
+  // but hasn't handled it yet (and thus therefore still has SIGPROF masked). In
+  // either case, since itimer signals are group-directed, signal sending falls
+  // back to notifying the thread group leader. ItimerSignalTest() fails if "too
+  // many" signals are delivered to the thread group leader, so these tests are
+  // flaky on these platforms.
+  //
+  // TODO(b/143247272): Clarify why context switches are so slow on KVM.
+  const auto gvisor_platform = GvisorPlatform();
+  SKIP_IF(gvisor_platform == Platform::kKVM ||
+          gvisor_platform == Platform::kPtrace);
 
   pid_t child;
   int execve_errno;
@@ -291,8 +302,10 @@ TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
 // Random save/restore is disabled as it introduces additional latency and
 // unpredictable distribution patterns.
 TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyIdle_NoRandomSave) {
-  // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform.
-  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+  // See comment in DeliversSIGPROFToThreadsRoughlyFairlyActive.
+  const auto gvisor_platform = GvisorPlatform();
+  SKIP_IF(gvisor_platform == Platform::kKVM ||
+          gvisor_platform == Platform::kPtrace);
 
   pid_t child;
   int execve_errno;
@@ -349,6 +362,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc
index a8af8e545..6ce1e6cc3 100644
--- a/test/syscalls/linux/lseek.cc
+++ b/test/syscalls/linux/lseek.cc
@@ -53,7 +53,7 @@ TEST(LseekTest, NegativeOffset) {
 // A 32-bit off_t is not large enough to represent an offset larger than
 // maximum file size on standard file systems, so it isn't possible to cause
 // overflow.
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__aarch64__)
 TEST(LseekTest, Overflow) {
   // HA! Classic Linux. We really should have an EOVERFLOW
   // here, since we're seeking to something that cannot be
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
index e57b49a4a..f8b7f7938 100644
--- a/test/syscalls/linux/memfd.cc
+++ b/test/syscalls/linux/memfd.cc
@@ -16,6 +16,7 @@
 #include <fcntl.h>
 #include <linux/magic.h>
 #include <linux/memfd.h>
+#include <linux/unistd.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/statfs.h>
diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
index cf138d328..4036a9275 100644
--- a/test/syscalls/linux/mkdir.cc
+++ b/test/syscalls/linux/mkdir.cc
@@ -18,10 +18,10 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/temp_umask.h"
 #include "test/util/capability_util.h"
 #include "test/util/fs_util.h"
 #include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
@@ -36,21 +36,12 @@ class MkdirTest : public ::testing::Test {
 
   // TearDown unlinks created files.
   void TearDown() override {
-    // FIXME(edahlgren): We don't currently implement rmdir.
-    // We do this unconditionally because there's no harm in trying.
-    rmdir(dirname_.c_str());
+    EXPECT_THAT(rmdir(dirname_.c_str()), SyscallSucceeds());
   }
 
   std::string dirname_;
 };
 
-TEST_F(MkdirTest, DISABLED_CanCreateReadbleDir) {
-  ASSERT_THAT(mkdir(dirname_.c_str(), 0444), SyscallSucceeds());
-  ASSERT_THAT(
-      open(JoinPath(dirname_, "anything").c_str(), O_RDWR | O_CREAT, 0666),
-      SyscallFailsWithErrno(EACCES));
-}
-
 TEST_F(MkdirTest, CanCreateWritableDir) {
   ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds());
   std::string filename = JoinPath(dirname_, "anything");
@@ -84,10 +75,11 @@ TEST_F(MkdirTest, FailsOnDirWithoutWritePerms) {
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
-  auto parent = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0555));
-  auto dir = JoinPath(parent.path(), "foo");
-  ASSERT_THAT(mkdir(dir.c_str(), 0777), SyscallFailsWithErrno(EACCES));
+  ASSERT_THAT(mkdir(dirname_.c_str(), 0555), SyscallSucceeds());
+  auto dir = JoinPath(dirname_.c_str(), "foo");
+  EXPECT_THAT(mkdir(dir.c_str(), 0777), SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(open(JoinPath(dirname_, "file").c_str(), O_RDWR | O_CREAT, 0666),
+              SyscallFailsWithErrno(EACCES));
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc
index 367a90fe1..78ac96bed 100644
--- a/test/syscalls/linux/mlock.cc
+++ b/test/syscalls/linux/mlock.cc
@@ -199,8 +199,10 @@ TEST(MunlockallTest, Basic) {
 }
 
 #ifndef SYS_mlock2
-#ifdef __x86_64__
+#if defined(__x86_64__)
 #define SYS_mlock2 325
+#elif defined(__aarch64__)
+#define SYS_mlock2 284
 #endif
 #endif
 
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 11fb1b457..6d3227ab6 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -361,7 +361,7 @@ TEST_F(MMapTest, MapFixed) {
 }
 
 // 64-bit addresses work too
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__aarch64__)
 TEST_F(MMapTest, MapFixed64) {
   EXPECT_THAT(Map(0x300000000000, kPageSize, PROT_NONE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0),
@@ -571,6 +571,12 @@ const uint8_t machine_code[] = {
     0xb8, 0x2a, 0x00, 0x00, 0x00,  // movl $42, %eax
     0xc3,                          // retq
 };
+#elif defined(__aarch64__)
+const uint8_t machine_code[] = {
+    0x40, 0x05, 0x80, 0x52,  // mov w0, #42
+    0xc0, 0x03, 0x5f, 0xd6,  // ret
+};
+#endif
 
 // PROT_EXEC allows code execution
 TEST_F(MMapTest, ProtExec) {
@@ -605,7 +611,6 @@ TEST_F(MMapTest, NoProtExecDeath) {
 
   EXPECT_EXIT(func(), ::testing::KilledBySignal(SIGSEGV), "");
 }
-#endif
 
 TEST_F(MMapTest, NoExceedLimitData) {
   void* prevbrk;
@@ -1644,6 +1649,7 @@ TEST(MMapNoFixtureTest, MapReadOnlyAfterCreateWriteOnly) {
 }
 
 // Conditional on MAP_32BIT.
+// This flag is supported only on x86-64, for 64-bit programs.
 #ifdef __x86_64__
 
 TEST(MMapNoFixtureTest, Map32Bit) {
diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc
new file mode 100644
index 000000000..133fdecf0
--- /dev/null
+++ b/test/syscalls/linux/network_namespace.cc
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <net/if.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+TEST(NetworkNamespaceTest, LoopbackExists) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  ScopedThread t([&] {
+    ASSERT_THAT(unshare(CLONE_NEWNET), SyscallSucceedsWithValue(0));
+
+    // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
+    // Check loopback device exists.
+    int sock = socket(AF_INET, SOCK_DGRAM, 0);
+    ASSERT_THAT(sock, SyscallSucceeds());
+    struct ifreq ifr;
+    strncpy(ifr.ifr_name, "lo", IFNAMSIZ);
+    EXPECT_THAT(ioctl(sock, SIOCGIFINDEX, &ifr), SyscallSucceeds())
+        << "lo cannot be found";
+  });
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index 267ae19f6..670c0284b 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -186,6 +186,28 @@ TEST_F(OpenTest, OpenNoFollowStillFollowsLinksInPath) {
       ASSERT_NO_ERRNO_AND_VALUE(Open(path_via_symlink, O_RDONLY | O_NOFOLLOW));
 }
 
+// Test that open(2) can follow symlinks that point back to the same tree.
+// Test sets up files as follows:
+//   root/child/symlink => redirects to ../..
+//   root/child/target => regular file
+//
+// open("root/child/symlink/root/child/file")
+TEST_F(OpenTest, SymlinkRecurse) {
+  auto root =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(GetAbsoluteTestTmpdir()));
+  auto child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+  auto symlink = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(child.path(), "../.."));
+  auto target = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateFileWith(child.path(), "abc", 0644));
+  auto path_via_symlink =
+      JoinPath(symlink.path(), Basename(root.path()), Basename(child.path()),
+               Basename(target.path()));
+  const auto contents =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents(path_via_symlink));
+  ASSERT_EQ(contents, "abc");
+}
+
 TEST_F(OpenTest, Fault) {
   char* totally_not_null = nullptr;
   ASSERT_THAT(open(totally_not_null, O_RDONLY), SyscallFailsWithErrno(EFAULT));
@@ -394,6 +416,29 @@ TEST_F(OpenTest, CanTruncateWriteOnlyNoReadPermission_NoRandomSave) {
   EXPECT_EQ(stat.st_size, 0);
 }
 
+TEST_F(OpenTest, CanTruncateWithStrangePermissions) {
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+  const DisableSave ds;  // Permissions are dropped.
+  std::string path = NewTempAbsPath();
+  int fd;
+  // Create a file without user permissions.
+  EXPECT_THAT(  // SAVE_BELOW
+      fd = open(path.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 055),
+      SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+
+  // Cannot open file because we are owner and have no permissions set.
+  EXPECT_THAT(open(path.c_str(), O_RDONLY), SyscallFailsWithErrno(EACCES));
+
+  // We *can* chmod the file, because we are the owner.
+  EXPECT_THAT(chmod(path.c_str(), 0755), SyscallSucceeds());
+
+  // Now we can open the file again.
+  EXPECT_THAT(fd = open(path.c_str(), O_RDWR), SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 902d0a0dc..51eacf3f2 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -19,11 +19,11 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/temp_umask.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
 #include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index 92ae55eec..5ac68feb4 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <ifaddrs.h>
 #include <linux/capability.h>
 #include <linux/if_arp.h>
 #include <linux/if_packet.h>
@@ -163,16 +164,11 @@ int CookedPacketTest::GetLoopbackIndex() {
   return ifr.ifr_ifindex;
 }
 
-// Receive via a packet socket.
-TEST_P(CookedPacketTest, Receive) {
-  // Let's use a simple IP payload: a UDP datagram.
-  FileDescriptor udp_sock =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
-  SendUDPMessage(udp_sock.get());
-
+// Receive and verify the message via packet socket on interface.
+void ReceiveMessage(int sock, int ifindex) {
   // Wait for the socket to become readable.
   struct pollfd pfd = {};
-  pfd.fd = socket_;
+  pfd.fd = sock;
   pfd.events = POLLIN;
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
 
@@ -182,9 +178,10 @@ TEST_P(CookedPacketTest, Receive) {
   char buf[64];
   struct sockaddr_ll src = {};
   socklen_t src_len = sizeof(src);
-  ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
+  ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
+
   // sockaddr_ll ends with an 8 byte physical address field, but ethernet
   // addresses only use 6 bytes.  Linux used to return sizeof(sockaddr_ll)-2
   // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
@@ -194,7 +191,7 @@ TEST_P(CookedPacketTest, Receive) {
   // TODO(b/129292371): Verify protocol once we return it.
   // Verify the source address.
   EXPECT_EQ(src.sll_family, AF_PACKET);
-  EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
+  EXPECT_EQ(src.sll_ifindex, ifindex);
   EXPECT_EQ(src.sll_halen, ETH_ALEN);
   // This came from the loopback device, so the address is all 0s.
   for (int i = 0; i < src.sll_halen; i++) {
@@ -222,6 +219,18 @@ TEST_P(CookedPacketTest, Receive) {
   EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0);
 }
 
+// Receive via a packet socket.
+TEST_P(CookedPacketTest, Receive) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  int loopback_index = GetLoopbackIndex();
+  ReceiveMessage(socket_, loopback_index);
+}
+
 // Send via a packet socket.
 TEST_P(CookedPacketTest, Send) {
   // TODO(b/129292371): Remove once we support packet socket writing.
@@ -313,6 +322,115 @@ TEST_P(CookedPacketTest, Send) {
   EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
 }
 
+// Bind and receive via packet socket.
+TEST_P(CookedPacketTest, BindReceive) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  ReceiveMessage(socket_, bind_addr.sll_ifindex);
+}
+
+// Double Bind socket.
+TEST_P(CookedPacketTest, DoubleBind) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Binding socket again should fail.
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+           sizeof(bind_addr)),
+      // Linux 4.09 returns EINVAL here, but some time before 4.19 it switched
+      // to EADDRINUSE.
+      AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL)));
+}
+
+// Bind and verify we do not receive data on interface which is not bound
+TEST_P(CookedPacketTest, BindDrop) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct ifaddrs* if_addr_list = nullptr;
+  auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); });
+
+  ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds());
+
+  // Get interface other than loopback.
+  struct ifreq ifr = {};
+  for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) {
+    if (strcmp(i->ifa_name, "lo") != 0) {
+      strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name));
+      break;
+    }
+  }
+
+  // Skip if no interface is available other than loopback.
+  if (strlen(ifr.ifr_name) == 0) {
+    GTEST_SKIP();
+  }
+
+  // Get interface index.
+  EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  EXPECT_NE(ifr.ifr_ifindex, 0);
+
+  // Bind to packet socket requires only family, protocol and ifindex.
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = ifr.ifr_ifindex;
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Send to loopback interface.
+  struct sockaddr_in dest = {};
+  dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  dest.sin_family = AF_INET;
+  dest.sin_port = kPort;
+  EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0,
+                     reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+              SyscallSucceedsWithValue(sizeof(kMessage)));
+
+  // Wait and make sure the socket never receives any data.
+  struct pollfd pfd = {};
+  pfd.fd = socket_;
+  pfd.events = POLLIN;
+  EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
+// Bind with invalid address.
+TEST_P(CookedPacketTest, BindFail) {
+  // Null address.
+  ASSERT_THAT(
+      bind(socket_, nullptr, sizeof(struct sockaddr)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallFailsWithErrno(EINVAL)));
+
+  // Address of size 1.
+  uint8_t addr = 0;
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
diff --git a/test/syscalls/linux/ping_socket.cc b/test/syscalls/linux/ping_socket.cc
new file mode 100644
index 000000000..a9bfdb37b
--- /dev/null
+++ b/test/syscalls/linux/ping_socket.cc
@@ -0,0 +1,91 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/save_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+class PingSocket : public ::testing::Test {
+ protected:
+  // Creates a socket to be used in tests.
+  void SetUp() override;
+
+  // Closes the socket created by SetUp().
+  void TearDown() override;
+
+  // The loopback address.
+  struct sockaddr_in addr_;
+};
+
+void PingSocket::SetUp() {
+  // On some hosts ping sockets are restricted to specific groups using the
+  // sysctl "ping_group_range".
+  int s = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
+  if (s < 0 && errno == EPERM) {
+    GTEST_SKIP();
+  }
+  close(s);
+
+  addr_ = {};
+  // Just a random port as the destination port number is irrelevant for ping
+  // sockets.
+  addr_.sin_port = 12345;
+  addr_.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  addr_.sin_family = AF_INET;
+}
+
+void PingSocket::TearDown() {}
+
+// Test ICMP port exhaustion returns EAGAIN.
+//
+// We disable both random/cooperative S/R for this test as it makes way too many
+// syscalls.
+TEST_F(PingSocket, ICMPPortExhaustion_NoRandomSave) {
+  DisableSave ds;
+  std::vector<FileDescriptor> sockets;
+  constexpr int kSockets = 65536;
+  addr_.sin_port = 0;
+  for (int i = 0; i < kSockets; i++) {
+    auto s =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP));
+    int ret = connect(s.get(), reinterpret_cast<struct sockaddr*>(&addr_),
+                      sizeof(addr_));
+    if (ret == 0) {
+      sockets.push_back(std::move(s));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EAGAIN));
+    break;
+  }
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index d8e19e910..34291850d 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -265,6 +265,8 @@ TEST_P(PipeTest, OffsetCalls) {
               SyscallFailsWithErrno(ESPIPE));
 
   struct iovec iov;
+  iov.iov_base = &buf;
+  iov.iov_len = sizeof(buf);
   EXPECT_THAT(preadv(wfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE));
   EXPECT_THAT(pwritev(rfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE));
 }
@@ -629,11 +631,14 @@ INSTANTIATE_TEST_SUITE_P(
             "namednonblocking",
             [](int fds[2], bool* is_blocking, bool* is_namedpipe) {
               // Create a new file-based pipe (non-blocking).
-              auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-              ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
-              SKIP_IF(mkfifo(file.path().c_str(), 0644) != 0);
-              fds[0] = open(file.path().c_str(), O_NONBLOCK | O_RDONLY);
-              fds[1] = open(file.path().c_str(), O_NONBLOCK | O_WRONLY);
+              std::string path;
+              {
+                auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+                path = file.path();
+              }
+              SKIP_IF(mkfifo(path.c_str(), 0644) != 0);
+              fds[0] = open(path.c_str(), O_NONBLOCK | O_RDONLY);
+              fds[1] = open(path.c_str(), O_NONBLOCK | O_WRONLY);
               MaybeSave();
               *is_blocking = false;
               *is_namedpipe = true;
@@ -643,13 +648,15 @@ INSTANTIATE_TEST_SUITE_P(
             "namedblocking",
             [](int fds[2], bool* is_blocking, bool* is_namedpipe) {
               // Create a new file-based pipe (blocking).
-              auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-              ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
-              SKIP_IF(mkfifo(file.path().c_str(), 0644) != 0);
-              ScopedThread t([&file, &fds]() {
-                fds[1] = open(file.path().c_str(), O_WRONLY);
-              });
-              fds[0] = open(file.path().c_str(), O_RDONLY);
+              std::string path;
+              {
+                auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+                path = file.path();
+              }
+              SKIP_IF(mkfifo(path.c_str(), 0644) != 0);
+              ScopedThread t(
+                  [&path, &fds]() { fds[1] = open(path.c_str(), O_WRONLY); });
+              fds[0] = open(path.c_str(), O_RDONLY);
               t.Join();
               MaybeSave();
               *is_blocking = true;
diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc
index c42472474..7a316427d 100644
--- a/test/syscalls/linux/poll.cc
+++ b/test/syscalls/linux/poll.cc
@@ -259,14 +259,14 @@ TEST_F(PollTest, Nfds) {
   TEST_PCHECK(getrlimit(RLIMIT_NOFILE, &rlim) == 0);
 
   // gVisor caps the number of FDs that epoll can use beyond RLIMIT_NOFILE.
-  constexpr rlim_t gVisorMax = 1048576;
-  if (rlim.rlim_cur > gVisorMax) {
-    rlim.rlim_cur = gVisorMax;
+  constexpr rlim_t maxFD = 4096;
+  if (rlim.rlim_cur > maxFD) {
+    rlim.rlim_cur = maxFD;
     TEST_PCHECK(setrlimit(RLIMIT_NOFILE, &rlim) == 0);
   }
 
   rlim_t max_fds = rlim.rlim_cur;
-  std::cout << "Using limit: " << max_fds;
+  std::cout << "Using limit: " << max_fds << std::endl;
 
   // Create an eventfd. Since its value is initially zero, it is writable.
   FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc
index d07571a5f..04c5161f5 100644
--- a/test/syscalls/linux/prctl.cc
+++ b/test/syscalls/linux/prctl.cc
@@ -226,5 +226,5 @@ int main(int argc, char** argv) {
          prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc
index 30f0d75b3..c4e9cf528 100644
--- a/test/syscalls/linux/prctl_setuid.cc
+++ b/test/syscalls/linux/prctl_setuid.cc
@@ -264,5 +264,5 @@ int main(int argc, char** argv) {
            prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0);
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc
index 2cecf2e5f..bcdbbb044 100644
--- a/test/syscalls/linux/pread64.cc
+++ b/test/syscalls/linux/pread64.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/types.h>
@@ -118,6 +119,21 @@ TEST_F(Pread64Test, EndOfFile) {
   EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallSucceedsWithValue(0));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST_F(Pread64Test, Overflow) {
+  int f = memfd_create("negative", 0);
+  const FileDescriptor fd(f);
+
+  EXPECT_THAT(ftruncate(fd.get(), 0x7fffffffffffffffull), SyscallSucceeds());
+
+  char buf[10];
+  EXPECT_THAT(pread64(fd.get(), buf, sizeof(buf), 0x7fffffffffffffffull),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(Pread64TestNoTempFile, CantReadSocketPair_NoRandomSave) {
   int sock_fds[2];
   EXPECT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds), SyscallSucceeds());
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index a23fdb58d..63642880a 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -994,7 +994,7 @@ constexpr uint64_t kMappingSize = 100 << 20;
 
 // Tolerance on RSS comparisons to account for background thread mappings,
 // reclaimed pages, newly faulted pages, etc.
-constexpr uint64_t kRSSTolerance = 5 << 20;
+constexpr uint64_t kRSSTolerance = 10 << 20;
 
 // Capture RSS before and after an anonymous mapping with passed prot.
 void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) {
@@ -1326,8 +1326,6 @@ TEST(ProcPidSymlink, SubprocessRunning) {
               SyscallSucceedsWithValue(sizeof(buf)));
 }
 
-// FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-// on proc files.
 TEST(ProcPidSymlink, SubprocessZombied) {
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -1337,7 +1335,7 @@ TEST(ProcPidSymlink, SubprocessZombied) {
   int want = EACCES;
   if (!IsRunningOnGvisor()) {
     auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
-    if (version.major == 4 && version.minor > 3) {
+    if (version.major > 4 || (version.major == 4 && version.minor > 3)) {
       want = ENOENT;
     }
   }
@@ -1350,30 +1348,25 @@ TEST(ProcPidSymlink, SubprocessZombied) {
                 SyscallFailsWithErrno(want));
   }
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-  // on proc files.
+  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between linux on proc
+  // files.
   //
   // ~4.3: Syscall fails with EACCES.
-  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  // 4.17: Syscall succeeds and returns 1.
   //
-  // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
-  //            SyscallFailsWithErrno(EACCES));
+  if (!IsRunningOnGvisor()) {
+    return;
+  }
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-  // on proc files.
-  //
-  // ~4.3: Syscall fails with EACCES.
-  // 4.17 & gVisor: Syscall succeeds and returns 1.
-  //
-  // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
-  //            SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
+              SyscallFailsWithErrno(want));
+
+  EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
+              SyscallFailsWithErrno(want));
 }
 
 // Test whether /proc/PID/ symlinks can be read for an exited process.
 TEST(ProcPidSymlink, SubprocessExited) {
-  // FIXME(gvisor.dev/issue/164): These all succeed on gVisor.
-  SKIP_IF(IsRunningOnGvisor());
-
   char buf[1];
 
   EXPECT_THAT(ReadlinkWhileExited("exe", buf, sizeof(buf)),
@@ -1431,6 +1424,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 
   EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
 }
 
 // Test whether /proc/PID/ files can be read for a zombie process.
@@ -1466,6 +1465,12 @@ TEST(ProcPidFile, SubprocessZombie) {
   EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
 
+  EXPECT_THAT(ReadWhileZombied("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileZombied("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
   //
@@ -1527,6 +1532,15 @@ TEST(ProcPidFile, SubprocessExited) {
 
   EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("oom_score", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  EXPECT_THAT(ReadWhileExited("oom_score_adj", buf, sizeof(buf)),
+              SyscallFailsWithErrno(ESRCH));
 }
 
 PosixError DirContainsImpl(absl::string_view path,
@@ -1925,43 +1939,66 @@ TEST(ProcSelfMounts, RequiredFieldsArePresent) {
 }
 
 void CheckDuplicatesRecursively(std::string path) {
-  errno = 0;
-  DIR* dir = opendir(path.c_str());
-  if (dir == nullptr) {
-    // Ignore any directories we can't read or missing directories as the
-    // directory could have been deleted/mutated from the time the parent
-    // directory contents were read.
-    return;
-  }
-  auto dir_closer = Cleanup([&dir]() { closedir(dir); });
-  std::unordered_set<std::string> children;
-  while (true) {
-    // Readdir(3): If the end of the directory stream is reached, NULL is
-    // returned and errno is not changed.  If an error occurs, NULL is returned
-    // and errno is set appropriately.  To distinguish end of stream and from an
-    // error, set errno to zero before calling readdir() and then check the
-    // value of errno if NULL is returned.
+  std::vector<std::string> child_dirs;
+
+  // There is the known issue of the linux procfs, that two consequent calls of
+  // readdir can return the same entry twice if between these calls one or more
+  // entries have been removed from this directory.
+  int max_attempts = 5;
+  for (int i = 0; i < max_attempts; i++) {
+    child_dirs.clear();
     errno = 0;
-    struct dirent* dp = readdir(dir);
-    if (dp == nullptr) {
-      ASSERT_EQ(errno, 0) << path;
-      break;  // We're done.
+    bool success = true;
+    DIR* dir = opendir(path.c_str());
+    if (dir == nullptr) {
+      // Ignore any directories we can't read or missing directories as the
+      // directory could have been deleted/mutated from the time the parent
+      // directory contents were read.
+      return;
     }
-
-    if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0) {
-      continue;
+    auto dir_closer = Cleanup([&dir]() { closedir(dir); });
+    std::unordered_set<std::string> children;
+    while (true) {
+      // Readdir(3): If the end of the directory stream is reached, NULL is
+      // returned and errno is not changed.  If an error occurs, NULL is
+      // returned and errno is set appropriately.  To distinguish end of stream
+      // and from an error, set errno to zero before calling readdir() and then
+      // check the value of errno if NULL is returned.
+      errno = 0;
+      struct dirent* dp = readdir(dir);
+      if (dp == nullptr) {
+        ASSERT_EQ(errno, 0) << path;
+        break;  // We're done.
+      }
+
+      if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0) {
+        continue;
+      }
+
+      // Ignore a duplicate entry if it isn't the last attempt.
+      if (i == max_attempts - 1) {
+        ASSERT_EQ(children.find(std::string(dp->d_name)), children.end())
+            << absl::StrCat(path, "/", dp->d_name);
+      } else if (children.find(std::string(dp->d_name)) != children.end()) {
+        std::cerr << "Duplicate entry: " << i << ":"
+                  << absl::StrCat(path, "/", dp->d_name) << std::endl;
+        success = false;
+        break;
+      }
+      children.insert(std::string(dp->d_name));
+
+      ASSERT_NE(dp->d_type, DT_UNKNOWN);
+
+      if (dp->d_type == DT_DIR) {
+        child_dirs.push_back(std::string(dp->d_name));
+      }
     }
-
-    ASSERT_EQ(children.find(std::string(dp->d_name)), children.end())
-        << dp->d_name;
-    children.insert(std::string(dp->d_name));
-
-    ASSERT_NE(dp->d_type, DT_UNKNOWN);
-
-    if (dp->d_type != DT_DIR) {
-      continue;
+    if (success) {
+      break;
     }
-    CheckDuplicatesRecursively(absl::StrCat(path, "/", dp->d_name));
+  }
+  for (auto dname = child_dirs.begin(); dname != child_dirs.end(); dname++) {
+    CheckDuplicatesRecursively(absl::StrCat(path, "/", *dname));
   }
 }
 
@@ -2076,5 +2113,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 3a611a86f..cac394910 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -33,6 +33,31 @@ namespace gvisor {
 namespace testing {
 namespace {
 
+constexpr const char kProcNet[] = "/proc/net";
+
+TEST(ProcNetSymlinkTarget, FileMode) {
+  struct stat s;
+  ASSERT_THAT(stat(kProcNet, &s), SyscallSucceeds());
+  EXPECT_EQ(s.st_mode & S_IFMT, S_IFDIR);
+  EXPECT_EQ(s.st_mode & 0777, 0555);
+}
+
+TEST(ProcNetSymlink, FileMode) {
+  struct stat s;
+  ASSERT_THAT(lstat(kProcNet, &s), SyscallSucceeds());
+  EXPECT_EQ(s.st_mode & S_IFMT, S_IFLNK);
+  EXPECT_EQ(s.st_mode & 0777, 0777);
+}
+
+TEST(ProcNetSymlink, Contents) {
+  char buf[40] = {};
+  int n = readlink(kProcNet, buf, sizeof(buf));
+  ASSERT_THAT(n, SyscallSucceeds());
+
+  buf[n] = 0;
+  EXPECT_STREQ(buf, "self/net");
+}
+
 TEST(ProcNetIfInet6, Format) {
   auto ifinet6 = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/if_inet6"));
   EXPECT_THAT(ifinet6,
@@ -67,6 +92,59 @@ TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
   EXPECT_EQ(buf, to_write);
 }
 
+// DeviceEntry is an entry in /proc/net/dev
+struct DeviceEntry {
+  std::string name;
+  uint64_t stats[16];
+};
+
+PosixErrorOr<std::vector<DeviceEntry>> GetDeviceMetricsFromProc(
+    const std::string dev) {
+  std::vector<std::string> lines = absl::StrSplit(dev, '\n');
+  std::vector<DeviceEntry> entries;
+
+  // /proc/net/dev prints 2 lines of headers followed by a line of metrics for
+  // each network interface.
+  for (unsigned i = 2; i < lines.size(); i++) {
+    // Ignore empty lines.
+    if (lines[i].empty()) {
+      continue;
+    }
+
+    std::vector<std::string> values =
+        absl::StrSplit(lines[i], ' ', absl::SkipWhitespace());
+
+    // Interface name + 16 values.
+    if (values.size() != 17) {
+      return PosixError(EINVAL, "invalid line: " + lines[i]);
+    }
+
+    DeviceEntry entry;
+    entry.name = values[0];
+    // Skip the interface name and read only the values.
+    for (unsigned j = 1; j < 17; j++) {
+      uint64_t num;
+      if (!absl::SimpleAtoi(values[j], &num)) {
+        return PosixError(EINVAL, "invalid value: " + values[j]);
+      }
+      entry.stats[j - 1] = num;
+    }
+
+    entries.push_back(entry);
+  }
+
+  return entries;
+}
+
+// TEST(ProcNetDev, Format) tests that /proc/net/dev is parsable and
+// contains at least one entry.
+TEST(ProcNetDev, Format) {
+  auto dev = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/dev"));
+  auto entries = ASSERT_NO_ERRNO_AND_VALUE(GetDeviceMetricsFromProc(dev));
+
+  EXPECT_GT(entries.size(), 0);
+}
+
 PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
                                              const std::string& type,
                                              const std::string& item) {
@@ -275,7 +353,7 @@ TEST(ProcNetSnmp, UdpNoPorts_NoRandomSave) {
   EXPECT_EQ(oldNoPorts, newNoPorts - 1);
 }
 
-TEST(ProcNetSnmp, UdpIn) {
+TEST(ProcNetSnmp, UdpIn_NoRandomSave) {
   // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
   const DisableSave ds;
 
diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 66db0acaa..a63067586 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -106,7 +106,7 @@ PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() {
   std::vector<UnixEntry> entries;
   std::vector<std::string> lines = absl::StrSplit(content, '\n');
   std::cerr << "<contents of /proc/net/unix>" << std::endl;
-  for (std::string line : lines) {
+  for (const std::string& line : lines) {
     // Emit the proc entry to the test output to provide context for the test
     // results.
     std::cerr << line << std::endl;
@@ -374,7 +374,7 @@ TEST(ProcNetUnix, DgramSocketStateDisconnectingOnBind) {
   // corresponding entries, as they don't have an address yet.
   if (IsRunningOnGvisor()) {
     ASSERT_EQ(entries.size(), 2);
-    for (auto e : entries) {
+    for (const auto& e : entries) {
       ASSERT_EQ(e.state, SS_DISCONNECTING);
     }
   }
@@ -403,7 +403,7 @@ TEST(ProcNetUnix, DgramSocketStateConnectingOnConnect) {
   // corresponding entries, as they don't have an address yet.
   if (IsRunningOnGvisor()) {
     ASSERT_EQ(entries.size(), 2);
-    for (auto e : entries) {
+    for (const auto& e : entries) {
       ASSERT_EQ(e.state, SS_DISCONNECTING);
     }
   }
diff --git a/test/syscalls/linux/proc_pid_oomscore.cc b/test/syscalls/linux/proc_pid_oomscore.cc
new file mode 100644
index 000000000..707821a3f
--- /dev/null
+++ b/test/syscalls/linux/proc_pid_oomscore.cc
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<int> ReadProcNumber(std::string path) {
+  ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents(path));
+  EXPECT_EQ(contents[contents.length() - 1], '\n');
+
+  int num;
+  if (!absl::SimpleAtoi(contents, &num)) {
+    return PosixError(EINVAL, "invalid value: " + contents);
+  }
+
+  return num;
+}
+
+TEST(ProcPidOomscoreTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score"));
+  EXPECT_LE(oom_score, 1000);
+  EXPECT_GE(oom_score, -1000);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+
+  // oom_score_adj defaults to 0.
+  EXPECT_EQ(oom_score, 0);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicWrite) {
+  constexpr int test_value = 7;
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/oom_score_adj", O_WRONLY));
+  ASSERT_THAT(
+      RetryEINTR(write)(fd.get(), std::to_string(test_value).c_str(), 1),
+      SyscallSucceeds());
+
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+  EXPECT_EQ(oom_score, test_value);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc
index 7f2e8f203..9fb1b3a2c 100644
--- a/test/syscalls/linux/proc_pid_smaps.cc
+++ b/test/syscalls/linux/proc_pid_smaps.cc
@@ -173,7 +173,7 @@ PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
       return;
     }
     unknown_fields.insert(std::string(key));
-    std::cerr << "skipping unknown smaps field " << key;
+    std::cerr << "skipping unknown smaps field " << key << std::endl;
   };
 
   auto lines = absl::StrSplit(contents, '\n', absl::SkipEmpty());
@@ -191,7 +191,7 @@ PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
     // amount of whitespace).
     if (!entry) {
       std::cerr << "smaps line not considered a maps line: "
-                << maybe_maps_entry.error_message();
+                << maybe_maps_entry.error_message() << std::endl;
       return PosixError(
           EINVAL,
           absl::StrCat("smaps field line without preceding maps line: ", l));
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index 4dd5cf27b..926690eb8 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -400,9 +400,11 @@ TEST(PtraceTest, GetRegSet) {
   // Read exactly the full register set.
   EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-#ifdef __x86_64__
+#if defined(__x86_64__)
   // Child called kill(2), with SIGSTOP as arg 2.
   EXPECT_EQ(regs.rsi, SIGSTOP);
+#elif defined(__aarch64__)
+  EXPECT_EQ(regs.regs[1], SIGSTOP);
 #endif
 
   // Suppress SIGSTOP and resume the child.
@@ -752,15 +754,23 @@ TEST(PtraceTest,
               SyscallSucceeds());
   EXPECT_TRUE(siginfo.si_code == SIGTRAP || siginfo.si_code == (SIGTRAP | 0x80))
       << "si_code = " << siginfo.si_code;
-#ifdef __x86_64__
+
   {
     struct user_regs_struct regs = {};
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+    struct iovec iov;
+    iov.iov_base = &regs;
+    iov.iov_len = sizeof(regs);
+    EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
+#if defined(__x86_64__)
     EXPECT_TRUE(regs.orig_rax == SYS_vfork || regs.orig_rax == SYS_clone)
         << "orig_rax = " << regs.orig_rax;
     EXPECT_EQ(grandchild_pid, regs.rax);
-  }
+#elif defined(__aarch64__)
+    EXPECT_TRUE(regs.regs[8] == SYS_clone) << "regs[8] = " << regs.regs[8];
+    EXPECT_EQ(grandchild_pid, regs.regs[0]);
 #endif  // defined(__x86_64__)
+  }
 
   // After this point, the child will be making wait4 syscalls that will be
   // interrupted by saving, so saving is not permitted. Note that this is
@@ -805,14 +815,21 @@ TEST(PtraceTest,
               SyscallSucceedsWithValue(child_pid));
   EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
       << " status " << status;
-#ifdef __x86_64__
   {
     struct user_regs_struct regs = {};
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+    struct iovec iov;
+    iov.iov_base = &regs;
+    iov.iov_len = sizeof(regs);
+    EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
+#if defined(__x86_64__)
     EXPECT_EQ(SYS_wait4, regs.orig_rax);
     EXPECT_EQ(grandchild_pid, regs.rax);
-  }
+#elif defined(__aarch64__)
+    EXPECT_EQ(SYS_wait4, regs.regs[8]);
+    EXPECT_EQ(grandchild_pid, regs.regs[0]);
 #endif  // defined(__x86_64__)
+  }
 
   // Detach from the child and wait for it to exit.
   ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
@@ -1188,7 +1205,7 @@ TEST(PtraceTest, SeizeSetOptions) {
     // gVisor is not susceptible to this race because
     // kernel.Task.waitCollectTraceeStopLocked() checks specifically for an
     // active ptraceStop, which is not initiated if SIGKILL is pending.
-    std::cout << "Observed syscall-exit after SIGKILL";
+    std::cout << "Observed syscall-exit after SIGKILL" << std::endl;
     ASSERT_THAT(waitpid(child_pid, &status, 0),
                 SyscallSucceedsWithValue(child_pid));
   }
@@ -1208,5 +1225,5 @@ int main(int argc, char** argv) {
     gvisor::testing::RunExecveChild();
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index dafe64d20..aabfa6955 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -364,6 +364,12 @@ PosixErrorOr<size_t> PollAndReadFd(int fd, void* buf, size_t count,
     ssize_t n =
         ReadFd(fd, static_cast<char*>(buf) + completed, count - completed);
     if (n < 0) {
+      if (errno == EAGAIN) {
+        // Linux sometimes returns EAGAIN from this read, despite the fact that
+        // poll returned success. Let's just do what do as we are told and try
+        // again.
+        continue;
+      }
       return PosixError(errno, "read failed");
     }
     completed += n;
@@ -1126,7 +1132,7 @@ TEST_F(PtyTest, SwitchTwiceMultiline) {
   std::string kExpected = "GO\nBLUE\n!";
 
   // Write each line.
-  for (std::string input : kInputs) {
+  for (const std::string& input : kInputs) {
     ASSERT_THAT(WriteFd(master_.get(), input.c_str(), input.size()),
                 SyscallSucceedsWithValue(input.size()));
   }
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index b48fe540d..e69794910 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -27,14 +28,7 @@ namespace testing {
 
 namespace {
 
-// This test is currently very rudimentary.
-//
-// TODO(edahlgren):
-// * bad buffer states (EFAULT).
-// * bad fds (wrong permission, wrong type of file, EBADF).
-// * check offset is not incremented.
-// * check for EOF.
-// * writing to pipes, symlinks, special files.
+// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary.
 class Pwrite64 : public ::testing::Test {
   void SetUp() override {
     name_ = NewTempAbsPath();
@@ -72,6 +66,17 @@ TEST_F(Pwrite64, InvalidArgs) {
   EXPECT_THAT(close(fd), SyscallSucceeds());
 }
 
+TEST_F(Pwrite64, Overflow) {
+  int fd;
+  ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
+  constexpr int64_t kBufSize = 1024;
+  std::vector<char> buf(kBufSize);
+  std::fill(buf.begin(), buf.end(), 'a');
+  EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), 0x7fffffffffffffffull),
+              SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
index ed488dbc2..853258b04 100644
--- a/test/syscalls/linux/rseq/BUILD
+++ b/test/syscalls/linux/rseq/BUILD
@@ -1,7 +1,7 @@
 # This package contains a standalone rseq test binary. This binary must not
 # depend on libc, which might use rseq itself.
 
-load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain")
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain", "select_arch")
 
 package(licenses = ["notice"])
 
@@ -9,32 +9,35 @@ genrule(
     name = "rseq_binary",
     srcs = [
         "critical.h",
-        "critical.S",
+        "critical_amd64.S",
+        "critical_arm64.S",
         "rseq.cc",
         "syscalls.h",
-        "start.S",
+        "start_amd64.S",
+        "start_arm64.S",
         "test.h",
         "types.h",
         "uapi.h",
     ],
     outs = ["rseq"],
-    cmd = " ".join([
-        "$(CC)",
-        "$(CC_FLAGS) ",
-        "-I.",
-        "-Wall",
-        "-Werror",
-        "-O2",
-        "-std=c++17",
-        "-static",
-        "-nostdlib",
-        "-ffreestanding",
-        "-o",
-        "$(location rseq)",
-        "$(location critical.S)",
-        "$(location rseq.cc)",
-        "$(location start.S)",
-    ]),
+    cmd = "$(CC) " +
+          "$(CC_FLAGS) " +
+          "-I. " +
+          "-Wall " +
+          "-Werror " +
+          "-O2 " +
+          "-std=c++17 " +
+          "-static " +
+          "-nostdlib " +
+          "-ffreestanding " +
+          "-o " +
+          "$(location rseq) " +
+          select_arch(
+              amd64 = "$(location critical_amd64.S) $(location start_amd64.S) ",
+              arm64 = "$(location critical_arm64.S) $(location start_arm64.S) ",
+              no_match_error = "unsupported architecture",
+          ) +
+          "$(location rseq.cc)",
     toolchains = [
         cc_toolchain,
         ":no_pie_cc_flags",
diff --git a/test/syscalls/linux/rseq/critical.S b/test/syscalls/linux/rseq/critical_amd64.S
index 8c0687e6d..8c0687e6d 100644
--- a/test/syscalls/linux/rseq/critical.S
+++ b/test/syscalls/linux/rseq/critical_amd64.S
diff --git a/test/syscalls/linux/rseq/critical_arm64.S b/test/syscalls/linux/rseq/critical_arm64.S
new file mode 100644
index 000000000..bfe7e8307
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical_arm64.S
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+  .text
+  .globl  rseq_loop
+  .type   rseq_loop, @function
+
+rseq_loop:
+  b begin
+
+  // Abort block before the critical section.
+  // Abort signature.
+  .byte 0x90, 0x90, 0x90, 0x90
+  .globl  rseq_loop_early_abort
+rseq_loop_early_abort:
+  ret
+
+begin:
+  // r->rseq_cs = cs
+  str x1, [x0, #8]
+
+  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+  // section. Thus it must be set in or immediately before the critical section
+  // to ensure it is not cleared before the section begins.
+  .globl  rseq_loop_start
+rseq_loop_start:
+  b rseq_loop_start
+
+  // "Pre-commit": extra instructions inside the critical section.  These are
+  // used as the abort point in TestAbortPreCommit, which is not valid.
+  .globl  rseq_loop_pre_commit
+rseq_loop_pre_commit:
+  // Extra abort signature + nop for TestAbortPostCommit.
+  .byte 0x90, 0x90, 0x90, 0x90
+  nop
+
+  // "Post-commit": never reached in this case.
+  .globl  rseq_loop_post_commit
+rseq_loop_post_commit:
+
+  // Abort signature.
+  .byte 0x90, 0x90, 0x90, 0x90
+
+  .globl  rseq_loop_abort
+rseq_loop_abort:
+  ret
+
+  .size  rseq_loop,.-rseq_loop
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/start.S b/test/syscalls/linux/rseq/start_amd64.S
index b9611b276..b9611b276 100644
--- a/test/syscalls/linux/rseq/start.S
+++ b/test/syscalls/linux/rseq/start_amd64.S
diff --git a/test/syscalls/linux/rseq/start_arm64.S b/test/syscalls/linux/rseq/start_arm64.S
new file mode 100644
index 000000000..693c1c6eb
--- /dev/null
+++ b/test/syscalls/linux/rseq/start_arm64.S
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+  .text
+  .align 4
+  .type  _start,@function
+  .globl  _start
+
+_start:
+  mov  x29, sp
+  bl   __init
+  wfi
+
+  .size  _start,.-_start
+  .section  .note.GNU-stack,"",@progbits
+
+  .text
+  .globl  raw_syscall
+  .type   raw_syscall, @function
+
+raw_syscall:
+  mov  x8,x0   // syscall #
+  mov  x0,x1   // arg0
+  mov  x1,x2   // arg1
+  mov  x2,x3   // arg2
+  mov  x3,x4   // arg3
+  mov  x4,x5   // arg4
+  mov  x5,x6   // arg5
+  svc  #0
+  ret
+
+  .size  raw_syscall,.-raw_syscall
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h
index e5299c188..c4118e6c5 100644
--- a/test/syscalls/linux/rseq/syscalls.h
+++ b/test/syscalls/linux/rseq/syscalls.h
@@ -17,10 +17,13 @@
 
 #include "test/syscalls/linux/rseq/types.h"
 
-#ifdef __x86_64__
 // Syscall numbers.
+#if defined(__x86_64__)
 constexpr int kGetpid = 39;
 constexpr int kExitGroup = 231;
+#elif defined(__aarch64__)
+constexpr int kGetpid = 172;
+constexpr int kExitGroup = 94;
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
index e3ff0579a..d3e60d0a4 100644
--- a/test/syscalls/linux/rseq/uapi.h
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -15,37 +15,34 @@
 #ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
 #define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
 
-// User-kernel ABI for restartable sequences.
+#include <stdint.h>
 
-// Standard types.
-//
-// N.B. This header will be included in targets that do have the standard
-// library, so we can't shadow the standard type names.
-using __u32 = __UINT32_TYPE__;
-using __u64 = __UINT64_TYPE__;
+// User-kernel ABI for restartable sequences.
 
-#ifdef __x86_64__
 // Syscall numbers.
+#if defined(__x86_64__)
 constexpr int kRseqSyscall = 334;
+#elif defined(__aarch64__)
+constexpr int kRseqSyscall = 293;
 #else
 #error "Unknown architecture"
 #endif  // __x86_64__
 
 struct rseq_cs {
-  __u32 version;
-  __u32 flags;
-  __u64 start_ip;
-  __u64 post_commit_offset;
-  __u64 abort_ip;
-} __attribute__((aligned(4 * sizeof(__u64))));
+  uint32_t version;
+  uint32_t flags;
+  uint64_t start_ip;
+  uint64_t post_commit_offset;
+  uint64_t abort_ip;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
 
 // N.B. alignment is enforced by the kernel.
 struct rseq {
-  __u32 cpu_id_start;
-  __u32 cpu_id;
+  uint32_t cpu_id_start;
+  uint32_t cpu_id;
   struct rseq_cs* rseq_cs;
-  __u32 flags;
-} __attribute__((aligned(4 * sizeof(__u64))));
+  uint32_t flags;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
 
 constexpr int kRseqFlagUnregister = 1 << 0;
 
diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc
index 81d193ffd..ed27e2566 100644
--- a/test/syscalls/linux/rtsignal.cc
+++ b/test/syscalls/linux/rtsignal.cc
@@ -167,6 +167,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 2c947feb7..ce88d90dd 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -53,7 +53,7 @@ namespace {
 constexpr uint32_t kFilteredSyscall = SYS_vserver;
 #elif __aarch64__
 // Use the last of arch_specific_syscalls which are not implemented on arm64.
-constexpr uint32_t kFilteredSyscall = SYS_arch_specific_syscall + 15;
+constexpr uint32_t kFilteredSyscall = __NR_arch_specific_syscall + 15;
 #endif
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
@@ -70,20 +70,27 @@ void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result,
   MaybeSave();
 
   struct sock_filter filter[] = {
-      // A = seccomp_data.arch
-      BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4),
-      // if (A != AUDIT_ARCH_X86_64) goto kill
-      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4),
-      // A = seccomp_data.nr
-      BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0),
-      // if (A != sysno) goto allow
-      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, sysno, 0, 1),
-      // return filtered_result
-      BPF_STMT(BPF_RET | BPF_K, filtered_result),
-      // allow: return SECCOMP_RET_ALLOW
-      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
-      // kill: return SECCOMP_RET_KILL
-      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
+    // A = seccomp_data.arch
+    BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4),
+#if defined(__x86_64__)
+    // if (A != AUDIT_ARCH_X86_64) goto kill
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4),
+#elif defined(__aarch64__)
+    // if (A != AUDIT_ARCH_AARCH64) goto kill
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 0, 4),
+#else
+#error "Unknown architecture"
+#endif
+    // A = seccomp_data.nr
+    BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0),
+    // if (A != sysno) goto allow
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, sysno, 0, 1),
+    // return filtered_result
+    BPF_STMT(BPF_RET | BPF_K, filtered_result),
+    // allow: return SECCOMP_RET_ALLOW
+    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+    // kill: return SECCOMP_RET_KILL
+    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
   };
   struct sock_fprog prog;
   prog.len = ABSL_ARRAYSIZE(filter);
@@ -179,9 +186,12 @@ TEST(SeccompTest, RetTrapCausesSIGSYS) {
           TEST_CHECK(info->si_errno == kTrapValue);
           TEST_CHECK(info->si_call_addr != nullptr);
           TEST_CHECK(info->si_syscall == kFilteredSyscall);
-#ifdef __x86_64__
+#if defined(__x86_64__)
           TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64);
           TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == kFilteredSyscall);
+#elif defined(__aarch64__)
+          TEST_CHECK(info->si_arch == AUDIT_ARCH_AARCH64);
+          TEST_CHECK(uc->uc_mcontext.regs[8] == kFilteredSyscall);
 #endif  // defined(__x86_64__)
           _exit(0);
         });
@@ -411,5 +421,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index 580ab5193..64123e904 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/eventfd.h>
 #include <sys/sendfile.h>
 #include <unistd.h>
@@ -70,6 +71,28 @@ TEST(SendFileTest, InvalidOffset) {
               SyscallFailsWithErrno(EINVAL));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SendFileTest, Overflow) {
+  // Create input file.
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Open the output file.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+  const FileDescriptor outf(fd);
+
+  // out_offset + kSize overflows INT64_MAX.
+  loff_t out_offset = 0x7ffffffffffffffeull;
+  constexpr int kSize = 3;
+  EXPECT_THAT(sendfile(outf.get(), inf.get(), &out_offset, kSize),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(SendFileTest, SendTrivially) {
   // Create temp files.
   constexpr char kData[] = "To be, or not to be, that is the question:";
@@ -530,6 +553,34 @@ TEST(SendFileTest, SendToSpecialFile) {
               SyscallSucceedsWithValue(kSize & (~7)));
 }
 
+TEST(SendFileTest, SendFileToPipe) {
+  // Create temp file.
+  constexpr char kData[] = "<insert-quote-here>";
+  constexpr int kDataSize = sizeof(kData) - 1;
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Create a pipe for sending to a pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Expect to read up to the given size.
+  std::vector<char> buf(kDataSize);
+  ScopedThread t([&]() {
+    absl::SleepFor(absl::Milliseconds(100));
+    ASSERT_THAT(read(rfd.get(), buf.data(), buf.size()),
+                SyscallSucceedsWithValue(kDataSize));
+  });
+
+  // Send with twice the size of the file, which should hit EOF.
+  EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize * 2),
+              SyscallSucceedsWithValue(kDataSize));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 8f7ee4163..c101fe9d2 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -23,6 +23,7 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/temp_path.h"
@@ -35,61 +36,39 @@ namespace {
 
 class SendFileTest : public ::testing::TestWithParam<int> {
  protected:
-  PosixErrorOr<std::tuple<int, int>> Sockets() {
+  PosixErrorOr<std::unique_ptr<SocketPair>> Sockets(int type) {
     // Bind a server socket.
     int family = GetParam();
-    struct sockaddr server_addr = {};
     switch (family) {
       case AF_INET: {
-        struct sockaddr_in* server_addr_in =
-            reinterpret_cast<struct sockaddr_in*>(&server_addr);
-        server_addr_in->sin_family = family;
-        server_addr_in->sin_addr.s_addr = INADDR_ANY;
-        break;
+        if (type == SOCK_STREAM) {
+          return SocketPairKind{
+              "TCP", AF_INET, type, 0,
+              TCPAcceptBindSocketPairCreator(AF_INET, type, 0, false)}
+              .Create();
+        } else {
+          return SocketPairKind{
+              "UDP", AF_INET, type, 0,
+              UDPBidirectionalBindSocketPairCreator(AF_INET, type, 0, false)}
+              .Create();
+        }
       }
       case AF_UNIX: {
-        struct sockaddr_un* server_addr_un =
-            reinterpret_cast<struct sockaddr_un*>(&server_addr);
-        server_addr_un->sun_family = family;
-        server_addr_un->sun_path[0] = '\0';
-        break;
+        if (type == SOCK_STREAM) {
+          return SocketPairKind{
+              "UNIX", AF_UNIX, type, 0,
+              FilesystemAcceptBindSocketPairCreator(AF_UNIX, type, 0)}
+              .Create();
+        } else {
+          return SocketPairKind{
+              "UNIX", AF_UNIX, type, 0,
+              FilesystemBidirectionalBindSocketPairCreator(AF_UNIX, type, 0)}
+              .Create();
+        }
       }
       default:
         return PosixError(EINVAL);
     }
-    int server = socket(family, SOCK_STREAM, 0);
-    if (bind(server, &server_addr, sizeof(server_addr)) < 0) {
-      return PosixError(errno);
-    }
-    if (listen(server, 1) < 0) {
-      close(server);
-      return PosixError(errno);
-    }
-
-    // Fetch the address; both are anonymous.
-    socklen_t length = sizeof(server_addr);
-    if (getsockname(server, &server_addr, &length) < 0) {
-      close(server);
-      return PosixError(errno);
-    }
-
-    // Connect the client.
-    int client = socket(family, SOCK_STREAM, 0);
-    if (connect(client, &server_addr, length) < 0) {
-      close(server);
-      close(client);
-      return PosixError(errno);
-    }
-
-    // Accept on the server.
-    int server_client = accept(server, nullptr, 0);
-    if (server_client < 0) {
-      close(server);
-      close(client);
-      return PosixError(errno);
-    }
-    close(server);
-    return std::make_tuple(client, server_client);
   }
 };
 
@@ -106,9 +85,7 @@ TEST_P(SendFileTest, SendMultiple) {
   const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
 
   // Create sockets.
-  std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
-  const FileDescriptor server(std::get<0>(fds));
-  FileDescriptor client(std::get<1>(fds));  // non-const, reset is used.
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_STREAM));
 
   // Thread that reads data from socket and dumps to a file.
   ScopedThread th([&] {
@@ -118,7 +95,7 @@ TEST_P(SendFileTest, SendMultiple) {
     // Read until socket is closed.
     char buf[10240];
     for (int cnt = 0;; cnt++) {
-      int r = RetryEINTR(read)(server.get(), buf, sizeof(buf));
+      int r = RetryEINTR(read)(socks->first_fd(), buf, sizeof(buf));
       // We cannot afford to save on every read() call.
       if (cnt % 1000 == 0) {
         ASSERT_THAT(r, SyscallSucceeds());
@@ -149,10 +126,10 @@ TEST_P(SendFileTest, SendMultiple) {
   for (size_t sent = 0; sent < data.size(); cnt++) {
     const size_t remain = data.size() - sent;
     std::cout << "sendfile, size=" << data.size() << ", sent=" << sent
-              << ", remain=" << remain;
+              << ", remain=" << remain << std::endl;
 
     // Send data and verify that sendfile returns the correct value.
-    int res = sendfile(client.get(), inf.get(), nullptr, remain);
+    int res = sendfile(socks->second_fd(), inf.get(), nullptr, remain);
     // We cannot afford to save on every sendfile() call.
     if (cnt % 120 == 0) {
       MaybeSave();
@@ -169,7 +146,7 @@ TEST_P(SendFileTest, SendMultiple) {
   }
 
   // Close socket to stop thread.
-  client.reset();
+  close(socks->release_second_fd());
   th.Join();
 
   // Verify that the output file has the correct data.
@@ -183,9 +160,7 @@ TEST_P(SendFileTest, SendMultiple) {
 
 TEST_P(SendFileTest, Shutdown) {
   // Create a socket.
-  std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
-  const FileDescriptor client(std::get<0>(fds));
-  FileDescriptor server(std::get<1>(fds));  // non-const, reset below.
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_STREAM));
 
   // If this is a TCP socket, then turn off linger.
   if (GetParam() == AF_INET) {
@@ -193,7 +168,7 @@ TEST_P(SendFileTest, Shutdown) {
     sl.l_onoff = 1;
     sl.l_linger = 0;
     ASSERT_THAT(
-        setsockopt(server.get(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+        setsockopt(socks->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
         SyscallSucceeds());
   }
 
@@ -212,12 +187,12 @@ TEST_P(SendFileTest, Shutdown) {
   ScopedThread t([&]() {
     size_t done = 0;
     while (done < data.size()) {
-      int n = RetryEINTR(read)(server.get(), data.data(), data.size());
+      int n = RetryEINTR(read)(socks->first_fd(), data.data(), data.size());
       ASSERT_THAT(n, SyscallSucceeds());
       done += n;
     }
     // Close the server side socket.
-    server.reset();
+    close(socks->release_first_fd());
   });
 
   // Continuously stream from the file to the socket. Note we do not assert
@@ -225,7 +200,7 @@ TEST_P(SendFileTest, Shutdown) {
   // data is written. Eventually, we should get a connection reset error.
   while (1) {
     off_t offset = 0;  // Always read from the start.
-    int n = sendfile(client.get(), inf.get(), &offset, data.size());
+    int n = sendfile(socks->second_fd(), inf.get(), &offset, data.size());
     EXPECT_THAT(n, AnyOf(SyscallFailsWithErrno(ECONNRESET),
                          SyscallFailsWithErrno(EPIPE), SyscallSucceeds()));
     if (n <= 0) {
@@ -234,6 +209,20 @@ TEST_P(SendFileTest, Shutdown) {
   }
 }
 
+TEST_P(SendFileTest, SendpageFromEmptyFileToUDP) {
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_DGRAM));
+
+  TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  // The value to the count argument has to be so that it is impossible to
+  // allocate a buffer of this size. In Linux, sendfile transfer at most
+  // 0x7ffff000 (MAX_RW_COUNT) bytes.
+  EXPECT_THAT(sendfile(socks->first_fd(), fd.get(), 0x0, 0x8000000000004),
+              SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AddressFamily, SendFileTest,
                          ::testing::Values(AF_UNIX, AF_INET));
 
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
index 4deb1ae95..6227774a4 100644
--- a/test/syscalls/linux/sigiret.cc
+++ b/test/syscalls/linux/sigiret.cc
@@ -132,6 +132,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 95be4b66c..389e5fca2 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -369,5 +369,5 @@ int main(int argc, char** argv) {
 
   gvisor::testing::TestInit(&argc, &argv);
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigstop.cc b/test/syscalls/linux/sigstop.cc
index 7db57d968..b2fcedd62 100644
--- a/test/syscalls/linux/sigstop.cc
+++ b/test/syscalls/linux/sigstop.cc
@@ -147,5 +147,5 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc
index 1e5bf5942..4f8afff15 100644
--- a/test/syscalls/linux/sigtimedwait.cc
+++ b/test/syscalls/linux/sigtimedwait.cc
@@ -319,6 +319,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc
index 3a07ac8d2..e0a4d0985 100644
--- a/test/syscalls/linux/socket.cc
+++ b/test/syscalls/linux/socket.cc
@@ -13,11 +13,14 @@
 // limitations under the License.
 
 #include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 #include <unistd.h>
 
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
@@ -58,12 +61,70 @@ TEST(SocketTest, ProtocolInet) {
   }
 }
 
+TEST(SocketTest, UnixSocketStat) {
+  SKIP_IF(IsRunningWithVFS1());
+
+  FileDescriptor bound =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+
+  // The permissions of the file created with bind(2) should be defined by the
+  // permissions of the bound socket and the umask.
+  mode_t sock_perm = 0765, mask = 0123;
+  ASSERT_THAT(fchmod(bound.get(), sock_perm), SyscallSucceeds());
+  TempUmask m(mask);
+
+  struct sockaddr_un addr =
+      ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(/*abstract=*/false, AF_UNIX));
+  ASSERT_THAT(bind(bound.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                   sizeof(addr)),
+              SyscallSucceeds());
+
+  struct stat statbuf = {};
+  ASSERT_THAT(stat(addr.sun_path, &statbuf), SyscallSucceeds());
+
+  // Mode should be S_IFSOCK.
+  EXPECT_EQ(statbuf.st_mode, S_IFSOCK | sock_perm & ~mask);
+
+  // Timestamps should be equal and non-zero.
+  EXPECT_NE(statbuf.st_atime, 0);
+  EXPECT_EQ(statbuf.st_atime, statbuf.st_mtime);
+  EXPECT_EQ(statbuf.st_atime, statbuf.st_ctime);
+}
+
+TEST(SocketTest, UnixConnectNeedsWritePerm) {
+  SKIP_IF(IsRunningWithVFS1());
+
+  FileDescriptor bound =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+
+  struct sockaddr_un addr =
+      ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(/*abstract=*/false, AF_UNIX));
+  ASSERT_THAT(bind(bound.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                   sizeof(addr)),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(bound.get(), 1), SyscallSucceeds());
+
+  // Connect should fail without write perms.
+  ASSERT_THAT(chmod(addr.sun_path, 0500), SyscallSucceeds());
+  FileDescriptor client =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+  EXPECT_THAT(connect(client.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      sizeof(addr)),
+              SyscallFailsWithErrno(EACCES));
+
+  // Connect should succeed with write perms.
+  ASSERT_THAT(chmod(addr.sun_path, 0200), SyscallSucceeds());
+  EXPECT_THAT(connect(client.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      sizeof(addr)),
+              SyscallSucceeds());
+}
+
 using SocketOpenTest = ::testing::TestWithParam<int>;
 
 // UDS cannot be opened.
 TEST_P(SocketOpenTest, Unix) {
   // FIXME(b/142001530): Open incorrectly succeeds on gVisor.
-  SKIP_IF(IsRunningOnGvisor());
+  SKIP_IF(IsRunningWithVFS1());
 
   FileDescriptor bound =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index 637d1151a..1967329ee 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -33,6 +33,7 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include "absl/container/node_hash_map.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_bind_to_device_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -192,8 +193,8 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
   in_port_t port_ = 0;
   // sockets_to_close_ is a map from action index to the socket that was
   // created.
-  std::unordered_map<int,
-                     std::unique_ptr<gvisor::testing::FileDescriptor>>
+  absl::node_hash_map<int,
+                      std::unique_ptr<gvisor::testing::FileDescriptor>>
       sockets_to_close_;
   int next_socket_id_ = 0;
 };
diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
index e8f24a59e..f7d6139f1 100644
--- a/test/syscalls/linux/socket_generic.cc
+++ b/test/syscalls/linux/socket_generic.cc
@@ -447,6 +447,60 @@ TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgSucceeds) {
               SyscallFailsWithErrno(EAGAIN));
 }
 
+TEST_P(AllSocketPairTest, SendTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetSendTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval tv = {.tv_sec = 89, .tv_usec = 42000};
+  EXPECT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+      SyscallSucceeds());
+
+  timeval actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 89);
+  EXPECT_EQ(actual_tv.tv_usec, 42000);
+}
+
+TEST_P(AllSocketPairTest, SetGetSendTimeoutLargerArg) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  struct timeval_with_extra {
+    struct timeval tv;
+    int64_t extra_data;
+  } ABSL_ATTRIBUTE_PACKED;
+
+  timeval_with_extra tv_extra = {
+      .tv = {.tv_sec = 0, .tv_usec = 123000},
+  };
+
+  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &tv_extra, sizeof(tv_extra)),
+              SyscallSucceeds());
+
+  timeval_with_extra actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv.tv_usec, 123000);
+}
+
 TEST_P(AllSocketPairTest, SendTimeoutAllowsWrite) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -491,18 +545,36 @@ TEST_P(AllSocketPairTest, SendTimeoutAllowsSendmsg) {
   ASSERT_NO_FATAL_FAILURE(SendNullCmsg(sockets->first_fd(), buf, sizeof(buf)));
 }
 
-TEST_P(AllSocketPairTest, SoRcvTimeoIsSet) {
+TEST_P(AllSocketPairTest, RecvTimeoutDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-  struct timeval tv {
-    .tv_sec = 0, .tv_usec = 35
-  };
+  timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetRecvTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval tv = {.tv_sec = 123, .tv_usec = 456000};
   EXPECT_THAT(
       setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
       SyscallSucceeds());
+
+  timeval actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 123);
+  EXPECT_EQ(actual_tv.tv_usec, 456000);
 }
 
-TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
+TEST_P(AllSocketPairTest, SetGetRecvTimeoutLargerArg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct timeval_with_extra {
@@ -510,13 +582,21 @@ TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
     int64_t extra_data;
   } ABSL_ATTRIBUTE_PACKED;
 
-  timeval_with_extra tv_extra;
-  tv_extra.tv.tv_sec = 0;
-  tv_extra.tv.tv_usec = 25;
+  timeval_with_extra tv_extra = {
+      .tv = {.tv_sec = 0, .tv_usec = 432000},
+  };
 
   EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
                          &tv_extra, sizeof(tv_extra)),
               SyscallSucceeds());
+
+  timeval_with_extra actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv.tv_usec, 432000);
 }
 
 TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgOneSecondSucceeds) {
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index b24618a88..fa890ec98 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -162,7 +162,7 @@ TEST_P(DualStackSocketTest, AddressOperations) {
         ASSERT_NO_ERRNO(SetAddrPort(
             addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
 
-        EXPECT_THAT(connect(fd.get(), addr_in, addr.addr_len),
+        EXPECT_THAT(RetryEINTR(connect)(fd.get(), addr_in, addr.addr_len),
                     SyscallSucceeds())
             << GetAddrStr(addr_in);
         bound = true;
@@ -234,7 +234,7 @@ TEST_P(DualStackSocketTest, AddressOperations) {
   }
 }
 
-// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny.
+// TODO(gvisor.dev/issue/1556): uncomment V4MappedAny.
 INSTANTIATE_TEST_SUITE_P(
     All, DualStackSocketTest,
     ::testing::Combine(
@@ -319,17 +319,58 @@ TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
   tcpSimpleConnectTest(listener, connector, false);
 }
 
-TEST_P(SocketInetLoopbackTest, TCPListenClose) {
+TEST_P(SocketInetLoopbackTest, TCPListenShutdownListen) {
+  const auto& param = GetParam();
+
+  const TestAddress& listener = param.listener;
+  const TestAddress& connector = param.connector;
+
+  constexpr int kBacklog = 5;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RD), SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+  for (int i = 0; i < kBacklog; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(RetryEINTR(connect)(client.get(),
+                                    reinterpret_cast<sockaddr*>(&conn_addr),
+                                    connector.addr_len),
+                SyscallSucceeds());
+  }
+  for (int i = 0; i < kBacklog; i++) {
+    ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), SyscallSucceeds());
+  }
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenShutdown) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
-  constexpr int kAcceptCount = 32;
-  constexpr int kBacklog = kAcceptCount * 2;
-  constexpr int kFDs = 128;
-  constexpr int kThreadCount = 4;
-  constexpr int kFDsPerThread = kFDs / kThreadCount;
+  constexpr int kBacklog = 2;
+  constexpr int kFDs = kBacklog + 1;
 
   // Create the listening socket.
   FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
@@ -348,39 +389,169 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
-  DisableSave ds;  // Too many system calls.
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
-  FileDescriptor clients[kFDs];
-  std::unique_ptr<ScopedThread> threads[kThreadCount];
+
+  // Shutdown the write of the listener, expect to not have any effect.
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_WR), SyscallSucceeds());
+
   for (int i = 0; i < kFDs; i++) {
-    clients[i] = ASSERT_NO_ERRNO_AND_VALUE(
-        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(RetryEINTR(connect)(client.get(),
+                                    reinterpret_cast<sockaddr*>(&conn_addr),
+                                    connector.addr_len),
+                SyscallSucceeds());
+    ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), SyscallSucceeds());
   }
-  for (int i = 0; i < kThreadCount; i++) {
-    threads[i] = absl::make_unique<ScopedThread>([&connector, &conn_addr,
-                                                  &clients, i]() {
-      for (int j = 0; j < kFDsPerThread; j++) {
-        int k = i * kFDsPerThread + j;
-        int ret =
-            connect(clients[k].get(), reinterpret_cast<sockaddr*>(&conn_addr),
-                    connector.addr_len);
-        if (ret != 0) {
-          EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
-        }
-      }
-    });
+
+  // Shutdown the read of the listener, expect to fail subsequent
+  // server accepts, binds and client connects.
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RD), SyscallSucceeds());
+
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EINVAL));
+
+  // Check that shutdown did not release the port.
+  FileDescriptor new_listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(
+      bind(new_listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+           listener.addr_len),
+      SyscallFailsWithErrno(EADDRINUSE));
+
+  // Check that subsequent connection attempts receive a RST.
+  auto client = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  for (int i = 0; i < kFDs; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(RetryEINTR(connect)(client.get(),
+                                    reinterpret_cast<sockaddr*>(&conn_addr),
+                                    connector.addr_len),
+                SyscallFailsWithErrno(ECONNREFUSED));
   }
-  for (int i = 0; i < kThreadCount; i++) {
-    threads[i]->Join();
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenClose) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  constexpr int kAcceptCount = 2;
+  constexpr int kBacklog = kAcceptCount + 2;
+  constexpr int kFDs = kBacklog * 3;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  std::vector<FileDescriptor> clients;
+  for (int i = 0; i < kFDs; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret != 0) {
+      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+    }
+    clients.push_back(std::move(client));
   }
   for (int i = 0; i < kAcceptCount; i++) {
     auto accepted =
         ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
   }
-  // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked
-  // before function end.
-  // ds.reset();
+}
+
+void TestListenWhileConnect(const TestParam& param,
+                            void (*stopListen)(FileDescriptor&)) {
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  constexpr int kBacklog = 2;
+  constexpr int kClients = kBacklog + 1;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  std::vector<FileDescriptor> clients;
+  for (int i = 0; i < kClients; i++) {
+    FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret != 0) {
+      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+      clients.push_back(std::move(client));
+    }
+  }
+
+  stopListen(listen_fd);
+
+  for (auto& client : clients) {
+    const int kTimeout = 10000;
+    struct pollfd pfd = {
+        .fd = client.get(),
+        .events = POLLIN,
+    };
+    // When the listening socket is closed, then we expect the remote to reset
+    // the connection.
+    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR);
+    char c;
+    // Subsequent read can fail with:
+    // ECONNRESET: If the client connection was established and was reset by the
+    // remote.
+    // ECONNREFUSED: If the client connection failed to be established.
+    ASSERT_THAT(read(client.get(), &c, sizeof(c)),
+                AnyOf(SyscallFailsWithErrno(ECONNRESET),
+                      SyscallFailsWithErrno(ECONNREFUSED)));
+  }
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
+  TestListenWhileConnect(GetParam(), [](FileDescriptor& f) {
+    ASSERT_THAT(close(f.release()), SyscallSucceeds());
+  });
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) {
+  TestListenWhileConnect(GetParam(), [](FileDescriptor& f) {
+    ASSERT_THAT(shutdown(f.get(), SHUT_RD), SyscallSucceeds());
+  });
 }
 
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
@@ -605,15 +776,23 @@ TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose_NoRandomSave) {
                   &conn_addrlen),
       SyscallSucceeds());
 
-  constexpr int kTCPLingerTimeout = 5;
-  EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
-                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
-              SyscallSucceedsWithValue(0));
+  // Disable cooperative saves after this point as TCP timers are not restored
+  // across a S/R.
+  {
+    DisableSave ds;
+    constexpr int kTCPLingerTimeout = 5;
+    EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+                           &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+                SyscallSucceedsWithValue(0));
 
-  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
-  conn_fd.reset();
+    // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+    conn_fd.reset();
+
+    absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
 
-  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+    // ds going out of scope will Re-enable S/R's since at this point the timer
+    // must have fired and cleaned up the endpoint.
+  }
 
   // Now bind and connect a new socket and verify that we can immediately
   // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
@@ -1082,6 +1261,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
               if (connects_received >= kConnectAttempts) {
                 // Another thread have shutdown our read side causing the
                 // accept to fail.
+                ASSERT_EQ(errno, EINVAL);
                 break;
               }
               ASSERT_NO_ERRNO(fd);
@@ -1149,7 +1329,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
                 EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
 }
 
-TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -1262,7 +1442,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
                 EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
 }
 
-TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -1647,10 +1827,10 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
     // Connect to bind an ephemeral port.
     const FileDescriptor connected_fd =
         ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
-    ASSERT_THAT(
-        connect(connected_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
-                bound_addr_len),
-        SyscallSucceeds());
+    ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+                                    reinterpret_cast<sockaddr*>(&bound_addr),
+                                    bound_addr_len),
+                SyscallSucceeds());
 
     // Get the ephemeral port.
     sockaddr_storage connected_addr = {};
@@ -1753,8 +1933,9 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReservedReuseAddr) {
   ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
-  ASSERT_THAT(connect(connected_fd.get(),
-                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+  ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&bound_addr),
+                                  bound_addr_len),
               SyscallSucceeds());
 
   // Get the ephemeral port.
@@ -1814,10 +1995,10 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
     // Connect to bind an ephemeral port.
     const FileDescriptor connected_fd =
         ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
-    ASSERT_THAT(
-        connect(connected_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
-                bound_addr_len),
-        SyscallSucceeds());
+    ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+                                    reinterpret_cast<sockaddr*>(&bound_addr),
+                                    bound_addr_len),
+                SyscallSucceeds());
 
     // Get the ephemeral port.
     sockaddr_storage connected_addr = {};
@@ -1944,8 +2125,9 @@ TEST_P(SocketMultiProtocolInetLoopbackTest,
   ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
-  ASSERT_THAT(connect(connected_fd.get(),
-                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+  ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&bound_addr),
+                                  bound_addr_len),
               SyscallSucceeds());
 
   // Get the ephemeral port.
@@ -2005,10 +2187,10 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
     // Connect to bind an ephemeral port.
     const FileDescriptor connected_fd =
         ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
-    ASSERT_THAT(
-        connect(connected_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
-                bound_addr_len),
-        SyscallSucceeds());
+    ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+                                    reinterpret_cast<sockaddr*>(&bound_addr),
+                                    bound_addr_len),
+                SyscallSucceeds());
 
     // Get the ephemeral port.
     sockaddr_storage connected_addr = {};
@@ -2138,8 +2320,9 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
 
-  ASSERT_THAT(connect(connected_fd.get(),
-                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+  ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&bound_addr),
+                                  bound_addr_len),
               SyscallSucceeds());
 
   // Get the ephemeral port.
@@ -2204,7 +2387,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) {
           setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &portreuse2, sizeof(int)),
           SyscallSucceeds());
 
-      std::cout << portreuse1 << " " << portreuse2;
+      std::cout << portreuse1 << " " << portreuse2 << std::endl;
       int ret = bind(fd2, reinterpret_cast<sockaddr*>(&addr), addrlen);
 
       // Verify that two sockets can be bound to the same port only if
diff --git a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
new file mode 100644
index 000000000..2324c7f6a
--- /dev/null
+++ b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
@@ -0,0 +1,171 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <string.h>
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::Gt;
+
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
+  switch (family) {
+    case AF_INET:
+      return static_cast<uint16_t>(
+          reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
+    case AF_INET6:
+      return static_cast<uint16_t>(
+          reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+}
+
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
+  switch (family) {
+    case AF_INET:
+      reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
+      return NoError();
+    case AF_INET6:
+      reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port;
+      return NoError();
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+}
+
+struct TestParam {
+  TestAddress listener;
+  TestAddress connector;
+};
+
+std::string DescribeTestParam(::testing::TestParamInfo<TestParam> const& info) {
+  return absl::StrCat("Listen", info.param.listener.description, "_Connect",
+                      info.param.connector.description);
+}
+
+using SocketInetLoopbackTest = ::testing::TestWithParam<TestParam>;
+
+// This test verifies that connect returns EADDRNOTAVAIL if all local ephemeral
+// ports are already in use for a given destination ip/port.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(SocketInetLoopbackTest, TestTCPPortExhaustion_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  constexpr int kBacklog = 10;
+  constexpr int kClients = 65536;
+
+  // Create the listening socket.
+  auto listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Disable cooperative S/R as we are making too many syscalls.
+  DisableSave ds;
+
+  // Now we keep opening connections till we run out of local ephemeral ports.
+  // and assert the error we get back.
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  std::vector<FileDescriptor> clients;
+  std::vector<FileDescriptor> servers;
+
+  for (int i = 0; i < kClients; i++) {
+    FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret == 0) {
+      clients.push_back(std::move(client));
+      FileDescriptor server =
+          ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+      servers.push_back(std::move(server));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EADDRNOTAVAIL));
+    break;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    All, SocketInetLoopbackTest,
+    ::testing::Values(
+        // Listeners bound to IPv4 addresses refuse connections using IPv6
+        // addresses.
+        TestParam{V4Any(), V4Any()}, TestParam{V4Any(), V4Loopback()},
+        TestParam{V4Any(), V4MappedAny()},
+        TestParam{V4Any(), V4MappedLoopback()},
+        TestParam{V4Loopback(), V4Any()}, TestParam{V4Loopback(), V4Loopback()},
+        TestParam{V4Loopback(), V4MappedLoopback()},
+        TestParam{V4MappedAny(), V4Any()},
+        TestParam{V4MappedAny(), V4Loopback()},
+        TestParam{V4MappedAny(), V4MappedAny()},
+        TestParam{V4MappedAny(), V4MappedLoopback()},
+        TestParam{V4MappedLoopback(), V4Any()},
+        TestParam{V4MappedLoopback(), V4Loopback()},
+        TestParam{V4MappedLoopback(), V4MappedLoopback()},
+
+        // Listeners bound to IN6ADDR_ANY accept all connections.
+        TestParam{V6Any(), V4Any()}, TestParam{V6Any(), V4Loopback()},
+        TestParam{V6Any(), V4MappedAny()},
+        TestParam{V6Any(), V4MappedLoopback()}, TestParam{V6Any(), V6Any()},
+        TestParam{V6Any(), V6Loopback()},
+
+        // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4
+        // addresses.
+        TestParam{V6Loopback(), V6Any()},
+        TestParam{V6Loopback(), V6Loopback()}),
+    DescribeTestParam);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 27779e47c..fa81845fd 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -876,6 +876,51 @@ TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
   EXPECT_EQ(get, kAbove);
 }
 
+TEST_P(TCPSocketPairTest, SetTCPWindowClampBelowMinRcvBufConnectedSocket) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  // Discover minimum receive buf by setting a really low value
+  // for the receive buffer.
+  constexpr int kZero = 0;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &kZero,
+                         sizeof(kZero)),
+              SyscallSucceeds());
+
+  // Now retrieve the minimum value for SO_RCVBUF as the set above should
+  // have caused SO_RCVBUF for the socket to be set to the minimum.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  int min_so_rcvbuf = get;
+
+  {
+    // Setting TCP_WINDOW_CLAMP to zero for a connected socket is not permitted.
+    constexpr int kZero = 0;
+    EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+                           &kZero, sizeof(kZero)),
+                SyscallFailsWithErrno(EINVAL));
+
+    // Non-zero clamp values below MIN_SO_RCVBUF/2 should result in the clamp
+    // being set to MIN_SO_RCVBUF/2.
+    int below_half_min_so_rcvbuf = min_so_rcvbuf / 2 - 1;
+    EXPECT_THAT(
+        setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+                   &below_half_min_so_rcvbuf, sizeof(below_half_min_so_rcvbuf)),
+        SyscallSucceeds());
+
+    int get = -1;
+    socklen_t get_len = sizeof(get);
+
+    ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+                           &get, &get_len),
+                SyscallSucceedsWithValue(0));
+    EXPECT_EQ(get_len, sizeof(get));
+    EXPECT_EQ(min_so_rcvbuf / 2, get);
+  }
+}
+
 TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
   DisableSave ds;  // Too many syscalls.
   constexpr int kThreadCount = 1000;
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index db5663ecd..1c533fdf2 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -14,6 +14,7 @@
 
 #include "test/syscalls/linux/socket_ip_udp_generic.h"
 
+#include <errno.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -209,46 +210,6 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
-// Ensure that Receiving TOS is off by default.
-TEST_P(UDPSocketPairTest, RecvTosDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-}
-
-// Test that setting and getting IP_RECVTOS works as expected.
-TEST_P(UDPSocketPairTest, SetRecvTos) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOff, sizeof(kSockOptOff)),
-              SyscallSucceeds());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOn, sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOn);
-}
-
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -401,5 +362,97 @@ TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
   EXPECT_EQ(get_len, sizeof(get));
 }
 
+// Holds TOS or TClass information for IPv4 or IPv6 respectively.
+struct RecvTosOption {
+  int level;
+  int option;
+};
+
+RecvTosOption GetRecvTosOption(int domain) {
+  TEST_CHECK(domain == AF_INET || domain == AF_INET6);
+  RecvTosOption opt;
+  switch (domain) {
+    case AF_INET:
+      opt.level = IPPROTO_IP;
+      opt.option = IP_RECVTOS;
+      break;
+    case AF_INET6:
+      opt.level = IPPROTO_IPV6;
+      opt.option = IPV6_RECVTCLASS;
+      break;
+  }
+  return opt;
+}
+
+// Ensure that Receiving TOS or TCLASS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS or IPV6_RECVTCLASS works as
+// expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
+// Test that any socket (including IPv6 only) accepts the IPv4 TOS option: this
+// mirrors behavior in linux.
+TEST_P(UDPSocketPairTest, TOSRecvMismatch) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(AF_INET);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+}
+
+// Test that an IPv4 socket does not support the IPv6 TClass option.
+TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
+  // This should only test AF_INET sockets for the mismatch behavior.
+  SKIP_IF(GetParam().domain != AF_INET);
+
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IPV6, IPV6_RECVTCLASS,
+                         &get, &get_len),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index bc4b07a62..1294d9050 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -2129,6 +2129,39 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
               SyscallSucceedsWithValue(kMessageSize));
 }
 
+// Check that connect returns EADDRNOTAVAIL when out of local ephemeral ports.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(IPv4UDPUnboundSocketTest, UDPConnectPortExhaustion_NoRandomSave) {
+  auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  constexpr int kClients = 65536;
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(receiver1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(receiver1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Disable cooperative S/R as we are making too many syscalls.
+  DisableSave ds;
+  std::vector<std::unique_ptr<FileDescriptor>> sockets;
+  for (int i = 0; i < kClients; i++) {
+    auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+    int ret = connect(s->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                      addr.addr_len);
+    if (ret == 0) {
+      sockets.push_back(std::move(s));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EAGAIN));
+    break;
+  }
+}
+
 // Test that socket will receive packet info control message.
 TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
   // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 40e673625..d690d9564 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -45,37 +45,31 @@ void IPv4UDPUnboundExternalNetworkingSocketTest::SetUp() {
   got_if_infos_ = false;
 
   // Get interface list.
-  std::vector<std::string> if_names;
   ASSERT_NO_ERRNO(if_helper_.Load());
-  if_names = if_helper_.InterfaceList(AF_INET);
+  std::vector<std::string> if_names = if_helper_.InterfaceList(AF_INET);
   if (if_names.size() != 2) {
     return;
   }
 
   // Figure out which interface is where.
-  int lo = 0, eth = 1;
-  if (if_names[lo] != "lo") {
-    lo = 1;
-    eth = 0;
-  }
-
-  if (if_names[lo] != "lo") {
-    return;
-  }
-
-  lo_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(if_names[lo]));
-  lo_if_addr_ = if_helper_.GetAddr(AF_INET, if_names[lo]);
-  if (lo_if_addr_ == nullptr) {
+  std::string lo = if_names[0];
+  std::string eth = if_names[1];
+  if (lo != "lo") std::swap(lo, eth);
+  if (lo != "lo") return;
+
+  lo_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(lo));
+  auto lo_if_addr = if_helper_.GetAddr(AF_INET, lo);
+  if (lo_if_addr == nullptr) {
     return;
   }
-  lo_if_sin_addr_ = reinterpret_cast<sockaddr_in*>(lo_if_addr_)->sin_addr;
+  lo_if_addr_ = *reinterpret_cast<const sockaddr_in*>(lo_if_addr);
 
-  eth_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(if_names[eth]));
-  eth_if_addr_ = if_helper_.GetAddr(AF_INET, if_names[eth]);
-  if (eth_if_addr_ == nullptr) {
+  eth_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(eth));
+  auto eth_if_addr = if_helper_.GetAddr(AF_INET, eth);
+  if (eth_if_addr == nullptr) {
     return;
   }
-  eth_if_sin_addr_ = reinterpret_cast<sockaddr_in*>(eth_if_addr_)->sin_addr;
+  eth_if_addr_ = *reinterpret_cast<const sockaddr_in*>(eth_if_addr);
 
   got_if_infos_ = true;
 }
@@ -242,7 +236,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   // Bind the non-receiving socket to the unicast ethernet address.
   auto norecv_addr = rcv1_addr;
   reinterpret_cast<sockaddr_in*>(&norecv_addr.addr)->sin_addr =
-      eth_if_sin_addr_;
+      eth_if_addr_.sin_addr;
   ASSERT_THAT(bind(norcv->get(), reinterpret_cast<sockaddr*>(&norecv_addr.addr),
                    norecv_addr.addr_len),
               SyscallSucceedsWithValue(0));
@@ -1028,7 +1022,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   ip_mreqn iface = {};
   iface.imr_ifindex = lo_if_idx_;
-  iface.imr_address = eth_if_sin_addr_;
+  iface.imr_address = eth_if_addr_.sin_addr;
   ASSERT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
                          sizeof(iface)),
               SyscallSucceeds());
@@ -1058,7 +1052,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   SKIP_IF(IsRunningOnGvisor());
 
   // Verify the received source address.
-  EXPECT_EQ(eth_if_sin_addr_.s_addr, src_addr_in->sin_addr.s_addr);
+  EXPECT_EQ(eth_if_addr_.sin_addr.s_addr, src_addr_in->sin_addr.s_addr);
 }
 
 // Check that when we are bound to one interface we can set IP_MULTICAST_IF to
@@ -1075,7 +1069,8 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
 
   // Create sender and bind to eth interface.
   auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-  ASSERT_THAT(bind(sender->get(), eth_if_addr_, sizeof(sockaddr_in)),
+  ASSERT_THAT(bind(sender->get(), reinterpret_cast<sockaddr*>(&eth_if_addr_),
+                   sizeof(eth_if_addr_)),
               SyscallSucceeds());
 
   // Run through all possible combinations of index and address for
@@ -1085,9 +1080,9 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
     struct in_addr imr_address;
   } test_data[] = {
       {lo_if_idx_, {}},
-      {0, lo_if_sin_addr_},
-      {lo_if_idx_, lo_if_sin_addr_},
-      {lo_if_idx_, eth_if_sin_addr_},
+      {0, lo_if_addr_.sin_addr},
+      {lo_if_idx_, lo_if_addr_.sin_addr},
+      {lo_if_idx_, eth_if_addr_.sin_addr},
   };
   for (auto t : test_data) {
     ip_mreqn iface = {};
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
index bec2e96ee..10b90b1e0 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
@@ -36,10 +36,8 @@ class IPv4UDPUnboundExternalNetworkingSocketTest : public SimpleSocketTest {
   // Interface infos.
   int lo_if_idx_;
   int eth_if_idx_;
-  sockaddr* lo_if_addr_;
-  sockaddr* eth_if_addr_;
-  in_addr lo_if_sin_addr_;
-  in_addr eth_if_sin_addr_;
+  sockaddr_in lo_if_addr_;
+  sockaddr_in eth_if_addr_;
 };
 
 }  // namespace testing
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index e5aed1eec..fbe61c5a0 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -26,7 +26,7 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
-#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/capability_util.h"
@@ -118,24 +118,6 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
   // TODO(mpratt): Check ifinfomsg contents and following attrs.
 }
 
-PosixError DumpLinks(
-    const FileDescriptor& fd, uint32_t seq,
-    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
-  struct request {
-    struct nlmsghdr hdr;
-    struct ifinfomsg ifm;
-  };
-
-  struct request req = {};
-  req.hdr.nlmsg_len = sizeof(req);
-  req.hdr.nlmsg_type = RTM_GETLINK;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
-  req.hdr.nlmsg_seq = seq;
-  req.ifm.ifi_family = AF_UNSPEC;
-
-  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
-}
-
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -152,7 +134,7 @@ TEST(NetlinkRouteTest, GetLinkDump) {
     const struct ifinfomsg* msg =
         reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
     std::cout << "Found interface idx=" << msg->ifi_index
-              << ", type=" << std::hex << msg->ifi_type;
+              << ", type=" << std::hex << msg->ifi_type << std::endl;
     if (msg->ifi_type == ARPHRD_LOOPBACK) {
       loopbackFound = true;
       EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
@@ -161,37 +143,6 @@ TEST(NetlinkRouteTest, GetLinkDump) {
   EXPECT_TRUE(loopbackFound);
 }
 
-struct Link {
-  int index;
-  std::string name;
-};
-
-PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
-  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
-
-  absl::optional<Link> link;
-  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
-    if (hdr->nlmsg_type != RTM_NEWLINK ||
-        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
-      return;
-    }
-    const struct ifinfomsg* msg =
-        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
-    if (msg->ifi_type == ARPHRD_LOOPBACK) {
-      const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
-      if (rta == nullptr) {
-        // Ignore links that do not have a name.
-        return;
-      }
-
-      link = Link();
-      link->index = msg->ifi_index;
-      link->name = std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
-    }
-  }));
-  return link;
-}
-
 // CheckLinkMsg checks a netlink message against an expected link.
 void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
   ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
@@ -209,9 +160,7 @@ void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
 }
 
 TEST(NetlinkRouteTest, GetLinkByIndex) {
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -227,13 +176,13 @@ TEST(NetlinkRouteTest, GetLinkByIndex) {
   req.hdr.nlmsg_flags = NLM_F_REQUEST;
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
-  req.ifm.ifi_index = loopback_link->index;
+  req.ifm.ifi_index = loopback_link.index;
 
   bool found = false;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckLinkMsg(hdr, *loopback_link);
+        CheckLinkMsg(hdr, loopback_link);
         found = true;
       },
       false));
@@ -241,9 +190,7 @@ TEST(NetlinkRouteTest, GetLinkByIndex) {
 }
 
 TEST(NetlinkRouteTest, GetLinkByName) {
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -262,8 +209,8 @@ TEST(NetlinkRouteTest, GetLinkByName) {
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
   req.rtattr.rta_type = IFLA_IFNAME;
-  req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1);
-  strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname));
+  req.rtattr.rta_len = RTA_LENGTH(loopback_link.name.size() + 1);
+  strncpy(req.ifname, loopback_link.name.c_str(), sizeof(req.ifname));
   req.hdr.nlmsg_len =
       NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
 
@@ -271,7 +218,7 @@ TEST(NetlinkRouteTest, GetLinkByName) {
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckLinkMsg(hdr, *loopback_link);
+        CheckLinkMsg(hdr, loopback_link);
         found = true;
       },
       false));
@@ -523,9 +470,7 @@ TEST(NetlinkRouteTest, LookupAll) {
 TEST(NetlinkRouteTest, AddAddr) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
 
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -545,7 +490,7 @@ TEST(NetlinkRouteTest, AddAddr) {
   req.ifa.ifa_prefixlen = 24;
   req.ifa.ifa_flags = 0;
   req.ifa.ifa_scope = 0;
-  req.ifa.ifa_index = loopback_link->index;
+  req.ifa.ifa_index = loopback_link.index;
   req.rtattr.rta_type = IFA_LOCAL;
   req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
   inet_pton(AF_INET, "10.0.0.1", &req.addr);
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
new file mode 100644
index 000000000..bde1dbb4d
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -0,0 +1,162 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+
+#include <linux/if.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr uint32_t kSeq = 12345;
+
+}  // namespace
+
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
+PosixErrorOr<std::vector<Link>> DumpLinks() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  std::vector<Link> links;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+    if (rta == nullptr) {
+      // Ignore links that do not have a name.
+      return;
+    }
+
+    links.emplace_back();
+    links.back().index = msg->ifi_index;
+    links.back().type = msg->ifi_type;
+    links.back().name =
+        std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+  }));
+  return links;
+}
+
+PosixErrorOr<Link> LoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  for (const auto& link : links) {
+    if (link.type == ARPHRD_LOOPBACK) {
+      return link;
+    }
+  }
+  return PosixError(ENOENT, "loopback link not found");
+}
+
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifaddr;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifaddr));
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifaddr.ifa_index = index;
+  req.ifaddr.ifa_family = family;
+  req.ifaddr.ifa_prefixlen = prefixlen;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFA_LOCAL;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char pad[NLMSG_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+  req.ifinfo.ifi_flags = flags;
+  req.ifinfo.ifi_change = change;
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFLA_ADDRESS;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
new file mode 100644
index 000000000..149c4a7f6
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -0,0 +1,55 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+struct Link {
+  int index;
+  int16_t type;
+  std::string name;
+};
+
+PosixError DumpLinks(const FileDescriptor& fd, uint32_t seq,
+                     const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
+PosixErrorOr<std::vector<Link>> DumpLinks();
+
+// Returns the loopback link on the system. ENOENT if not found.
+PosixErrorOr<Link> LoopbackLink();
+
+// LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen);
+
+// LinkChangeFlags changes interface flags. E.g. IFF_UP.
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change);
+
+// LinkSetMacAddr sets IFLA_ADDRESS attribute of the interface.
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen);
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index 5d3a39868..53b678e94 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -364,11 +364,6 @@ CreateTCPConnectAcceptSocketPair(int bound, int connected, int type,
   }
   MaybeSave();  // Successful accept.
 
-  // FIXME(b/110484944)
-  if (connect_result == -1) {
-    absl::SleepFor(absl::Seconds(1));
-  }
-
   T extra_addr = {};
   LocalhostAddr(&extra_addr, dual_stack);
   return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 4cf1f76f1..591cab3fd 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -256,8 +256,9 @@ TEST_P(UnixSocketPairTest, ShutdownWrite) {
 }
 
 TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) {
-  // TODO(b/122310852): We should be returning ENXIO and NOT EIO.
-  SKIP_IF(IsRunningOnGvisor());
+  // TODO(gvisor.dev/issue/1624): In VFS1, we return EIO instead of ENXIO (see
+  // b/122310852). Remove this skip once VFS1 is deleted.
+  SKIP_IF(IsRunningWithVFS1());
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   // Opening a socket pair via /proc/self/fd/X is a ENXIO.
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index faa1247f6..08fc4b1b7 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/eventfd.h>
 #include <sys/resource.h>
 #include <sys/sendfile.h>
@@ -429,6 +430,55 @@ TEST(SpliceTest, TwoPipes) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0);
 }
 
+TEST(SpliceTest, TwoPipesCircular) {
+  // This test deadlocks the sentry on VFS1 because VFS1 splice ordering is
+  // based on fs.File.UniqueID, which does not prevent circular ordering between
+  // e.g. inode-level locks taken by fs.FileOperations.
+  SKIP_IF(IsRunningWithVFS1());
+
+  // Create two pipes.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor first_rfd(fds[0]);
+  const FileDescriptor first_wfd(fds[1]);
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor second_rfd(fds[0]);
+  const FileDescriptor second_wfd(fds[1]);
+
+  // On Linux, each pipe is normally limited to
+  // include/linux/pipe_fs_i.h:PIPE_DEF_BUFFERS buffers worth of data.
+  constexpr size_t PIPE_DEF_BUFFERS = 16;
+
+  // Write some data to each pipe. Below we splice 1 byte at a time between
+  // pipes, which very quickly causes each byte to be stored in a separate
+  // buffer, so we must ensure that the total amount of data in the system is <=
+  // PIPE_DEF_BUFFERS bytes.
+  std::vector<char> buf(PIPE_DEF_BUFFERS / 2);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(first_wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+  ASSERT_THAT(write(second_wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  // Have another thread splice from the second pipe to the first, while we
+  // splice from the first to the second. The test passes if this does not
+  // deadlock.
+  const int kIterations = 1000;
+  DisableSave ds;
+  ScopedThread t([&]() {
+    for (int i = 0; i < kIterations; i++) {
+      ASSERT_THAT(
+          splice(second_rfd.get(), nullptr, first_wfd.get(), nullptr, 1, 0),
+          SyscallSucceedsWithValue(1));
+    }
+  });
+  for (int i = 0; i < kIterations; i++) {
+    ASSERT_THAT(
+        splice(first_rfd.get(), nullptr, second_wfd.get(), nullptr, 1, 0),
+        SyscallSucceedsWithValue(1));
+  }
+}
+
 TEST(SpliceTest, Blocking) {
   // Create two new pipes.
   int first[2], second[2];
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index c951ac3b3..2503960f3 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -34,6 +34,13 @@
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 
+#ifndef AT_STATX_FORCE_SYNC
+#define AT_STATX_FORCE_SYNC 0x2000
+#endif
+#ifndef AT_STATX_DONT_SYNC
+#define AT_STATX_DONT_SYNC 0x4000
+#endif
+
 namespace gvisor {
 namespace testing {
 
@@ -607,7 +614,7 @@ int statx(int dirfd, const char* pathname, int flags, unsigned int mask,
 }
 
 TEST_F(StatTest, StatxAbsPath) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   struct kernel_statx stx;
@@ -617,7 +624,7 @@ TEST_F(StatTest, StatxAbsPath) {
 }
 
 TEST_F(StatTest, StatxRelPathDirFD) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   struct kernel_statx stx;
@@ -631,7 +638,7 @@ TEST_F(StatTest, StatxRelPathDirFD) {
 }
 
 TEST_F(StatTest, StatxRelPathCwd) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
@@ -643,7 +650,7 @@ TEST_F(StatTest, StatxRelPathCwd) {
 }
 
 TEST_F(StatTest, StatxEmptyPath) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
@@ -653,6 +660,60 @@ TEST_F(StatTest, StatxEmptyPath) {
   EXPECT_TRUE(S_ISREG(stx.stx_mode));
 }
 
+TEST_F(StatTest, StatxDoesNotRejectExtraneousMaskBits) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  // Set all mask bits except for STATX__RESERVED.
+  uint mask = 0xffffffff & ~0x80000000;
+  EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, mask, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxRejectsReservedMaskBit) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  // Set STATX__RESERVED in the mask.
+  EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, 0x80000000, &stx),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(StatTest, StatxSymlink) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  std::string parent_dir = "/tmp";
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(parent_dir, test_file_name_));
+  std::string p = link.path();
+
+  struct kernel_statx stx;
+  EXPECT_THAT(statx(AT_FDCWD, p.c_str(), AT_SYMLINK_NOFOLLOW, STATX_ALL, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISLNK(stx.stx_mode));
+  EXPECT_THAT(statx(AT_FDCWD, p.c_str(), 0, STATX_ALL, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxInvalidFlags) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx),
+              SyscallFailsWithErrno(EINVAL));
+
+  // Sync flags are mutually exclusive.
+  EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(),
+                    AT_STATX_FORCE_SYNC | AT_STATX_DONT_SYNC, 0, &stx),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc
index 7e73325bf..92eec0449 100644
--- a/test/syscalls/linux/sticky.cc
+++ b/test/syscalls/linux/sticky.cc
@@ -42,8 +42,9 @@ TEST(StickyTest, StickyBitPermDenied) {
 
   auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
-  std::string path = JoinPath(dir.path(), "NewDir");
-  ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY));
+  ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds());
 
   // Drop privileges and change IDs only in child thread, or else this parent
   // thread won't be able to open some log files after the test ends.
@@ -61,7 +62,8 @@ TEST(StickyTest, StickyBitPermDenied) {
         syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid), -1),
         SyscallSucceeds());
 
-    EXPECT_THAT(rmdir(path.c_str()), SyscallFailsWithErrno(EPERM));
+    EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR),
+                SyscallFailsWithErrno(EPERM));
   });
 }
 
@@ -96,8 +98,9 @@ TEST(StickyTest, StickyBitCapFOWNER) {
 
   auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
-  std::string path = JoinPath(dir.path(), "NewDir");
-  ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY));
+  ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds());
 
   // Drop privileges and change IDs only in child thread, or else this parent
   // thread won't be able to open some log files after the test ends.
@@ -114,7 +117,8 @@ TEST(StickyTest, StickyBitCapFOWNER) {
         SyscallSucceeds());
 
     EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, true));
-    EXPECT_THAT(rmdir(path.c_str()), SyscallSucceeds());
+    EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR),
+                SyscallSucceeds());
   });
 }
 }  // namespace
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
index 03ee1250d..a17ff62e9 100644
--- a/test/syscalls/linux/symlink.cc
+++ b/test/syscalls/linux/symlink.cc
@@ -20,6 +20,7 @@
 #include <string>
 
 #include "gtest/gtest.h"
+#include "absl/time/clock.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
@@ -272,6 +273,30 @@ TEST(SymlinkTest, ChmodSymlink) {
   EXPECT_EQ(FilePermission(newpath), 0777);
 }
 
+// Test that following a symlink updates the atime on the symlink.
+TEST(SymlinkTest, FollowUpdatesATime) {
+  const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const std::string link = NewTempAbsPath();
+  EXPECT_THAT(symlink(file.path().c_str(), link.c_str()), SyscallSucceeds());
+
+  // Lstat the symlink.
+  struct stat st_before_follow;
+  ASSERT_THAT(lstat(link.c_str(), &st_before_follow), SyscallSucceeds());
+
+  // Let the clock advance.
+  absl::SleepFor(absl::Seconds(1));
+
+  // Open the file via the symlink.
+  int fd;
+  ASSERT_THAT(fd = open(link.c_str(), O_RDWR, 0666), SyscallSucceeds());
+  FileDescriptor fd_closer(fd);
+
+  // Lstat the symlink again, and check that atime is updated.
+  struct stat st_after_follow;
+  ASSERT_THAT(lstat(link.c_str(), &st_after_follow), SyscallSucceeds());
+  EXPECT_LT(st_before_follow.st_atime, st_after_follow.st_atime);
+}
+
 class ParamSymlinkTest : public ::testing::TestWithParam<std::string> {};
 
 // Test that creating an existing symlink with creat will create the target.
diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc
index 819fa655a..19ffbd85b 100644
--- a/test/syscalls/linux/sysret.cc
+++ b/test/syscalls/linux/sysret.cc
@@ -14,6 +14,8 @@
 
 // Tests to verify that the behavior of linux and gvisor matches when
 // 'sysret' returns to bad (aka non-canonical) %rip or %rsp.
+
+#include <linux/elf.h>
 #include <sys/ptrace.h>
 #include <sys/user.h>
 
@@ -32,6 +34,7 @@ constexpr uint64_t kNonCanonicalRsp = 0xFFFF000000000000;
 class SysretTest : public ::testing::Test {
  protected:
   struct user_regs_struct regs_;
+  struct iovec iov;
   pid_t child_;
 
   void SetUp() override {
@@ -48,10 +51,15 @@ class SysretTest : public ::testing::Test {
 
     // Parent.
     int status;
+    memset(&iov, 0, sizeof(iov));
     ASSERT_THAT(pid, SyscallSucceeds());  // Might still be < 0.
     ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
     EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, pid, 0, &regs_), SyscallSucceeds());
+
+    iov.iov_base = &regs_;
+    iov.iov_len = sizeof(regs_);
+    ASSERT_THAT(ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
 
     child_ = pid;
   }
@@ -61,13 +69,27 @@ class SysretTest : public ::testing::Test {
   }
 
   void SetRip(uint64_t newrip) {
+#if defined(__x86_64__)
     regs_.rip = newrip;
-    ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
+#elif defined(__aarch64__)
+    regs_.pc = newrip;
+#else
+#error "Unknown architecture"
+#endif
+    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
   }
 
   void SetRsp(uint64_t newrsp) {
+#if defined(__x86_64__)
     regs_.rsp = newrsp;
-    ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
+#elif defined(__aarch64__)
+    regs_.sp = newrsp;
+#else
+#error "Unknown architecture"
+#endif
+    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
   }
 
   // Wait waits for the child pid and returns the exit status.
@@ -104,8 +126,15 @@ TEST_F(SysretTest, BadRsp) {
   SetRsp(kNonCanonicalRsp);
   Detach();
   int status = Wait();
+#if defined(__x86_64__)
   EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGBUS)
       << "status = " << status;
+#elif defined(__aarch64__)
+  EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
+      << "status = " << status;
+#else
+#error "Unknown architecture"
+#endif
 }
 }  // namespace
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index c4591a3b9..a4d2953e1 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -143,6 +143,20 @@ TEST_P(TcpSocketTest, ConnectOnEstablishedConnection) {
       SyscallFailsWithErrno(EISCONN));
 }
 
+TEST_P(TcpSocketTest, ShutdownWriteInTimeWait) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(s_, SHUT_RDWR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter TIME_WAIT.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(TcpSocketTest, ShutdownWriteInFinWait1) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter FIN-WAIT2.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+}
+
 TEST_P(TcpSocketTest, DataCoalesced) {
   char buf[10];
 
@@ -1299,7 +1313,7 @@ TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) {
   int get = -1;
   socklen_t get_len = sizeof(get);
   ASSERT_THAT(
-      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
       SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
   EXPECT_EQ(get, 0);
@@ -1312,7 +1326,7 @@ TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) {
   int get = -1;
   socklen_t get_len = sizeof(get);
   ASSERT_THAT(
-      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
       SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
   EXPECT_EQ(get, 0);
@@ -1349,6 +1363,202 @@ TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
               SyscallFailsWithErrno(ENOTCONN));
 }
 
+TEST_P(SimpleTcpSocketTest, TCPConnectSoRcvBufRace) {
+  auto s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  RetryEINTR(connect)(s.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      addrlen);
+  int buf_sz = 1 << 18;
+  EXPECT_THAT(
+      setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+      SyscallSucceedsWithValue(0));
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPSynCntLessThanOne) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  int default_syn_cnt = get;
+
+  {
+    // TCP_SYNCNT less than 1 should be rejected with an EINVAL.
+    constexpr int kZero = 0;
+    EXPECT_THAT(
+        setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kZero, sizeof(kZero)),
+        SyscallFailsWithErrno(EINVAL));
+
+    // TCP_SYNCNT less than 1 should be rejected with an EINVAL.
+    constexpr int kNeg = -1;
+    EXPECT_THAT(
+        setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kNeg, sizeof(kNeg)),
+        SyscallFailsWithErrno(EINVAL));
+
+    int get = -1;
+    socklen_t get_len = sizeof(get);
+
+    ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+                SyscallSucceedsWithValue(0));
+    EXPECT_EQ(get_len, sizeof(get));
+    EXPECT_EQ(default_syn_cnt, get);
+  }
+}
+
+TEST_P(SimpleTcpSocketTest, GetTCPSynCntDefault) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  constexpr int kDefaultSynCnt = 6;
+
+  ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kDefaultSynCnt);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPSynCntGreaterThanOne) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  constexpr int kTCPSynCnt = 20;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kTCPSynCnt,
+                         sizeof(kTCPSynCnt)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPSynCnt);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPSynCntAboveMax) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  int default_syn_cnt = get;
+  {
+    constexpr int kTCPSynCnt = 256;
+    ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kTCPSynCnt,
+                           sizeof(kTCPSynCnt)),
+                SyscallFailsWithErrno(EINVAL));
+
+    int get = -1;
+    socklen_t get_len = sizeof(get);
+    ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+                SyscallSucceeds());
+    EXPECT_EQ(get_len, sizeof(get));
+    EXPECT_EQ(get, default_syn_cnt);
+  }
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPWindowClampBelowMinRcvBuf) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Discover minimum receive buf by setting a really low value
+  // for the receive buffer.
+  constexpr int kZero = 0;
+  EXPECT_THAT(setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &kZero, sizeof(kZero)),
+              SyscallSucceeds());
+
+  // Now retrieve the minimum value for SO_RCVBUF as the set above should
+  // have caused SO_RCVBUF for the socket to be set to the minimum.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  int min_so_rcvbuf = get;
+
+  {
+    // TCP_WINDOW_CLAMP less than min_so_rcvbuf/2 should be set to
+    // min_so_rcvbuf/2.
+    int below_half_min_rcvbuf = min_so_rcvbuf / 2 - 1;
+    EXPECT_THAT(
+        setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+                   &below_half_min_rcvbuf, sizeof(below_half_min_rcvbuf)),
+        SyscallSucceeds());
+
+    int get = -1;
+    socklen_t get_len = sizeof(get);
+
+    ASSERT_THAT(
+        getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len),
+        SyscallSucceedsWithValue(0));
+    EXPECT_EQ(get_len, sizeof(get));
+    EXPECT_EQ(min_so_rcvbuf / 2, get);
+  }
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPWindowClampZeroClosedSocket) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  constexpr int kZero = 0;
+  ASSERT_THAT(
+      setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &kZero, sizeof(kZero)),
+      SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kZero);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPWindowClampAboveHalfMinRcvBuf) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Discover minimum receive buf by setting a really low value
+  // for the receive buffer.
+  constexpr int kZero = 0;
+  EXPECT_THAT(setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &kZero, sizeof(kZero)),
+              SyscallSucceeds());
+
+  // Now retrieve the minimum value for SO_RCVBUF as the set above should
+  // have caused SO_RCVBUF for the socket to be set to the minimum.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  int min_so_rcvbuf = get;
+
+  {
+    int above_half_min_rcv_buf = min_so_rcvbuf / 2 + 1;
+    EXPECT_THAT(
+        setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+                   &above_half_min_rcv_buf, sizeof(above_half_min_rcv_buf)),
+        SyscallSucceeds());
+
+    int get = -1;
+    socklen_t get_len = sizeof(get);
+
+    ASSERT_THAT(
+        getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len),
+        SyscallSucceedsWithValue(0));
+    EXPECT_EQ(get_len, sizeof(get));
+    EXPECT_EQ(above_half_min_rcv_buf, get);
+  }
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index 1ccb95733..e75bba669 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -26,6 +26,7 @@ namespace {
 
 constexpr long kFudgeSeconds = 5;
 
+#if defined(__x86_64__) || defined(__i386__)
 // Mimics the time(2) wrapper from glibc prior to 2.15.
 time_t vsyscall_time(time_t* t) {
   constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
@@ -98,6 +99,7 @@ TEST(TimeTest, VsyscallGettimeofday_InvalidAddressSIGSEGV) {
                                     reinterpret_cast<struct timezone*>(0x1)),
               ::testing::KilledBySignal(SIGSEGV), "");
 }
+#endif
 
 }  // namespace
 
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 2f92c27da..4b3c44527 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -658,5 +658,5 @@ int main(int argc, char** argv) {
     }
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
new file mode 100644
index 000000000..6195b11e1
--- /dev/null
+++ b/test/syscalls/linux/tuntap.cc
@@ -0,0 +1,402 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_tun.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr int kIPLen = 4;
+
+constexpr const char kDevNetTun[] = "/dev/net/tun";
+constexpr const char kTapName[] = "tap0";
+
+constexpr const uint8_t kMacA[ETH_ALEN] = {0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA};
+constexpr const uint8_t kMacB[ETH_ALEN] = {0xBB, 0xBB, 0xBB, 0xBB, 0xBB, 0xBB};
+
+PosixErrorOr<std::set<std::string>> DumpLinkNames() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  std::set<std::string> names;
+  for (const auto& link : links) {
+    names.emplace(link.name);
+  }
+  return names;
+}
+
+PosixErrorOr<Link> GetLinkByName(const std::string& name) {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  for (const auto& link : links) {
+    if (link.name == name) {
+      return link;
+    }
+  }
+  return PosixError(ENOENT, "interface not found");
+}
+
+struct pihdr {
+  uint16_t pi_flags;
+  uint16_t pi_protocol;
+} __attribute__((packed));
+
+struct ping_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct iphdr ip;
+  struct icmphdr icmp;
+  char payload[64];
+} __attribute__((packed));
+
+ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                          const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  ping_pkt pkt = {};
+
+  pkt.pi.pi_protocol = htons(ETH_P_IP);
+
+  memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest));
+  memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source));
+  pkt.eth.h_proto = htons(ETH_P_IP);
+
+  pkt.ip.ihl = 5;
+  pkt.ip.version = 4;
+  pkt.ip.tos = 0;
+  pkt.ip.tot_len = htons(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+                         sizeof(pkt.payload));
+  pkt.ip.id = 1;
+  pkt.ip.frag_off = 1 << 6;  // Do not fragment
+  pkt.ip.ttl = 64;
+  pkt.ip.protocol = IPPROTO_ICMP;
+  inet_pton(AF_INET, dstip, &pkt.ip.daddr);
+  inet_pton(AF_INET, srcip, &pkt.ip.saddr);
+  pkt.ip.check = IPChecksum(pkt.ip);
+
+  pkt.icmp.type = ICMP_ECHO;
+  pkt.icmp.code = 0;
+  pkt.icmp.checksum = 0;
+  pkt.icmp.un.echo.sequence = 1;
+  pkt.icmp.un.echo.id = 1;
+
+  strncpy(pkt.payload, "abcd", sizeof(pkt.payload));
+  pkt.icmp.checksum = ICMPChecksum(pkt.icmp, pkt.payload, sizeof(pkt.payload));
+
+  return pkt;
+}
+
+struct arp_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct arphdr arp;
+  uint8_t arp_sha[ETH_ALEN];
+  uint8_t arp_spa[kIPLen];
+  uint8_t arp_tha[ETH_ALEN];
+  uint8_t arp_tpa[kIPLen];
+} __attribute__((packed));
+
+std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                            const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  std::string buffer;
+  buffer.resize(sizeof(arp_pkt));
+
+  arp_pkt* pkt = reinterpret_cast<arp_pkt*>(&buffer[0]);
+  {
+    pkt->pi.pi_protocol = htons(ETH_P_ARP);
+
+    memcpy(pkt->eth.h_dest, kMacA, sizeof(pkt->eth.h_dest));
+    memcpy(pkt->eth.h_source, kMacB, sizeof(pkt->eth.h_source));
+    pkt->eth.h_proto = htons(ETH_P_ARP);
+
+    pkt->arp.ar_hrd = htons(ARPHRD_ETHER);
+    pkt->arp.ar_pro = htons(ETH_P_IP);
+    pkt->arp.ar_hln = ETH_ALEN;
+    pkt->arp.ar_pln = kIPLen;
+    pkt->arp.ar_op = htons(ARPOP_REPLY);
+
+    memcpy(pkt->arp_sha, srcmac, sizeof(pkt->arp_sha));
+    inet_pton(AF_INET, srcip, pkt->arp_spa);
+    memcpy(pkt->arp_tha, dstmac, sizeof(pkt->arp_tha));
+    inet_pton(AF_INET, dstip, pkt->arp_tpa);
+  }
+  return buffer;
+}
+
+}  // namespace
+
+TEST(TuntapStaticTest, NetTunExists) {
+  struct stat statbuf;
+  ASSERT_THAT(stat(kDevNetTun, &statbuf), SyscallSucceeds());
+  // Check that it's a character device with rw-rw-rw- permissions.
+  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
+class TuntapTest : public ::testing::Test {
+ protected:
+  void TearDown() override {
+    if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))) {
+      // Bring back capability if we had dropped it in test case.
+      ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, true));
+    }
+  }
+};
+
+TEST_F(TuntapTest, CreateInterfaceNoCap) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, false));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  strncpy(ifr.ifr_name, kTapName, IFNAMSIZ);
+
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_F(TuntapTest, CreateFixedNameInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_get = {};
+  EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_expect = ifr_set;
+  // See __tun_chr_ioctl() in net/drivers/tun.c.
+  ifr_expect.ifr_flags |= IFF_NOFILTER;
+
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(kTapName)));
+  EXPECT_THAT(memcmp(&ifr_expect, &ifr_get, sizeof(ifr_get)), ::testing::Eq(0));
+}
+
+TEST_F(TuntapTest, CreateInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  // Empty ifr.ifr_name. Let kernel assign.
+
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_get = {};
+  EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+              SyscallSucceedsWithValue(0));
+
+  std::string ifname = ifr_get.ifr_name;
+  EXPECT_THAT(ifname, ::testing::StartsWith("tap"));
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(ifname)));
+}
+
+TEST_F(TuntapTest, InvalidReadWrite) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  char buf[128] = {};
+  EXPECT_THAT(read(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+  EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+}
+
+TEST_F(TuntapTest, WriteToDownDevice) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
+  SKIP_IF(IsRunningOnGvisor());
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  // Device created should be down by default.
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+  char buf[128] = {};
+  EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO));
+}
+
+PosixErrorOr<FileDescriptor> OpenAndAttachTap(
+    const std::string& dev_name, const std::string& dev_ipv4_addr) {
+  // Interface creation.
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, dev_name.c_str(), IFNAMSIZ);
+  if (ioctl(fd.get(), TUNSETIFF, &ifr_set) < 0) {
+    return PosixError(errno);
+  }
+
+  ASSIGN_OR_RETURN_ERRNO(auto link, GetLinkByName(dev_name));
+
+  // Interface setup.
+  struct in_addr addr;
+  inet_pton(AF_INET, dev_ipv4_addr.c_str(), &addr);
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(link.index, AF_INET, /*prefixlen=*/24, &addr,
+                                   sizeof(addr)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME(b/110961832): gVisor doesn't support setting MAC address on
+    // interfaces yet.
+    RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA)));
+
+    // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
+    RETURN_IF_ERRNO(LinkChangeFlags(link.index, IFF_UP, IFF_UP));
+  }
+
+  return fd;
+}
+
+// This test sets up a TAP device and pings kernel by sending ICMP echo request.
+//
+// It works as the following:
+// * Open /dev/net/tun, and create kTapName interface.
+// * Use rtnetlink to do initial setup of the interface:
+//   * Assign IP address 10.0.0.1/24 to kernel.
+//   * MAC address: kMacA
+//   * Bring up the interface.
+// * Send an ICMP echo reqest (ping) packet from 10.0.0.2 (kMacB) to kernel.
+// * Loop to receive packets from TAP device/fd:
+//   * If packet is an ICMP echo reply, it stops and passes the test.
+//   * If packet is an ARP request, it responds with canned reply and resends
+//   the
+//     ICMP request packet.
+TEST_F(TuntapTest, PingKernel) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
+  ping_pkt ping_req = CreatePingPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+  std::string arp_rep = CreateArpPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+
+  // Send ping, this would trigger an ARP request on Linux.
+  EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+              SyscallSucceedsWithValue(sizeof(ping_req)));
+
+  // Receive loop to process inbound packets.
+  struct inpkt {
+    union {
+      pihdr pi;
+      ping_pkt ping;
+      arp_pkt arp;
+    };
+  };
+  while (1) {
+    inpkt r = {};
+    int n = read(fd.get(), &r, sizeof(r));
+    EXPECT_THAT(n, SyscallSucceeds());
+
+    if (n < sizeof(pihdr)) {
+      std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+                << " len: " << n << std::endl;
+      continue;
+    }
+
+    // Process ARP packet.
+    if (n >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+      // Respond with canned ARP reply.
+      EXPECT_THAT(write(fd.get(), arp_rep.data(), arp_rep.size()),
+                  SyscallSucceedsWithValue(arp_rep.size()));
+      // First ping request might have been dropped due to mac address not in
+      // ARP cache. Send it again.
+      EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+                  SyscallSucceedsWithValue(sizeof(ping_req)));
+    }
+
+    // Process ping response packet.
+    if (n >= sizeof(ping_pkt) && r.pi.pi_protocol == ping_req.pi.pi_protocol &&
+        r.ping.ip.protocol == ping_req.ip.protocol &&
+        !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) &&
+        !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) &&
+        r.ping.icmp.type == 0 && r.ping.icmp.code == 0) {
+      // Ends and passes the test.
+      break;
+    }
+  }
+}
+
+TEST_F(TuntapTest, SendUdpTriggersArpResolution) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
+
+  // Send a UDP packet to remote.
+  int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+  ASSERT_THAT(sock, SyscallSucceeds());
+
+  struct sockaddr_in remote = {};
+  remote.sin_family = AF_INET;
+  remote.sin_port = htons(42);
+  inet_pton(AF_INET, "10.0.0.2", &remote.sin_addr);
+  int ret = sendto(sock, "hello", 5, 0, reinterpret_cast<sockaddr*>(&remote),
+                   sizeof(remote));
+  ASSERT_THAT(ret, ::testing::AnyOf(SyscallSucceeds(),
+                                    SyscallFailsWithErrno(EHOSTDOWN)));
+
+  struct inpkt {
+    union {
+      pihdr pi;
+      arp_pkt arp;
+    };
+  };
+  while (1) {
+    inpkt r = {};
+    int n = read(fd.get(), &r, sizeof(r));
+    EXPECT_THAT(n, SyscallSucceeds());
+
+    if (n < sizeof(pihdr)) {
+      std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+                << " len: " << n << std::endl;
+      continue;
+    }
+
+    if (n >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+      break;
+    }
+  }
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
new file mode 100644
index 000000000..1513fb9d5
--- /dev/null
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(TuntapHostInetTest, NoNetTun) {
+  SKIP_IF(!IsRunningOnGvisor());
+  SKIP_IF(!IsRunningWithHostinet());
+
+  struct stat statbuf;
+  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
+}
+
+}  // namespace
+}  // namespace testing
+
+}  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 9f8de6b48..42521efef 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -17,10 +17,15 @@
 #include <arpa/inet.h>
 #include <fcntl.h>
 #include <netinet/in.h>
+#include <poll.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 
+#ifndef SIOCGSTAMP
+#include <linux/sockios.h>
+#endif
+
 #include "gtest/gtest.h"
 #include "absl/base/macros.h"
 #include "absl/time/clock.h"
@@ -669,6 +674,11 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
   char buf[3];
   // Send zero length packet from s_ to t_.
   ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+
+  struct pollfd pfd = {t_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // Receive the packet.
   char received[3];
   EXPECT_THAT(read(t_, received, sizeof(received)),
@@ -694,6 +704,11 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
   char buf[3];
   // Send zero length packet from s_ to t_.
   ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+
+  struct pollfd pfd = {t_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // Receive the packet.
   char received[3];
   EXPECT_THAT(read(t_, received, sizeof(received)),
@@ -855,6 +870,10 @@ TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
 
   EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // We should get the data even though read has been shutdown.
   EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2));
 
@@ -1108,6 +1127,10 @@ TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
   ASSERT_THAT(send(s_, str, sizeof(str), 0),
               SyscallSucceedsWithValue(sizeof(str)));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   n = -1;
   EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
   EXPECT_EQ(n, sizeof(str));
@@ -1119,6 +1142,8 @@ TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
   EXPECT_EQ(n, sizeof(str));
 }
 
+// NOTE: Do not use `FIONREAD` as test name because it will be replaced by the
+// corresponding macro and become `0x541B`.
 TEST_P(UdpSocketTest, Fionread) {
   // Bind s_ to loopback:TestPort.
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
@@ -1134,10 +1159,14 @@ TEST_P(UdpSocketTest, Fionread) {
   char buf[3 * psize];
   RandomizeBuffer(buf, sizeof(buf));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
   for (int i = 0; i < 3; ++i) {
     ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
                 SyscallSucceedsWithValue(psize));
 
+    ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+                SyscallSucceedsWithValue(1));
+
     // Check that regardless of how many packets are in the queue, the size
     // reported is that of a single packet.
     n = -1;
@@ -1161,10 +1190,18 @@ TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
   char buf[3 * psize];
   RandomizeBuffer(buf, sizeof(buf));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
   for (int i = 0; i < 3; ++i) {
     ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_),
                 SyscallSucceedsWithValue(0));
 
+    // TODO(gvisor.dev/issue/2726): sending a zero-length message to a hostinet
+    // socket does not cause a poll event to be triggered.
+    if (!IsRunningWithHostinet()) {
+      ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+                  SyscallSucceedsWithValue(1));
+    }
+
     // Check that regardless of how many packets are in the queue, the size
     // reported is that of a single packet.
     n = -1;
@@ -1231,6 +1268,10 @@ TEST_P(UdpSocketTest, SoTimestamp) {
   // Send zero length packet from t_ to s_.
   ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
   msghdr msg;
   memset(&msg, 0, sizeof(msg));
@@ -1274,6 +1315,10 @@ TEST_P(UdpSocketTest, TimestampIoctl) {
   ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // There should be no control messages.
   char recv_buf[sizeof(buf)];
   ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
@@ -1311,6 +1356,10 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
               SyscallSucceedsWithValue(sizeof(buf)));
   ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // There should be no control messages.
   char recv_buf[sizeof(buf)];
   ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
@@ -1326,6 +1375,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
               SyscallSucceeds());
   ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
 
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // There should be a message for SO_TIMESTAMP.
   char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
   msghdr msg = {};
@@ -1349,9 +1401,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
-          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1422,7 +1471,6 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
index 6218fbce1..64d6d0b8f 100644
--- a/test/syscalls/linux/uidgid.cc
+++ b/test/syscalls/linux/uidgid.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <grp.h>
+#include <sys/resource.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -249,6 +250,26 @@ TEST(UidGidRootTest, Setgroups) {
               SyscallFailsWithErrno(EFAULT));
 }
 
+TEST(UidGidRootTest, Setuid_prlimit) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+  // Do seteuid in a separate thread so that after finishing this test, the
+  // process can still open files the test harness created before starting this
+  // test. Otherwise, the files are created by root (UID before the test), but
+  // cannot be opened by the `uid` set below after the test.
+  ScopedThread([&] {
+    // Use syscall instead of glibc setuid wrapper because we want this seteuid
+    // call to only apply to this task. POSIX threads, however, require that all
+    // threads have the same UIDs, so using the seteuid wrapper sets all
+    // threads' UID.
+    EXPECT_THAT(syscall(SYS_setreuid, -1, 65534), SyscallSucceeds());
+
+    // Despite the UID change, we should be able to get our own limits.
+    struct rlimit rl = {};
+    EXPECT_THAT(prlimit(0, RLIMIT_NOFILE, NULL, &rl), SyscallSucceeds());
+  });
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc
index 3a927a430..e647d2896 100644
--- a/test/syscalls/linux/utimes.cc
+++ b/test/syscalls/linux/utimes.cc
@@ -34,17 +34,10 @@ namespace testing {
 
 namespace {
 
-// TODO(b/36516566): utimes(nullptr) does not pick the "now" time in the
-// application's time domain, so when asserting that times are within a window,
-// we expand the window to allow for differences between the time domains.
-constexpr absl::Duration kClockSlack = absl::Milliseconds(100);
-
 // TimeBoxed runs fn, setting before and after to (coarse realtime) times
 // guaranteed* to come before and after fn started and completed, respectively.
 //
 // fn may be called more than once if the clock is adjusted.
-//
-// * See the comment on kClockSlack. gVisor breaks this guarantee.
 void TimeBoxed(absl::Time* before, absl::Time* after,
                std::function<void()> const& fn) {
   do {
@@ -55,12 +48,15 @@ void TimeBoxed(absl::Time* before, absl::Time* after,
     // filesystems set it to 1, so we don't do any truncation.
     struct timespec ts;
     EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds());
-    *before = absl::TimeFromTimespec(ts);
+    // FIXME(b/132819225): gVisor filesystem timestamps inconsistently use the
+    // internal or host clock, which may diverge slightly. Allow some slack on
+    // times to account for the difference.
+    *before = absl::TimeFromTimespec(ts) - absl::Seconds(1);
 
     fn();
 
     EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds());
-    *after = absl::TimeFromTimespec(ts);
+    *after = absl::TimeFromTimespec(ts) + absl::Seconds(1);
 
     if (*after < *before) {
       // Clock jumped backwards; retry.
@@ -69,23 +65,17 @@ void TimeBoxed(absl::Time* before, absl::Time* after,
       // which could lead to test failures, but that is very unlikely to happen.
       continue;
     }
-
-    if (IsRunningOnGvisor()) {
-      // See comment on kClockSlack.
-      *before -= kClockSlack;
-      *after += kClockSlack;
-    }
   } while (*after < *before);
 }
 
 void TestUtimesOnPath(std::string const& path) {
   struct stat statbuf;
 
-  struct timeval times[2] = {{1, 0}, {2, 0}};
+  struct timeval times[2] = {{10, 0}, {20, 0}};
   EXPECT_THAT(utimes(path.c_str(), times), SyscallSucceeds());
   EXPECT_THAT(stat(path.c_str(), &statbuf), SyscallSucceeds());
-  EXPECT_EQ(1, statbuf.st_atime);
-  EXPECT_EQ(2, statbuf.st_mtime);
+  EXPECT_EQ(10, statbuf.st_atime);
+  EXPECT_EQ(20, statbuf.st_mtime);
 
   absl::Time before;
   absl::Time after;
@@ -116,18 +106,18 @@ TEST(UtimesTest, OnDir) {
 
 TEST(UtimesTest, MissingPath) {
   auto path = NewTempAbsPath();
-  struct timeval times[2] = {{1, 0}, {2, 0}};
+  struct timeval times[2] = {{10, 0}, {20, 0}};
   EXPECT_THAT(utimes(path.c_str(), times), SyscallFailsWithErrno(ENOENT));
 }
 
 void TestFutimesat(int dirFd, std::string const& path) {
   struct stat statbuf;
 
-  struct timeval times[2] = {{1, 0}, {2, 0}};
+  struct timeval times[2] = {{10, 0}, {20, 0}};
   EXPECT_THAT(futimesat(dirFd, path.c_str(), times), SyscallSucceeds());
   EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds());
-  EXPECT_EQ(1, statbuf.st_atime);
-  EXPECT_EQ(2, statbuf.st_mtime);
+  EXPECT_EQ(10, statbuf.st_atime);
+  EXPECT_EQ(20, statbuf.st_mtime);
 
   absl::Time before;
   absl::Time after;
@@ -188,11 +178,11 @@ TEST(FutimesatTest, InvalidNsec) {
 
 void TestUtimensat(int dirFd, std::string const& path) {
   struct stat statbuf;
-  const struct timespec times[2] = {{1, 0}, {2, 0}};
+  const struct timespec times[2] = {{10, 0}, {20, 0}};
   EXPECT_THAT(utimensat(dirFd, path.c_str(), times, 0), SyscallSucceeds());
   EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds());
-  EXPECT_EQ(1, statbuf.st_atime);
-  EXPECT_EQ(2, statbuf.st_mtime);
+  EXPECT_EQ(10, statbuf.st_atime);
+  EXPECT_EQ(20, statbuf.st_mtime);
 
   // Test setting with UTIME_NOW and UTIME_OMIT.
   struct stat statbuf2;
@@ -235,10 +225,7 @@ void TestUtimensat(int dirFd, std::string const& path) {
   EXPECT_GE(mtime3, before);
   EXPECT_LE(mtime3, after);
 
-  if (!IsRunningOnGvisor()) {
-    // FIXME(b/36516566): Gofers set atime and mtime to different "now" times.
-    EXPECT_EQ(atime3, mtime3);
-  }
+  EXPECT_EQ(atime3, mtime3);
 }
 
 TEST(UtimensatTest, OnAbsPath) {
@@ -317,13 +304,13 @@ TEST(Utimensat, NullPath) {
   auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR));
   struct stat statbuf;
-  const struct timespec times[2] = {{1, 0}, {2, 0}};
+  const struct timespec times[2] = {{10, 0}, {20, 0}};
   // Call syscall directly.
   EXPECT_THAT(syscall(SYS_utimensat, fd.get(), NULL, times, 0),
               SyscallSucceeds());
   EXPECT_THAT(fstatat(0, f.path().c_str(), &statbuf, 0), SyscallSucceeds());
-  EXPECT_EQ(1, statbuf.st_atime);
-  EXPECT_EQ(2, statbuf.st_mtime);
+  EXPECT_EQ(10, statbuf.st_atime);
+  EXPECT_EQ(20, statbuf.st_mtime);
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc
index 0aaba482d..19d05998e 100644
--- a/test/syscalls/linux/vfork.cc
+++ b/test/syscalls/linux/vfork.cc
@@ -191,5 +191,5 @@ int main(int argc, char** argv) {
     return gvisor::testing::RunChild();
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc
index 2c2303358..ae4377108 100644
--- a/test/syscalls/linux/vsyscall.cc
+++ b/test/syscalls/linux/vsyscall.cc
@@ -24,6 +24,7 @@ namespace testing {
 
 namespace {
 
+#if defined(__x86_64__) || defined(__i386__)
 time_t vsyscall_time(time_t* t) {
   constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
@@ -37,6 +38,7 @@ TEST(VsyscallTest, VsyscallAlwaysAvailableOnGvisor) {
   time_t t;
   EXPECT_THAT(vsyscall_time(&t), SyscallSucceeds());
 }
+#endif
 
 }  // namespace
 
diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc
index 9b219cfd6..39b5b2f56 100644
--- a/test/syscalls/linux/write.cc
+++ b/test/syscalls/linux/write.cc
@@ -31,14 +31,8 @@ namespace gvisor {
 namespace testing {
 
 namespace {
-// This test is currently very rudimentary.
-//
-// TODO(edahlgren):
-// * bad buffer states (EFAULT).
-// * bad fds (wrong permission, wrong type of file, EBADF).
-// * check offset is incremented.
-// * check for EOF.
-// * writing to pipes, symlinks, special files.
+
+// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary.
 class WriteTest : public ::testing::Test {
  public:
   ssize_t WriteBytes(int fd, int bytes) {
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 8b00ef44c..3231732ec 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -41,12 +41,12 @@ class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNonexistentFile) {
   const char* path = "/does/not/exist";
-  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
-              SyscallFailsWithErrno(ENOENT));
-  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+  const char* name = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENOENT));
   EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT));
-  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENOENT));
 }
 
 TEST_F(XattrTest, XattrNullName) {
diff --git a/test/syscalls/syscall_test_runner.sh b/test/syscalls/syscall_test_runner.sh
deleted file mode 100755
index 864bb2de4..000000000
--- a/test/syscalls/syscall_test_runner.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# syscall_test_runner.sh is a simple wrapper around the go syscall test runner.
-# It exists so that we can build the syscall test runner once, and use it for
-# all syscall tests, rather than build it for each test run.
-
-set -euf -x -o pipefail
-
-echo -- "$@"
-
-if [[ -n "${TEST_UNDECLARED_OUTPUTS_DIR}" ]]; then
-  mkdir -p "${TEST_UNDECLARED_OUTPUTS_DIR}"
-  chmod a+rwx "${TEST_UNDECLARED_OUTPUTS_DIR}"
-fi
-
-# Get location of syscall_test_runner binary.
-readonly runner=$(find "${TEST_SRCDIR}" -name syscall_test_runner)
-
-# Pass the arguments of this script directly to the runner.
-exec "${runner}" "$@"
diff --git a/test/util/BUILD b/test/util/BUILD
index 1f22ebe29..2a17c33ee 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_library", "cc_test", "gtest", "select_system")
+load("//tools:defs.bzl", "cc_library", "cc_test", "gbenchmark", "gtest", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -260,6 +260,7 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
         gtest,
+        gbenchmark,
     ],
 )
 
@@ -349,3 +350,9 @@ cc_library(
         ":save_util",
     ],
 )
+
+cc_library(
+    name = "temp_umask",
+    testonly = 1,
+    hdrs = ["temp_umask.h"],
+)
diff --git a/test/util/capability_util.cc b/test/util/capability_util.cc
index 9fee52fbb..a1b994c45 100644
--- a/test/util/capability_util.cc
+++ b/test/util/capability_util.cc
@@ -63,13 +63,13 @@ PosixErrorOr<bool> CanCreateUserNamespace() {
     // is in a chroot environment (i.e., the caller's root directory does
     // not match the root directory of the mount namespace in which it
     // resides)."
-    std::cerr << "clone(CLONE_NEWUSER) failed with EPERM";
+    std::cerr << "clone(CLONE_NEWUSER) failed with EPERM" << std::endl;
     return false;
   } else if (errno == EUSERS) {
     // "(since Linux 3.11) CLONE_NEWUSER was specified in flags, and the call
     // would cause the limit on the number of nested user namespaces to be
     // exceeded. See user_namespaces(7)."
-    std::cerr << "clone(CLONE_NEWUSER) failed with EUSERS";
+    std::cerr << "clone(CLONE_NEWUSER) failed with EUSERS" << std::endl;
     return false;
   } else {
     // Unexpected error code; indicate an actual error.
diff --git a/test/syscalls/linux/temp_umask.h b/test/util/temp_umask.h
index 81a25440c..e7de84a54 100644
--- a/test/syscalls/linux/temp_umask.h
+++ b/test/util/temp_umask.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
-#define GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
+#ifndef GVISOR_TEST_UTIL_TEMP_UMASK_H_
+#define GVISOR_TEST_UTIL_TEMP_UMASK_H_
 
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -36,4 +36,4 @@ class TempUmask {
 }  // namespace testing
 }  // namespace gvisor
 
-#endif  // GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
+#endif  // GVISOR_TEST_UTIL_TEMP_UMASK_H_
diff --git a/test/util/test_main.cc b/test/util/test_main.cc
index 5c7ee0064..1f389e58f 100644
--- a/test/util/test_main.cc
+++ b/test/util/test_main.cc
@@ -16,5 +16,5 @@
 
 int main(int argc, char** argv) {
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 95e1e0c96..b20758626 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -42,12 +42,13 @@ namespace testing {
 
 #define TEST_ON_GVISOR "TEST_ON_GVISOR"
 #define GVISOR_NETWORK "GVISOR_NETWORK"
+#define GVISOR_VFS "GVISOR_VFS"
 
 bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; }
 
 const std::string GvisorPlatform() {
   // Set by runner.go.
-  char* env = getenv(TEST_ON_GVISOR);
+  const char* env = getenv(TEST_ON_GVISOR);
   if (!env) {
     return Platform::kNative;
   }
@@ -55,10 +56,19 @@ const std::string GvisorPlatform() {
 }
 
 bool IsRunningWithHostinet() {
-  char* env = getenv(GVISOR_NETWORK);
+  const char* env = getenv(GVISOR_NETWORK);
   return env && strcmp(env, "host") == 0;
 }
 
+bool IsRunningWithVFS1() {
+  const char* env = getenv(GVISOR_VFS);
+  if (env == nullptr) {
+    // If not set, it's running on Linux.
+    return false;
+  }
+  return strcmp(env, "VFS1") == 0;
+}
+
 // Inline cpuid instruction.  Preserve %ebx/%rbx register. In PIC compilations
 // %ebx contains the address of the global offset table. %rbx is occasionally
 // used to address stack variables in presence of dynamic allocas.
diff --git a/test/util/test_util.h b/test/util/test_util.h
index 2d22b0eb8..8e3245b27 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -220,6 +220,7 @@ constexpr char kKVM[] = "kvm";
 bool IsRunningOnGvisor();
 const std::string GvisorPlatform();
 bool IsRunningWithHostinet();
+bool IsRunningWithVFS1();
 
 #ifdef __linux__
 void SetupGvisorDeathTest();
@@ -771,6 +772,7 @@ std::string RunfilePath(std::string path);
 #endif
 
 void TestInit(int* argc, char*** argv);
+int RunAllTests(void);
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/test_util_impl.cc b/test/util/test_util_impl.cc
index ba7c0a85b..7e1ad9e66 100644
--- a/test/util/test_util_impl.cc
+++ b/test/util/test_util_impl.cc
@@ -17,8 +17,12 @@
 #include "gtest/gtest.h"
 #include "absl/flags/flag.h"
 #include "absl/flags/parse.h"
+#include "benchmark/benchmark.h"
 #include "test/util/logging.h"
 
+extern bool FLAGS_benchmark_list_tests;
+extern std::string FLAGS_benchmark_filter;
+
 namespace gvisor {
 namespace testing {
 
@@ -26,6 +30,7 @@ void SetupGvisorDeathTest() {}
 
 void TestInit(int* argc, char*** argv) {
   ::testing::InitGoogleTest(argc, *argv);
+  benchmark::Initialize(argc, *argv);
   ::absl::ParseCommandLine(*argc, *argv);
 
   // Always mask SIGPIPE as it's common and tests aren't expected to handle it.
@@ -34,5 +39,14 @@ void TestInit(int* argc, char*** argv) {
   TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0);
 }
 
+int RunAllTests() {
+  if (FLAGS_benchmark_list_tests || FLAGS_benchmark_filter != ".") {
+    benchmark::RunSpecifiedBenchmarks();
+    return 0;
+  } else {
+    return RUN_ALL_TESTS();
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/tools/BUILD b/tools/BUILD
index e73a9c885..34b950644 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -1,3 +1 @@
 package(licenses = ["notice"])
-
-exports_files(["nogo.js"])
diff --git a/tools/bazel.mk b/tools/bazel.mk
new file mode 100644
index 000000000..9f4a40669
--- /dev/null
+++ b/tools/bazel.mk
@@ -0,0 +1,124 @@
+#!/usr/bin/make -f
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See base Makefile.
+BRANCH_NAME := $(shell (git branch --show-current 2>/dev/null || \
+			git rev-parse --abbrev-ref HEAD 2>/dev/null) | \
+			xargs -n 1 basename 2>/dev/null)
+
+# Bazel container configuration (see below).
+USER ?= gvisor
+HASH ?= $(shell readlink -m $(CURDIR) | md5sum | cut -c1-8)
+DOCKER_NAME ?= gvisor-bazel-$(HASH)
+DOCKER_PRIVILEGED ?= --privileged
+BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/)
+GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/)
+DOCKER_SOCKET := /var/run/docker.sock
+
+# Non-configurable.
+UID := $(shell id -u ${USER})
+GID := $(shell id -g ${USER})
+USERADD_OPTIONS :=
+FULL_DOCKER_RUN_OPTIONS := $(DOCKER_RUN_OPTIONS)
+FULL_DOCKER_RUN_OPTIONS += -v "$(BAZEL_CACHE):$(BAZEL_CACHE)"
+FULL_DOCKER_RUN_OPTIONS += -v "$(GCLOUD_CONFIG):$(GCLOUD_CONFIG)"
+FULL_DOCKER_RUN_OPTIONS += -v "/tmp:/tmp"
+ifneq ($(DOCKER_PRIVILEGED),)
+FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)"
+DOCKER_GROUP := $(shell stat -c '%g' $(DOCKER_SOCKET))
+ifneq ($(GID),$(DOCKER_GROUP))
+USERADD_OPTIONS += --groups $(DOCKER_GROUP)
+GROUPADD_DOCKER += groupadd --gid $(DOCKER_GROUP) --non-unique docker-$(HASH) &&
+FULL_DOCKER_RUN_OPTIONS += --group-add $(DOCKER_GROUP)
+endif
+endif
+SHELL=/bin/bash -o pipefail
+
+##
+## Bazel helpers.
+##
+##   This file supports targets that wrap bazel in a running Docker
+##   container to simplify development. Some options are available to
+##   control the behavior of this container:
+##     USER               - The in-container user.
+##     DOCKER_RUN_OPTIONS - Options for the container (default: --privileged, required for tests).
+##     DOCKER_NAME        - The container name (default: gvisor-bazel-HASH).
+##     BAZEL_CACHE        - The bazel cache directory (default: detected).
+##     GCLOUD_CONFIG      - The gcloud config directory (detect: detected).
+##     DOCKER_SOCKET      - The Docker socket (default: detected).
+##
+bazel-server-start: load-default ## Starts the bazel server.
+	@mkdir -p $(BAZEL_CACHE)
+	@mkdir -p $(GCLOUD_CONFIG)
+	docker run -d --rm \
+		--init \
+	        --name $(DOCKER_NAME) \
+		--user 0:0 $(DOCKER_GROUP_OPTIONS) \
+		-v "$(CURDIR):$(CURDIR)" \
+		--workdir "$(CURDIR)" \
+		--entrypoint "" \
+		$(FULL_DOCKER_RUN_OPTIONS) \
+		gvisor.dev/images/default \
+		sh -c "groupadd --gid $(GID) --non-unique $(USER) && \
+		       $(GROUPADD_DOCKER) \
+		       useradd --uid $(UID) --non-unique --no-create-home --gid $(GID) $(USERADD_OPTIONS) -d $(HOME) $(USER) && \
+	               bazel version && \
+		       exec tail --pid=\$$(bazel info server_pid) -f /dev/null"
+	@while :; do if docker logs $(DOCKER_NAME) 2>/dev/null | grep "Build label:" >/dev/null; then break; fi; \
+		if ! docker ps | grep $(DOCKER_NAME); then exit 1; else sleep 1; fi; done
+.PHONY: bazel-server-start
+
+bazel-shutdown: ## Shuts down a running bazel server.
+	@docker exec --user $(UID):$(GID) $(DOCKER_NAME) bazel shutdown; rc=$$?; docker kill $(DOCKER_NAME) || [[ $$rc -ne 0 ]]
+.PHONY: bazel-shutdown
+
+bazel-alias: ## Emits an alias that can be used within the shell.
+	@echo "alias bazel='docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) bazel'"
+.PHONY: bazel-alias
+
+bazel-server: ## Ensures that the server exists. Used as an internal target.
+	@docker exec $(DOCKER_NAME) true || $(MAKE) bazel-server-start
+.PHONY: bazel-server
+
+build_cmd = docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) sh -o pipefail -c 'bazel $(STARTUP_OPTIONS) build $(OPTIONS) $(TARGETS)'
+
+build_paths = $(build_cmd) 2>&1 \
+		| tee /proc/self/fd/2 \
+		| grep -E "^  bazel-bin/" \
+		| awk "{print $$1;}" \
+		| xargs -n 1 -I {} sh -c "$(1)"
+
+build: bazel-server
+	@$(call build_cmd)
+.PHONY: build
+
+copy: bazel-server
+ifeq (,$(DESTINATION))
+	$(error Destination not provided.)
+endif
+	@$(call build_paths,cp -a {} $(DESTINATION))
+
+run: bazel-server
+	@$(call build_paths,{} $(ARGS))
+.PHONY: run
+
+sudo: bazel-server
+	@$(call build_paths,sudo -E {} $(ARGS))
+.PHONY: sudo
+
+test: bazel-server
+	@docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) bazel $(STARTUP_OPTIONS) test $(OPTIONS) $(TARGETS)
+.PHONY: test
diff --git a/tools/bazeldefs/BUILD b/tools/bazeldefs/BUILD
index 00a467473..f2f80bae1 100644
--- a/tools/bazeldefs/BUILD
+++ b/tools/bazeldefs/BUILD
@@ -1,3 +1,5 @@
+load("//tools:defs.bzl", "rbe_platform", "rbe_toolchain")
+
 package(licenses = ["notice"])
 
 # In bazel, no special support is required for loopback networking. This is
@@ -8,3 +10,42 @@ genrule(
     cmd = "touch $@",
     visibility = ["//:sandbox"],
 )
+
+# We need to define a bazel platform and toolchain to specify dockerPrivileged
+# and dockerRunAsRoot options, they are required to run tests on the RBE
+# cluster in Kokoro.
+rbe_platform(
+    name = "rbe_ubuntu1604",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//tools/cpp:clang",
+        "@bazel_toolchains//constraints:xenial",
+        "@bazel_toolchains//constraints/sanitizers:support_msan",
+    ],
+    remote_execution_properties = """
+        properties: {
+          name: "container-image"
+          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
+        }
+        properties: {
+          name: "dockerAddCapabilities"
+          value: "SYS_ADMIN"
+        }
+        properties: {
+          name: "dockerPrivileged"
+          value: "true"
+        }
+    """,
+)
+
+rbe_toolchain(
+    name = "cc-toolchain-clang-x86_64-default",
+    exec_compatible_with = [],
+    tags = [
+        "manual",
+    ],
+    target_compatible_with = [],
+    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 6798362dc..620c460de 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -1,35 +1,99 @@
 """Bazel implementations of standard rules."""
 
+load("@bazel_gazelle//:def.bzl", _gazelle = "gazelle")
+load("@bazel_skylib//rules:build_test.bzl", _build_test = "build_test")
 load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
-load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library")
-load("@io_bazel_rules_go//proto:def.bzl", _go_proto_library = "go_proto_library")
+load("@io_bazel_rules_go//go:def.bzl", "GoLibrary", _go_binary = "go_binary", _go_context = "go_context", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_path = "go_path", _go_test = "go_test")
+load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", _go_proto_library = "go_proto_library")
 load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
 load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
-load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
-load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
 load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
-load("//tools/bazeldefs:tags.bzl", _go_suffixes = "go_suffixes")
+load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", _cc_grpc_library = "cc_grpc_library")
 
-container_image = _container_image
-cc_binary = _cc_binary
+build_test = _build_test
 cc_library = _cc_library
 cc_flags_supplier = _cc_flags_supplier
 cc_proto_library = _cc_proto_library
 cc_test = _cc_test
 cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
-go_image = _go_image
+gazelle = _gazelle
 go_embed_data = _go_embed_data
-go_suffixes = _go_suffixes
+go_path = _go_path
 gtest = "@com_google_googletest//:gtest"
+grpcpp = "@com_github_grpc_grpc//:grpc++"
+gbenchmark = "@com_google_benchmark//:benchmark"
 loopback = "//tools/bazeldefs:loopback"
-proto_library = native.proto_library
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
 py_library = native.py_library
 py_binary = native.py_binary
 py_test = native.py_test
+rbe_platform = native.platform
+rbe_toolchain = native.toolchain
+vdso_linker_option = "-fuse-ld=gold "
+
+def proto_library(name, has_services = None, **kwargs):
+    native.proto_library(
+        name = name,
+        **kwargs
+    )
+
+def cc_grpc_library(name, **kwargs):
+    _cc_grpc_library(name = name, grpc_only = True, **kwargs)
+
+def _go_proto_or_grpc_library(go_library_func, name, **kwargs):
+    deps = [
+        dep.replace("_proto", "_go_proto")
+        for dep in (kwargs.pop("deps", []) or [])
+    ]
+    go_library_func(
+        name = name + "_go_proto",
+        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name + "_go_proto",
+        proto = ":" + name + "_proto",
+        deps = deps,
+        **kwargs
+    )
+
+def go_proto_library(name, **kwargs):
+    _go_proto_or_grpc_library(_go_proto_library, name, **kwargs)
+
+def go_grpc_and_proto_libraries(name, **kwargs):
+    _go_proto_or_grpc_library(_go_grpc_library, name, **kwargs)
+
+def cc_binary(name, static = False, **kwargs):
+    """Run cc_binary.
+
+    Args:
+        name: name of the target.
+        static: make a static binary if True
+        **kwargs: the rest of the args.
+    """
+    if static:
+        # How to statically link a c++ program that uses threads, like for gRPC:
+        # https://gcc.gnu.org/legacy-ml/gcc-help/2010-05/msg00029.html
+        if "linkopts" not in kwargs:
+            kwargs["linkopts"] = []
+        kwargs["linkopts"] += [
+            "-static",
+            "-lstdc++",
+            "-Wl,--whole-archive",
+            "-lpthread",
+            "-Wl,--no-whole-archive",
+        ]
+    _cc_binary(
+        name = name,
+        **kwargs
+    )
 
 def go_binary(name, static = False, pure = False, **kwargs):
+    """Build a go binary.
+
+    Args:
+        name: name of the target.
+        static: build a static binary.
+        pure: build without cgo.
+        **kwargs: rest of the arguments are passed to _go_binary.
+    """
     if static:
         kwargs["static"] = "on"
     if pure:
@@ -39,6 +103,10 @@ def go_binary(name, static = False, pure = False, **kwargs):
         **kwargs
     )
 
+def go_importpath(target):
+    """Returns the importpath for the target."""
+    return target[GoLibrary].importpath
+
 def go_library(name, **kwargs):
     _go_library(
         name = name,
@@ -46,25 +114,17 @@ def go_library(name, **kwargs):
         **kwargs
     )
 
-def go_tool_library(name, **kwargs):
-    _go_tool_library(
-        name = name,
-        importpath = "gvisor.dev/gvisor/" + native.package_name(),
-        **kwargs
-    )
-
-def go_proto_library(name, proto, **kwargs):
-    deps = kwargs.pop("deps", [])
-    _go_proto_library(
-        name = name,
-        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name,
-        proto = proto,
-        deps = [dep.replace("_proto", "_go_proto") for dep in deps],
-        **kwargs
-    )
+def go_test(name, pure = False, library = None, **kwargs):
+    """Build a go test.
 
-def go_test(name, **kwargs):
-    library = kwargs.pop("library", None)
+    Args:
+        name: name of the output binary.
+        pure: should it be built without cgo.
+        library: the library to embed.
+        **kwargs: rest of the arguments to pass to _go_test.
+    """
+    if pure:
+        kwargs["pure"] = "on"
     if library:
         kwargs["embed"] = [library]
     _go_test(
@@ -72,6 +132,34 @@ def go_test(name, **kwargs):
         **kwargs
     )
 
+def go_rule(rule, implementation, **kwargs):
+    """Wraps a rule definition with Go attributes.
+
+    Args:
+      rule: rule function (typically rule or aspect).
+      implementation: implementation function.
+      **kwargs: other arguments to pass to rule.
+
+    Returns:
+        The result of invoking the rule.
+    """
+    attrs = kwargs.pop("attrs", [])
+    attrs["_go_context_data"] = attr.label(default = "@io_bazel_rules_go//:go_context_data")
+    attrs["_stdlib"] = attr.label(default = "@io_bazel_rules_go//:stdlib")
+    toolchains = kwargs.get("toolchains", []) + ["@io_bazel_rules_go//go:toolchain"]
+    return rule(implementation, attrs = attrs, toolchains = toolchains, **kwargs)
+
+def go_context(ctx):
+    go_ctx = _go_context(ctx)
+    return struct(
+        go = go_ctx.go,
+        env = go_ctx.env,
+        runfiles = depset([go_ctx.go] + go_ctx.sdk.tools + go_ctx.stdlib.libs),
+        goos = go_ctx.sdk.goos,
+        goarch = go_ctx.sdk.goarch,
+        tags = go_ctx.tags,
+    )
+
 def py_requirement(name, direct = True):
     return _py_requirement(name)
 
diff --git a/tools/bazeldefs/platforms.bzl b/tools/bazeldefs/platforms.bzl
new file mode 100644
index 000000000..165b22311
--- /dev/null
+++ b/tools/bazeldefs/platforms.bzl
@@ -0,0 +1,9 @@
+"""List of platforms."""
+
+# Platform to associated tags.
+platforms = {
+    "ptrace": [],
+    "kvm": [],
+}
+
+default_platform = "ptrace"
diff --git a/tools/bazeldefs/tags.bzl b/tools/bazeldefs/tags.bzl
index 558fb53ae..f5d7a7b21 100644
--- a/tools/bazeldefs/tags.bzl
+++ b/tools/bazeldefs/tags.bzl
@@ -1,40 +1,56 @@
 """List of special Go suffixes."""
 
-go_suffixes = [
+def explode(tagset, suffixes):
+    """explode combines tagset and suffixes in all ways.
+
+    Args:
+      tagset: Original suffixes.
+      suffixes: Suffixes to combine before and after.
+
+    Returns:
+      The set of possible combinations.
+    """
+    result = [t for t in tagset]
+    result += [s for s in suffixes]
+    for t in tagset:
+        result += [t + s for s in suffixes]
+        result += [s + t for s in suffixes]
+    return result
+
+archs = [
     "_386",
-    "_386_unsafe",
     "_aarch64",
-    "_aarch64_unsafe",
     "_amd64",
-    "_amd64_unsafe",
     "_arm",
     "_arm64",
-    "_arm64_unsafe",
-    "_arm_unsafe",
-    "_impl",
-    "_impl_unsafe",
-    "_linux",
-    "_linux_unsafe",
     "_mips",
     "_mips64",
-    "_mips64_unsafe",
     "_mips64le",
-    "_mips64le_unsafe",
-    "_mips_unsafe",
     "_mipsle",
-    "_mipsle_unsafe",
-    "_opts",
-    "_opts_unsafe",
     "_ppc64",
-    "_ppc64_unsafe",
     "_ppc64le",
-    "_ppc64le_unsafe",
     "_riscv64",
-    "_riscv64_unsafe",
     "_s390x",
-    "_s390x_unsafe",
     "_sparc64",
-    "_sparc64_unsafe",
-    "_wasm",
-    "_wasm_unsafe",
+    "_x86",
+]
+
+oses = [
+    "_linux",
 ]
+
+generic = [
+    "_impl",
+    "_race",
+    "_norace",
+    "_unsafe",
+    "_opts",
+]
+
+# State explosion? Sure. This is approximately:
+#   len(archs) * (1 + 2 * len(oses) * (1 + 2 * len(generic))
+#
+# This evaluates to 495 at the time of writing. So it's a lot of different
+# combinations, but not so much that it will cause issues. We can probably add
+# quite a few more variants before this becomes a genuine problem.
+go_suffixes = explode(explode(archs, oses), generic)
diff --git a/tools/bigquery/BUILD b/tools/bigquery/BUILD
new file mode 100644
index 000000000..5748fb390
--- /dev/null
+++ b/tools/bigquery/BUILD
@@ -0,0 +1,10 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "bigquery",
+    testonly = 1,
+    srcs = ["bigquery.go"],
+    deps = ["@com_google_cloud_go_bigquery//:go_default_library"],
+)
diff --git a/tools/bigquery/bigquery.go b/tools/bigquery/bigquery.go
new file mode 100644
index 000000000..56f0dc5c9
--- /dev/null
+++ b/tools/bigquery/bigquery.go
@@ -0,0 +1,121 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package bigquery defines a BigQuery schema for benchmarks.
+//
+// This package contains a schema for BigQuery and methods for publishing
+// benchmark data into tables.
+package bigquery
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	bq "cloud.google.com/go/bigquery"
+)
+
+// Benchmark is the top level structure of recorded benchmark data. BigQuery
+// will infer the schema from this.
+type Benchmark struct {
+	Name      string    `bq:"name"`
+	Timestamp time.Time `bq:"timestamp"`
+	Official  bool      `bq:"official"`
+	Metric    []*Metric `bq:"metric"`
+	Metadata  *Metadata `bq:"metadata"`
+}
+
+// Metric holds the actual metric data and unit information for this benchmark.
+type Metric struct {
+	Name   string  `bq:"name"`
+	Unit   string  `bq:"unit"`
+	Sample float64 `bq:"sample"`
+}
+
+// Metadata about this benchmark.
+type Metadata struct {
+	CL          string `bq:"changelist"`
+	IterationID string `bq:"iteration_id"`
+	PendingCL   string `bq:"pending_cl"`
+	Workflow    string `bq:"workflow"`
+	Platform    string `bq:"platform"`
+	Gofer       string `bq:"gofer"`
+}
+
+// InitBigQuery initializes a BigQuery dataset/table in the project. If the dataset/table already exists, it is not duplicated.
+func InitBigQuery(ctx context.Context, projectID, datasetID, tableID string) error {
+	client, err := bq.NewClient(ctx, projectID)
+	if err != nil {
+		return fmt.Errorf("failed to initialize client on project %s: %v", projectID, err)
+	}
+	defer client.Close()
+
+	dataset := client.Dataset(datasetID)
+	if err := dataset.Create(ctx, nil); err != nil && !checkDuplicateError(err) {
+		return fmt.Errorf("failed to create dataset: %s: %v", datasetID, err)
+	}
+
+	table := dataset.Table(tableID)
+	schema, err := bq.InferSchema(Benchmark{})
+	if err != nil {
+		return fmt.Errorf("failed to infer schema: %v", err)
+	}
+
+	if err := table.Create(ctx, &bq.TableMetadata{Schema: schema}); err != nil && !checkDuplicateError(err) {
+		return fmt.Errorf("failed to create table: %s: %v", tableID, err)
+	}
+	return nil
+}
+
+// AddMetric adds a metric to an existing Benchmark.
+func (bm *Benchmark) AddMetric(metricName, unit string, sample float64) {
+	m := &Metric{
+		Name:   metricName,
+		Unit:   unit,
+		Sample: sample,
+	}
+	bm.Metric = append(bm.Metric, m)
+}
+
+// NewBenchmark initializes a new benchmark.
+func NewBenchmark(name string, official bool) *Benchmark {
+	return &Benchmark{
+		Name:      name,
+		Timestamp: time.Now().UTC(),
+		Official:  official,
+		Metric:    make([]*Metric, 0),
+	}
+}
+
+// SendBenchmarks sends the slice of benchmarks to the BigQuery dataset/table.
+func SendBenchmarks(ctx context.Context, benchmarks []*Benchmark, projectID, datasetID, tableID string) error {
+	client, err := bq.NewClient(ctx, projectID)
+	if err != nil {
+		return fmt.Errorf("Failed to initialize client on project: %s: %v", projectID, err)
+	}
+	defer client.Close()
+
+	uploader := client.Dataset(datasetID).Table(tableID).Uploader()
+	if err = uploader.Put(ctx, benchmarks); err != nil {
+		return fmt.Errorf("failed to upload benchmarks to proejct %s, table %s.%s: %v", projectID, datasetID, tableID, err)
+	}
+
+	return nil
+}
+
+// BigQuery will error "409" for duplicate tables and datasets.
+func checkDuplicateError(err error) bool {
+	return strings.Contains(err.Error(), "googleapi: Error 409: Already Exists")
+}
diff --git a/tools/checkescape/BUILD b/tools/checkescape/BUILD
new file mode 100644
index 000000000..b8c3ddf44
--- /dev/null
+++ b/tools/checkescape/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "checkescape",
+    srcs = ["checkescape.go"],
+    nogo = False,
+    visibility = ["//tools/nogo:__subpackages__"],
+    deps = [
+        "//tools/nogo/data",
+        "@org_golang_x_tools//go/analysis:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/buildssa:go_tool_library",
+        "@org_golang_x_tools//go/ssa:go_tool_library",
+    ],
+)
diff --git a/tools/checkescape/checkescape.go b/tools/checkescape/checkescape.go
new file mode 100644
index 000000000..571e9a6e6
--- /dev/null
+++ b/tools/checkescape/checkescape.go
@@ -0,0 +1,726 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package checkescape allows recursive escape analysis for hot paths.
+//
+// The analysis tracks multiple types of escapes, in two categories. First,
+// 'hard' escapes are explicit allocations. Second, 'soft' escapes are
+// interface dispatches or dynamic function dispatches; these don't necessarily
+// escape but they *may* escape. The analysis is capable of making assertions
+// recursively: soft escapes cannot be analyzed in this way, and therefore
+// count as escapes for recursive purposes.
+//
+// The different types of escapes are as follows, with the category in
+// parentheses:
+//
+// 	heap:      A direct allocation is made on the heap (hard).
+// 	builtin:   A call is made to a built-in allocation function (hard).
+// 	stack:     A stack split as part of a function preamble (soft).
+// 	interface: A call is made via an interface whicy *may* escape (soft).
+// 	dynamic:   A dynamic function is dispatched which *may* escape (soft).
+//
+// To the use the package, annotate a function-level comment with either the
+// line "// +checkescape" or "// +checkescape:OPTION[,OPTION]". In the second
+// case, the OPTION field is either a type above, or one of:
+//
+//	local: Escape analysis is limited to local hard escapes only.
+//	all: All the escapes are included.
+//	hard: All hard escapes are included.
+//
+// If the "// +checkescape" annotation is provided, this is equivalent to
+// provided the local and hard options.
+//
+// Some examples of this syntax are:
+//
+// +checkescape:all               - Analyzes for all escapes in this function and all calls.
+// +checkescape:local             - Analyzes only for default local hard escapes.
+// +checkescape:heap              - Only analyzes for heap escapes.
+// +checkescape:interface,dynamic - Only checks for dynamic calls and interface calls.
+// +checkescape                   - Does the same as +checkescape:local,hard.
+//
+// Note that all of the above can be inverted by using +mustescape. The
+// +checkescape keyword will ensure failure if the class of escape occurs,
+// whereas +mustescape will fail if the given class of escape does not occur.
+//
+// Local exemptions can be made by a comment of the form "// escapes: reason."
+// This must appear on the line of the escape and will also apply to callers of
+// the function as well (for non-local escape analysis).
+package checkescape
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"go/ast"
+	"go/token"
+	"go/types"
+	"io"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"golang.org/x/tools/go/analysis"
+	"golang.org/x/tools/go/analysis/passes/buildssa"
+	"golang.org/x/tools/go/ssa"
+	"gvisor.dev/gvisor/tools/nogo/data"
+)
+
+const (
+	// magic is the magic annotation.
+	magic = "// +checkescape"
+
+	// magicParams is the magic annotation with specific parameters.
+	magicParams = magic + ":"
+
+	// testMagic is the test magic annotation (parameters required).
+	testMagic = "// +mustescape:"
+
+	// exempt is the exemption annotation.
+	exempt = "// escapes:"
+)
+
+// escapingBuiltins are builtins known to escape.
+//
+// These are lowered at an earlier stage of compilation to explicit function
+// calls, but are not available for recursive analysis.
+var escapingBuiltins = []string{
+	"append",
+	"makemap",
+	"newobject",
+	"mallocgc",
+}
+
+// Analyzer defines the entrypoint.
+var Analyzer = &analysis.Analyzer{
+	Name:      "checkescape",
+	Doc:       "surfaces recursive escape analysis results",
+	Run:       run,
+	Requires:  []*analysis.Analyzer{buildssa.Analyzer},
+	FactTypes: []analysis.Fact{(*packageEscapeFacts)(nil)},
+}
+
+// packageEscapeFacts is the set of all functions in a package, and whether or
+// not they recursively pass escape analysis.
+//
+// All the type names for receivers are encoded in the full key. The key
+// represents the fully qualified package and type name used at link time.
+type packageEscapeFacts struct {
+	Funcs map[string][]Escape
+}
+
+// AFact implements analysis.Fact.AFact.
+func (*packageEscapeFacts) AFact() {}
+
+// CallSite is a single call site.
+//
+// These can be chained.
+type CallSite struct {
+	LocalPos token.Pos
+	Resolved LinePosition
+}
+
+// Escape is a single escape instance.
+type Escape struct {
+	Reason EscapeReason
+	Detail string
+	Chain  []CallSite
+}
+
+// LinePosition is a low-resolution token.Position.
+//
+// This is used to match against possible exemptions placed in the source.
+type LinePosition struct {
+	Filename string
+	Line     int
+}
+
+// String implements fmt.Stringer.String.
+func (e *LinePosition) String() string {
+	return fmt.Sprintf("%s:%d", e.Filename, e.Line)
+}
+
+// String implements fmt.Stringer.String.
+//
+// Note that this string will contain new lines.
+func (e *Escape) String() string {
+	var b bytes.Buffer
+	fmt.Fprintf(&b, "%s", e.Reason.String())
+	for i, cs := range e.Chain {
+		if i == len(e.Chain)-1 {
+			fmt.Fprintf(&b, "\n @ %s → %s", cs.Resolved.String(), e.Detail)
+		} else {
+			fmt.Fprintf(&b, "\n + %s", cs.Resolved.String())
+		}
+	}
+	return b.String()
+}
+
+// EscapeReason is an escape reason.
+//
+// This is a simple enum.
+type EscapeReason int
+
+const (
+	interfaceInvoke EscapeReason = iota
+	unknownPackage
+	allocation
+	builtin
+	dynamicCall
+	stackSplit
+	reasonCount // Count for below.
+)
+
+// String returns the string for the EscapeReason.
+//
+// Note that this also implicitly defines the reverse string -> EscapeReason
+// mapping, which is the word before the colon (computed below).
+func (e EscapeReason) String() string {
+	switch e {
+	case interfaceInvoke:
+		return "interface: function invocation via interface"
+	case unknownPackage:
+		return "unknown: no package information available"
+	case allocation:
+		return "heap: call to runtime heap allocation"
+	case builtin:
+		return "builtin: call to runtime builtin"
+	case dynamicCall:
+		return "dynamic: call via dynamic function"
+	case stackSplit:
+		return "stack: stack split on function entry"
+	default:
+		panic(fmt.Sprintf("unknown reason: %d", e))
+	}
+}
+
+var hardReasons = []EscapeReason{
+	allocation,
+	builtin,
+}
+
+var softReasons = []EscapeReason{
+	interfaceInvoke,
+	unknownPackage,
+	dynamicCall,
+	stackSplit,
+}
+
+var allReasons = append(hardReasons, softReasons...)
+
+var escapeTypes = func() map[string]EscapeReason {
+	result := make(map[string]EscapeReason)
+	for _, r := range allReasons {
+		parts := strings.Split(r.String(), ":")
+		result[parts[0]] = r // Key before ':'.
+	}
+	return result
+}()
+
+// EscapeCount counts escapes.
+//
+// It is used to avoid accumulating too many escapes for the same reason, for
+// the same function. We limit each class to 3 instances (arbitrarily).
+type EscapeCount struct {
+	byReason [reasonCount]uint32
+}
+
+// maxRecordsPerReason is the number of explicit records.
+//
+// See EscapeCount (and usage), and Record implementation.
+const maxRecordsPerReason = 5
+
+// Record records the reason or returns false if it should not be added.
+func (ec *EscapeCount) Record(reason EscapeReason) bool {
+	ec.byReason[reason]++
+	if ec.byReason[reason] > maxRecordsPerReason {
+		return false
+	}
+	return true
+}
+
+// loadObjdump reads the objdump output.
+//
+// This records if there is a call any function for every source line. It is
+// used only to remove false positives for escape analysis. The call will be
+// elided if escape analysis is able to put the object on the heap exclusively.
+func loadObjdump() (map[LinePosition]string, error) {
+	f, err := os.Open(data.Objdump)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	// Build the map.
+	m := make(map[LinePosition]string)
+	r := bufio.NewReader(f)
+	var (
+		lastField string
+		lastPos   LinePosition
+	)
+	for {
+		line, err := r.ReadString('\n')
+		if err != nil && err != io.EOF {
+			return nil, err
+		}
+
+		// We recognize lines corresponding to actual code (not the
+		// symbol name or other metadata) and annotate them if they
+		// correspond to an explicit CALL instruction. We assume that
+		// the lack of a CALL for a given line is evidence that escape
+		// analysis has eliminated an allocation.
+		//
+		// Lines look like this (including the first space):
+		//  gohacks_unsafe.go:33  0xa39                   488b442408              MOVQ 0x8(SP), AX
+		if len(line) > 0 && line[0] == ' ' {
+			fields := strings.Fields(line)
+			if !strings.Contains(fields[3], "CALL") {
+				continue
+			}
+
+			// Ignore strings containing duffzero, which is just
+			// used by stack allocations for types that are large
+			// enough to warrant Duff's device.
+			if strings.Contains(line, "runtime.duffzero") {
+				continue
+			}
+
+			// Ignore the racefuncenter call, which is used for
+			// race builds. This does not escape.
+			if strings.Contains(line, "runtime.racefuncenter") {
+				continue
+			}
+
+			// Calculate the filename and line. Note that per the
+			// example above, the filename is not a fully qualified
+			// base, just the basename (what we require).
+			if fields[0] != lastField {
+				parts := strings.SplitN(fields[0], ":", 2)
+				lineNum, err := strconv.ParseInt(parts[1], 10, 64)
+				if err != nil {
+					return nil, err
+				}
+				lastPos = LinePosition{
+					Filename: parts[0],
+					Line:     int(lineNum),
+				}
+				lastField = fields[0]
+			}
+			if _, ok := m[lastPos]; ok {
+				continue // Already marked.
+			}
+
+			// Save the actual call for the detail.
+			m[lastPos] = strings.Join(fields[3:], " ")
+		}
+		if err == io.EOF {
+			break
+		}
+	}
+
+	return m, nil
+}
+
+// poser is a type that implements Pos.
+type poser interface {
+	Pos() token.Pos
+}
+
+// run performs the analysis.
+func run(pass *analysis.Pass) (interface{}, error) {
+	calls, err := loadObjdump()
+	if err != nil {
+		return nil, err
+	}
+	pef := packageEscapeFacts{
+		Funcs: make(map[string][]Escape),
+	}
+	linePosition := func(inst, parent poser) LinePosition {
+		p := pass.Fset.Position(inst.Pos())
+		if (p.Filename == "" || p.Line == 0) && parent != nil {
+			p = pass.Fset.Position(parent.Pos())
+		}
+		return LinePosition{
+			Filename: filepath.Base(p.Filename),
+			Line:     p.Line,
+		}
+	}
+	hasCall := func(inst poser) (string, bool) {
+		p := linePosition(inst, nil)
+		s, ok := calls[p]
+		return s, ok
+	}
+	callSite := func(inst ssa.Instruction) CallSite {
+		return CallSite{
+			LocalPos: inst.Pos(),
+			Resolved: linePosition(inst, inst.Parent()),
+		}
+	}
+	escapes := func(reason EscapeReason, detail string, inst ssa.Instruction, ec *EscapeCount) []Escape {
+		if !ec.Record(reason) {
+			return nil // Skip.
+		}
+		es := Escape{
+			Reason: reason,
+			Detail: detail,
+			Chain:  []CallSite{callSite(inst)},
+		}
+		return []Escape{es}
+	}
+	resolve := func(sub []Escape, inst ssa.Instruction, ec *EscapeCount) (es []Escape) {
+		for _, e := range sub {
+			if !ec.Record(e.Reason) {
+				continue // Skip.
+			}
+			es = append(es, Escape{
+				Reason: e.Reason,
+				Detail: e.Detail,
+				Chain:  append([]CallSite{callSite(inst)}, e.Chain...),
+			})
+		}
+		return es
+	}
+	state := pass.ResultOf[buildssa.Analyzer].(*buildssa.SSA)
+
+	var loadFunc func(*ssa.Function) []Escape // Used below.
+
+	analyzeInstruction := func(inst ssa.Instruction, ec *EscapeCount) []Escape {
+		switch x := inst.(type) {
+		case *ssa.Call:
+			if x.Call.IsInvoke() {
+				// This is an interface dispatch. There is no
+				// way to know if this is actually escaping or
+				// not, since we don't know the underlying
+				// type.
+				call, _ := hasCall(inst)
+				return escapes(interfaceInvoke, call, inst, ec)
+			}
+			switch x := x.Call.Value.(type) {
+			case *ssa.Function:
+				if x.Pkg == nil {
+					// Can't resolve the package.
+					return escapes(unknownPackage, "no package", inst, ec)
+				}
+
+				// Atomic functions are instrinics. We can
+				// assume that they don't escape.
+				if x.Pkg.Pkg.Name() == "atomic" {
+					return nil
+				}
+
+				// Is this a local function? If yes, call the
+				// function to load the local function. The
+				// local escapes are the escapes found in the
+				// local function.
+				if x.Pkg.Pkg == pass.Pkg {
+					return resolve(loadFunc(x), inst, ec)
+				}
+
+				// Recursively collect information from
+				// the other analyzers.
+				var imp packageEscapeFacts
+				if !pass.ImportPackageFact(x.Pkg.Pkg, &imp) {
+					// Unable to import the dependency; we must
+					// declare these as escaping.
+					return escapes(unknownPackage, "no analysis", inst, ec)
+				}
+
+				// The escapes of this instruction are the
+				// escapes of the called function directly.
+				return resolve(imp.Funcs[x.RelString(x.Pkg.Pkg)], inst, ec)
+			case *ssa.Builtin:
+				// Ignore elided escapes.
+				if _, has := hasCall(inst); !has {
+					return nil
+				}
+
+				// Check if the builtin is escaping.
+				for _, name := range escapingBuiltins {
+					if x.Name() == name {
+						return escapes(builtin, name, inst, ec)
+					}
+				}
+			default:
+				// All dynamic calls are counted as soft
+				// escapes. They are similar to interface
+				// dispatches. We cannot actually look up what
+				// this refers to using static analysis alone.
+				call, _ := hasCall(inst)
+				return escapes(dynamicCall, call, inst, ec)
+			}
+		case *ssa.Alloc:
+			// Ignore non-heap allocations.
+			if !x.Heap {
+				return nil
+			}
+
+			// Ignore elided escapes.
+			call, has := hasCall(inst)
+			if !has {
+				return nil
+			}
+
+			// This is a real heap allocation.
+			return escapes(allocation, call, inst, ec)
+		case *ssa.MakeMap:
+			return escapes(builtin, "makemap", inst, ec)
+		case *ssa.MakeSlice:
+			return escapes(builtin, "makeslice", inst, ec)
+		case *ssa.MakeClosure:
+			return escapes(builtin, "makeclosure", inst, ec)
+		case *ssa.MakeChan:
+			return escapes(builtin, "makechan", inst, ec)
+		}
+		return nil // No escapes.
+	}
+
+	var analyzeBasicBlock func(*ssa.BasicBlock, *EscapeCount) []Escape // Recursive.
+	analyzeBasicBlock = func(block *ssa.BasicBlock, ec *EscapeCount) (rval []Escape) {
+		for _, inst := range block.Instrs {
+			rval = append(rval, analyzeInstruction(inst, ec)...)
+		}
+		return rval // N.B. may be empty.
+	}
+
+	loadFunc = func(fn *ssa.Function) []Escape {
+		// Is this already available?
+		name := fn.RelString(pass.Pkg)
+		if es, ok := pef.Funcs[name]; ok {
+			return es
+		}
+
+		// In the case of a true cycle, we assume that the current
+		// function itself has no escapes until the rest of the
+		// analysis is complete. This will trip the above in the case
+		// of a cycle of any kind.
+		pef.Funcs[name] = nil
+
+		// Perform the basic analysis.
+		var (
+			es []Escape
+			ec EscapeCount
+		)
+		if fn.Recover != nil {
+			es = append(es, analyzeBasicBlock(fn.Recover, &ec)...)
+		}
+		for _, block := range fn.Blocks {
+			es = append(es, analyzeBasicBlock(block, &ec)...)
+		}
+
+		// Check for a stack split.
+		if call, has := hasCall(fn); has {
+			es = append(es, Escape{
+				Reason: stackSplit,
+				Detail: call,
+				Chain: []CallSite{CallSite{
+					LocalPos: fn.Pos(),
+					Resolved: linePosition(fn, fn.Parent()),
+				}},
+			})
+		}
+
+		// Save the result and return.
+		pef.Funcs[name] = es
+		return es
+	}
+
+	// Complete all local functions.
+	for _, fn := range state.SrcFuncs {
+		loadFunc(fn)
+	}
+
+	// Build the exception list.
+	exemptions := make(map[LinePosition]string)
+	for _, f := range pass.Files {
+		for _, cg := range f.Comments {
+			for _, c := range cg.List {
+				p := pass.Fset.Position(c.Slash)
+				if strings.HasPrefix(c.Text, exempt) {
+					exemptions[LinePosition{
+						Filename: filepath.Base(p.Filename),
+						Line:     p.Line,
+					}] = c.Text[len(exempt):]
+				}
+			}
+		}
+	}
+
+	// Delete everything matching the excemtions.
+	//
+	// This has the implication that exceptions are applied recursively,
+	// since this now modified set is what will be saved.
+	for name, escapes := range pef.Funcs {
+		var newEscapes []Escape
+		for _, escape := range escapes {
+			isExempt := false
+			for line, _ := range exemptions {
+				// Note that an exemption applies if it is
+				// marked as an exemption anywhere in the call
+				// chain. It need not be marked as escapes in
+				// the function itself, nor in the top-level
+				// caller.
+				for _, callSite := range escape.Chain {
+					if callSite.Resolved == line {
+						isExempt = true
+						break
+					}
+				}
+				if isExempt {
+					break
+				}
+			}
+			if !isExempt {
+				// Record this escape; not an exception.
+				newEscapes = append(newEscapes, escape)
+			}
+		}
+		pef.Funcs[name] = newEscapes // Update.
+	}
+
+	// Export all findings for future packages.
+	pass.ExportPackageFact(&pef)
+
+	// Scan all functions for violations.
+	for _, f := range pass.Files {
+		// Scan all declarations.
+		for _, decl := range f.Decls {
+			fdecl, ok := decl.(*ast.FuncDecl)
+			// Function declaration?
+			if !ok {
+				continue
+			}
+			// Is there a comment?
+			if fdecl.Doc == nil {
+				continue
+			}
+			var (
+				reasons     []EscapeReason
+				found       bool
+				local       bool
+				testReasons = make(map[EscapeReason]bool) // reason -> local?
+			)
+			// Does the comment contain a +checkescape line?
+			for _, c := range fdecl.Doc.List {
+				if !strings.HasPrefix(c.Text, magic) && !strings.HasPrefix(c.Text, testMagic) {
+					continue
+				}
+				if c.Text == magic {
+					// Default: hard reasons, local only.
+					reasons = hardReasons
+					local = true
+				} else if strings.HasPrefix(c.Text, magicParams) {
+					// Extract specific reasons.
+					types := strings.Split(c.Text[len(magicParams):], ",")
+					found = true // For below.
+					for i := 0; i < len(types); i++ {
+						if types[i] == "local" {
+							// Limit search to local escapes.
+							local = true
+						} else if types[i] == "all" {
+							// Append all reasons.
+							reasons = append(reasons, allReasons...)
+						} else if types[i] == "hard" {
+							// Append all hard reasons.
+							reasons = append(reasons, hardReasons...)
+						} else {
+							r, ok := escapeTypes[types[i]]
+							if !ok {
+								// This is not a valid escape reason.
+								pass.Reportf(fdecl.Pos(), "unknown reason: %v", types[i])
+								continue
+							}
+							reasons = append(reasons, r)
+						}
+					}
+				} else if strings.HasPrefix(c.Text, testMagic) {
+					types := strings.Split(c.Text[len(testMagic):], ",")
+					local := false
+					for i := 0; i < len(types); i++ {
+						if types[i] == "local" {
+							local = true
+						} else {
+							r, ok := escapeTypes[types[i]]
+							if !ok {
+								// This is not a valid escape reason.
+								pass.Reportf(fdecl.Pos(), "unknown reason: %v", types[i])
+								continue
+							}
+							if v, ok := testReasons[r]; ok && v {
+								// Already registered as local.
+								continue
+							}
+							testReasons[r] = local
+						}
+					}
+				}
+			}
+			if len(reasons) == 0 && found {
+				// A magic annotation was provided, but no reasons.
+				pass.Reportf(fdecl.Pos(), "no reasons provided")
+				continue
+			}
+
+			// Scan for matches.
+			fn := pass.TypesInfo.Defs[fdecl.Name].(*types.Func)
+			name := state.Pkg.Prog.FuncValue(fn).RelString(pass.Pkg)
+			es, ok := pef.Funcs[name]
+			if !ok {
+				pass.Reportf(fdecl.Pos(), "internal error: function %s not found.", name)
+				continue
+			}
+			for _, e := range es {
+				for _, r := range reasons {
+					// Is does meet our local requirement?
+					if local && len(e.Chain) > 1 {
+						continue
+					}
+					// Does this match the reason? Emit
+					// with a full stack trace that
+					// explains why this violates our
+					// constraints.
+					if e.Reason == r {
+						pass.Reportf(e.Chain[0].LocalPos, "%s", e.String())
+					}
+				}
+			}
+
+			// Scan for test (required) matches.
+			testReasonsFound := make(map[EscapeReason]bool)
+			for _, e := range es {
+				// Is this local?
+				local, ok := testReasons[e.Reason]
+				wantLocal := len(e.Chain) == 1
+				testReasonsFound[e.Reason] = wantLocal
+				if !ok {
+					continue
+				}
+				if local == wantLocal {
+					delete(testReasons, e.Reason)
+				}
+			}
+			for reason, local := range testReasons {
+				// We didn't find the escapes we wanted.
+				pass.Reportf(fdecl.Pos(), fmt.Sprintf("testescapes not found: reason=%s, local=%t", reason, local))
+			}
+			if len(testReasons) > 0 {
+				// Dump all reasons found to help in debugging.
+				for _, e := range es {
+					pass.Reportf(e.Chain[0].LocalPos, "escape found: %s", e.String())
+				}
+			}
+		}
+	}
+
+	return nil, nil
+}
diff --git a/tools/checkescape/test1/BUILD b/tools/checkescape/test1/BUILD
new file mode 100644
index 000000000..783403247
--- /dev/null
+++ b/tools/checkescape/test1/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "test1",
+    srcs = ["test1.go"],
+    visibility = ["//tools/checkescape/test2:__pkg__"],
+)
diff --git a/tools/checkescape/test1/test1.go b/tools/checkescape/test1/test1.go
new file mode 100644
index 000000000..68d3f72cc
--- /dev/null
+++ b/tools/checkescape/test1/test1.go
@@ -0,0 +1,195 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package test1 is a test package.
+package test1
+
+import (
+	"fmt"
+	"reflect"
+)
+
+// Interface is a generic interface.
+type Interface interface {
+	Foo()
+}
+
+// Type is a concrete implementation of Interface.
+type Type struct {
+	A uint64
+	B uint64
+}
+
+// Foo implements Interface.Foo.
+//go:nosplit
+func (t Type) Foo() {
+	fmt.Printf("%v", t) // Never executed.
+}
+
+// +checkescape:all,hard
+//go:nosplit
+func InterfaceFunction(i Interface) {
+	// Do nothing; exported for tests.
+}
+
+// +checkesacape:all,hard
+//go:nosplit
+func TypeFunction(t *Type) {
+}
+
+// +mustescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinMap(x int) map[string]bool {
+	return make(map[string]bool)
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinMapRec(x int) map[string]bool {
+	return BuiltinMap(x)
+}
+
+// +temustescapestescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinClosure(x int) func() {
+	return func() {
+		fmt.Printf("%v", x)
+	}
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinClosureRec(x int) func() {
+	return BuiltinClosure(x)
+}
+
+// +mustescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinMakeSlice(x int) []byte {
+	return make([]byte, x)
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinMakeSliceRec(x int) []byte {
+	return BuiltinMakeSlice(x)
+}
+
+// +mustescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinAppend(x []byte) []byte {
+	return append(x, 0)
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinAppendRec() []byte {
+	return BuiltinAppend(nil)
+}
+
+// +mustescape:local,builtin
+//go:noinline
+//go:nosplit
+func BuiltinChan() chan int {
+	return make(chan int)
+}
+
+// +mustescape:builtin
+//go:noinline
+//go:nosplit
+func builtinChanRec() chan int {
+	return BuiltinChan()
+}
+
+// +mustescape:local,heap
+//go:noinline
+//go:nosplit
+func Heap() *Type {
+	var t Type
+	return &t
+}
+
+// +mustescape:heap
+//go:noinline
+//go:nosplit
+func heapRec() *Type {
+	return Heap()
+}
+
+// +mustescape:local,interface
+//go:noinline
+//go:nosplit
+func Dispatch(i Interface) {
+	i.Foo()
+}
+
+// +mustescape:interface
+//go:noinline
+//go:nosplit
+func dispatchRec(i Interface) {
+	Dispatch(i)
+}
+
+// +mustescape:local,dynamic
+//go:noinline
+//go:nosplit
+func Dynamic(f func()) {
+	f()
+}
+
+// +mustescape:dynamic
+//go:noinline
+//go:nosplit
+func dynamicRec(f func()) {
+	Dynamic(f)
+}
+
+// +mustescape:local,unknown
+//go:noinline
+//go:nosplit
+func Unknown() {
+	_ = reflect.TypeOf((*Type)(nil)) // Does not actually escape.
+}
+
+// +mustescape:unknown
+//go:noinline
+//go:nosplit
+func unknownRec() {
+	Unknown()
+}
+
+//go:noinline
+//go:nosplit
+func internalFunc() {
+}
+
+// +mustescape:local,stack
+//go:noinline
+func Split() {
+	internalFunc()
+}
+
+// +mustescape:stack
+//go:noinline
+func splitRec() {
+	Split()
+}
diff --git a/tools/checkescape/test2/BUILD b/tools/checkescape/test2/BUILD
new file mode 100644
index 000000000..5a11e4b43
--- /dev/null
+++ b/tools/checkescape/test2/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "test2",
+    srcs = ["test2.go"],
+    deps = ["//tools/checkescape/test1"],
+)
diff --git a/tools/checkescape/test2/test2.go b/tools/checkescape/test2/test2.go
new file mode 100644
index 000000000..7fce3e3be
--- /dev/null
+++ b/tools/checkescape/test2/test2.go
@@ -0,0 +1,94 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package test2 is a test package that imports test1.
+package test2
+
+import (
+	"gvisor.dev/gvisor/tools/checkescape/test1"
+)
+
+// +checkescape:all
+//go:nosplit
+func interfaceFunctionCrossPkg() {
+	var i test1.Interface
+	test1.InterfaceFunction(i)
+}
+
+// +checkesacape:all
+//go:nosplit
+func typeFunctionCrossPkg() {
+	var t test1.Type
+	test1.TypeFunction(&t)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinMapCrossPkg(x int) map[string]bool {
+	return test1.BuiltinMap(x)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinClosureCrossPkg(x int) func() {
+	return test1.BuiltinClosure(x)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinMakeSliceCrossPkg(x int) []byte {
+	return test1.BuiltinMakeSlice(x)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinAppendCrossPkg() []byte {
+	return test1.BuiltinAppend(nil)
+}
+
+// +mustescape:builtin
+//go:noinline
+func builtinChanCrossPkg() chan int {
+	return test1.BuiltinChan()
+}
+
+// +mustescape:heap
+//go:noinline
+func heapCrossPkg() *test1.Type {
+	return test1.Heap()
+}
+
+// +mustescape:interface
+//go:noinline
+func dispatchCrossPkg(i test1.Interface) {
+	test1.Dispatch(i)
+}
+
+// +mustescape:dynamic
+//go:noinline
+func dynamicCrossPkg(f func()) {
+	test1.Dynamic(f)
+}
+
+// +mustescape:unknown
+//go:noinline
+func unknownCrossPkg() {
+	test1.Unknown()
+}
+
+// +mustescape:stack
+//go:noinline
+func splitCrosssPkt() {
+	test1.Split()
+}
diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD
index 4f1a31a6d..0c264151b 100644
--- a/tools/checkunsafe/BUILD
+++ b/tools/checkunsafe/BUILD
@@ -1,11 +1,12 @@
-load("//tools:defs.bzl", "go_tool_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-go_tool_library(
+go_library(
     name = "checkunsafe",
     srcs = ["check_unsafe.go"],
-    visibility = ["//:sandbox"],
+    nogo = False,
+    visibility = ["//tools/nogo:__subpackages__"],
     deps = [
         "@org_golang_x_tools//go/analysis:go_tool_library",
     ],
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 46249f9c4..41eded16d 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,31 +7,44 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", _build_test = "build_test", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _default_installer = "default_installer", _default_net_util = "default_net_util", _gazelle = "gazelle", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_path = "go_path", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _rbe_platform = "rbe_platform", _rbe_toolchain = "rbe_toolchain", _select_arch = "select_arch", _select_system = "select_system", _vdso_linker_option = "vdso_linker_option")
+load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms")
+load("//tools/bazeldefs:tags.bzl", "go_suffixes")
+load("//tools/nogo:defs.bzl", "nogo_test")
 
 # Delegate directly.
+build_test = _build_test
 cc_binary = _cc_binary
+cc_flags_supplier = _cc_flags_supplier
+cc_grpc_library = _cc_grpc_library
 cc_library = _cc_library
 cc_test = _cc_test
 cc_toolchain = _cc_toolchain
-cc_flags_supplier = _cc_flags_supplier
-container_image = _container_image
+default_installer = _default_installer
+default_net_util = _default_net_util
+gbenchmark = _gbenchmark
+gazelle = _gazelle
 go_embed_data = _go_embed_data
-go_image = _go_image
+go_path = _go_path
 go_test = _go_test
-go_tool_library = _go_tool_library
 gtest = _gtest
+grpcpp = _grpcpp
+loopback = _loopback
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
-py_library = _py_library
 py_binary = _py_binary
-py_test = _py_test
+py_library = _py_library
 py_requirement = _py_requirement
+py_test = _py_test
 select_arch = _select_arch
 select_system = _select_system
-loopback = _loopback
-default_installer = _default_installer
-default_net_util = _default_net_util
+rbe_platform = _rbe_platform
+rbe_toolchain = _rbe_toolchain
+vdso_linker_option = _vdso_linker_option
+
+# Platform options.
+default_platform = _default_platform
+platforms = _platforms
 
 def go_binary(name, **kwargs):
     """Wraps the standard go_binary.
@@ -83,7 +96,7 @@ def go_imports(name, src, out):
         cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
     )
 
-def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
+def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, marshal_debug = False, nogo = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
     The recommended way is to use this rule with mostly identical configuration as the native
@@ -106,6 +119,7 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
       imports: imports required for stateify.
       stateify: whether statify is enabled (default: true).
       marshal: whether marshal is enabled (default: false).
+      marshal_debug: whether the gomarshal tools emits debugging output (default: false).
       **kwargs: standard go_library arguments.
     """
     all_srcs = srcs
@@ -117,10 +131,10 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         # First, we need to segregate the input files via the special suffixes,
         # and calculate the final output set.
         state_sets = calculate_sets(srcs)
-        for (suffix, srcs) in state_sets.items():
+        for (suffix, src_subset) in state_sets.items():
             go_stateify(
                 name = name + suffix + "_state_autogen_with_imports",
-                srcs = srcs,
+                srcs = src_subset,
                 imports = imports,
                 package = full_pkg,
                 out = name + suffix + "_state_autogen_with_imports.go",
@@ -140,11 +154,14 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
     if marshal:
         # See above.
         marshal_sets = calculate_sets(srcs)
-        for (suffix, srcs) in marshal_sets.items():
+        for (suffix, src_subset) in marshal_sets.items():
             go_marshal(
                 name = name + suffix + "_abi_autogen",
-                srcs = srcs,
-                debug = False,
+                srcs = src_subset,
+                debug = select({
+                    "//tools/go_marshal:marshal_config_verbose": True,
+                    "//conditions:default": marshal_debug,
+                }),
                 imports = imports,
                 package = name,
             )
@@ -165,6 +182,11 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         deps = all_deps,
         **kwargs
     )
+    if nogo:
+        nogo_test(
+            name = name + "_nogo",
+            deps = [":" + name],
+        )
 
     if marshal:
         # Ignore importpath for go_test.
@@ -172,42 +194,61 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
 
         # See above.
         marshal_sets = calculate_sets(srcs)
-        for (suffix, srcs) in marshal_sets.items():
+        for (suffix, _) in marshal_sets.items():
             _go_test(
                 name = name + suffix + "_abi_autogen_test",
                 srcs = [name + suffix + "_abi_autogen_test.go"],
-                library = ":" + name + suffix,
+                library = ":" + name,
                 deps = marshal_test_deps,
                 **kwargs
             )
 
-def proto_library(name, srcs, **kwargs):
+def proto_library(name, srcs, deps = None, has_services = 0, **kwargs):
     """Wraps the standard proto_library.
 
-    Given a proto_library named "foo", this produces three different targets:
+    Given a proto_library named "foo", this produces up to five different
+    targets:
     - foo_proto: proto_library rule.
     - foo_go_proto: go_proto_library rule.
     - foo_cc_proto: cc_proto_library rule.
+    - foo_go_grpc_proto: go_grpc_library rule.
+    - foo_cc_grpc_proto: cc_grpc_library rule.
 
     Args:
+      name: the name to which _proto, _go_proto, etc, will be appended.
       srcs: the proto sources.
+      deps: for the proto library and the go_proto_library.
+      has_services: 1 to build gRPC code, otherwise 0.
       **kwargs: standard proto_library arguments.
     """
-    deps = kwargs.pop("deps", [])
     _proto_library(
         name = name + "_proto",
         srcs = srcs,
         deps = deps,
+        has_services = has_services,
         **kwargs
     )
-    _go_proto_library(
-        name = name + "_go_proto",
-        proto = ":" + name + "_proto",
-        deps = deps,
-        **kwargs
-    )
+    if has_services:
+        _go_grpc_and_proto_libraries(
+            name = name,
+            deps = deps,
+            **kwargs
+        )
+    else:
+        _go_proto_library(
+            name = name,
+            deps = deps,
+            **kwargs
+        )
     _cc_proto_library(
         name = name + "_cc_proto",
         deps = [":" + name + "_proto"],
         **kwargs
     )
+    if has_services:
+        _cc_grpc_library(
+            name = name + "_cc_grpc_proto",
+            srcs = [":" + name + "_proto"],
+            deps = [":" + name + "_cc_proto"],
+            **kwargs
+        )
diff --git a/tools/go_branch.sh b/tools/go_branch.sh
index f97a74aaf..093de89b4 100755
--- a/tools/go_branch.sh
+++ b/tools/go_branch.sh
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -eo pipefail
+set -xeo pipefail
 
 # Discovery the package name from the go.mod file.
 declare -r module=$(cat go.mod | grep -E "^module" | cut -d' ' -f2)
@@ -42,7 +42,8 @@ declare -r head=$(git describe --always)
 
 # We expect to have an existing go branch that we will use as the basis for
 # this commit. That branch may be empty, but it must exist.
-declare -r go_branch=$(git show-ref --hash origin/go)
+git fetch --all
+declare -r go_branch=$(git show-ref --hash go)
 
 # Clone the current repository to the temporary directory, and check out the
 # current go_branch directory. We move to the new repository for convenience.
@@ -87,6 +88,12 @@ EOF
 # because they may correspond to unused templates, etc.
 cp "${repo_orig}"/runsc/*.go runsc/
 
+# Normalize all permissions. The way bazel constructs the :gopath tree may leave
+# some strange permissions on files. We don't have anything in this tree that
+# should be execution, only the Go source files, README.md, and ${othersrc}.
+find . -type f -exec chmod 0644 {} \;
+find . -type d -exec chmod 0755 {} \;
+
 # Update the current working set and commit.
 git add . && git commit -m "Merge ${head} (automated)"
 
diff --git a/tools/go_generics/defs.bzl b/tools/go_generics/defs.bzl
index c5be52ecd..8c9995fd4 100644
--- a/tools/go_generics/defs.bzl
+++ b/tools/go_generics/defs.bzl
@@ -105,7 +105,6 @@ def _go_template_instance_impl(ctx):
         executable = ctx.executable._tool,
     )
 
-    # TODO: How can we get the dependencies out?
     return struct(
         files = depset([output]),
     )
diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go
index e9cc2c753..0860ca9db 100644
--- a/tools/go_generics/generics.go
+++ b/tools/go_generics/generics.go
@@ -223,7 +223,9 @@ func main() {
 		} else {
 			switch kind {
 			case globals.KindType, globals.KindVar, globals.KindConst, globals.KindFunction:
-				ident.Name = *prefix + ident.Name + *suffix
+				if ident.Name != "_" {
+					ident.Name = *prefix + ident.Name + *suffix
+				}
 			case globals.KindTag:
 				// Modify the state tag appropriately.
 				if m := stateTagRegexp.FindStringSubmatch(ident.Name); m != nil {
diff --git a/tools/go_marshal/BUILD b/tools/go_marshal/BUILD
index 80d9c0504..be49cf9c8 100644
--- a/tools/go_marshal/BUILD
+++ b/tools/go_marshal/BUILD
@@ -12,3 +12,8 @@ go_binary(
         "//tools/go_marshal/gomarshal",
     ],
 )
+
+config_setting(
+    name = "marshal_config_verbose",
+    values = {"define": "gomarshal=verbose"},
+)
diff --git a/tools/go_marshal/analysis/analysis_unsafe.go b/tools/go_marshal/analysis/analysis_unsafe.go
index 9a9a4f298..cd55cf5cb 100644
--- a/tools/go_marshal/analysis/analysis_unsafe.go
+++ b/tools/go_marshal/analysis/analysis_unsafe.go
@@ -161,6 +161,10 @@ func AlignmentCheck(t *testing.T, typ reflect.Type) (ok bool, delta uint64) {
 		if typ.NumField() > 0 && nextXOff != int(typ.Size()) {
 			implicitPad := int(typ.Size()) - nextXOff
 			f := typ.Field(typ.NumField() - 1) // Final field
+			if tag, ok := f.Tag.Lookup("marshal"); ok && tag == "unaligned" {
+				// Final field explicitly marked unaligned.
+				break
+			}
 			t.Fatalf("Suspect offset for field %s.%s at the end of %s, detected an implicit %d byte padding from offset %d to %d at the end of the struct; either add %d bytes of explict padding at end of the struct or tag the final field %s as `marshal:\"unaligned\"`.",
 				typ.Name(), f.Name, typ.Name(), implicitPad, nextXOff, typ.Size(), implicitPad, f.Name)
 		}
diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl
index d79786a68..323e33882 100644
--- a/tools/go_marshal/defs.bzl
+++ b/tools/go_marshal/defs.bzl
@@ -53,9 +53,10 @@ go_marshal = rule(
 
 # marshal_deps are the dependencies requied by generated code.
 marshal_deps = [
-    "//tools/go_marshal/marshal",
+    "//pkg/gohacks",
     "//pkg/safecopy",
     "//pkg/usermem",
+    "//tools/go_marshal/marshal",
 ]
 
 # marshal_test_deps are required by test targets.
diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD
index b5d5a4487..44cb33ae4 100644
--- a/tools/go_marshal/gomarshal/BUILD
+++ b/tools/go_marshal/gomarshal/BUILD
@@ -7,6 +7,9 @@ go_library(
     srcs = [
         "generator.go",
         "generator_interfaces.go",
+        "generator_interfaces_array_newtype.go",
+        "generator_interfaces_primitive_newtype.go",
+        "generator_interfaces_struct.go",
         "generator_tests.go",
         "util.go",
     ],
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 01be7c477..177013dbb 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -28,12 +28,6 @@ import (
 	"gvisor.dev/gvisor/tools/tags"
 )
 
-const (
-	marshalImport  = "gvisor.dev/gvisor/tools/go_marshal/marshal"
-	safecopyImport = "gvisor.dev/gvisor/pkg/safecopy"
-	usermemImport  = "gvisor.dev/gvisor/pkg/usermem"
-)
-
 // List of identifiers we use in generated code that may conflict with a
 // similarly-named source identifier. Abort gracefully when we see these to
 // avoid potentially confusing compilation failures in generated code.
@@ -44,7 +38,8 @@ const (
 // All recievers are single letters, so we don't allow import aliases to be a
 // single letter.
 var badIdents = []string{
-	"addr", "blk", "buf", "dst", "dsts", "err", "hdr", "len", "ptr", "src", "srcs", "task", "val",
+	"addr", "blk", "buf", "dst", "dsts", "count", "err", "hdr", "idx", "inner",
+	"length", "limit", "ptr", "size", "src", "srcs", "task", "val",
 	// All single-letter identifiers.
 }
 
@@ -101,15 +96,18 @@ func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*G
 		// used, so that they're always added to the generated code.
 		g.imports.add(i).markUsed()
 	}
-	g.imports.add(marshalImport).markUsed()
+
 	// The following imports may or may not be used by the generated code,
 	// depending on what's required for the target types. Don't mark these as
 	// used by default.
+	g.imports.add("io")
 	g.imports.add("reflect")
 	g.imports.add("runtime")
-	g.imports.add(safecopyImport)
 	g.imports.add("unsafe")
-	g.imports.add(usermemImport)
+	g.imports.add("gvisor.dev/gvisor/pkg/gohacks")
+	g.imports.add("gvisor.dev/gvisor/pkg/safecopy")
+	g.imports.add("gvisor.dev/gvisor/pkg/usermem")
+	g.imports.add("gvisor.dev/gvisor/tools/go_marshal/marshal")
 
 	return &g, nil
 }
@@ -123,7 +121,7 @@ func (g *Generator) writeHeader() error {
 	// Emit build tags.
 	if t := tags.Aggregate(g.inputs); len(t) > 0 {
 		b.emit(strings.Join(t.Lines(), "\n"))
-		b.emit("\n")
+		b.emit("\n\n")
 	}
 
 	// Package header.
@@ -191,10 +189,73 @@ func (g *Generator) parse() ([]*ast.File, []*token.FileSet, error) {
 	return files, fsets, nil
 }
 
-// collectMarshallabeTypes walks the parsed AST and collects a list of type
+// sliceAPI carries information about the '+marshal slice' directive.
+type sliceAPI struct {
+	// Comment node in the AST containing the +marshal tag.
+	comment *ast.Comment
+	// Identifier fragment to use when naming generated functions for the slice
+	// API.
+	ident string
+	// Whether the generated functions should reference the newtype name, or the
+	// inner type name. Only meaningful on newtype declarations on primitives.
+	inner bool
+}
+
+// marshallableType carries information about a type marked with the '+marshal'
+// directive.
+type marshallableType struct {
+	spec  *ast.TypeSpec
+	slice *sliceAPI
+}
+
+func newMarshallableType(fset *token.FileSet, tagLine *ast.Comment, spec *ast.TypeSpec) marshallableType {
+	mt := marshallableType{
+		spec:  spec,
+		slice: nil,
+	}
+
+	var unhandledTags []string
+
+	for _, tag := range strings.Fields(strings.TrimPrefix(tagLine.Text, "// +marshal")) {
+		if strings.HasPrefix(tag, "slice:") {
+			tokens := strings.Split(tag, ":")
+			if len(tokens) < 2 || len(tokens) > 3 {
+				abortAt(fset.Position(tagLine.Slash), fmt.Sprintf("+marshal directive has invalid 'slice' clause. Expecting format 'slice:<IDENTIFIER>[:inner]', got '%v'", tag))
+			}
+			if len(tokens[1]) == 0 {
+				abortAt(fset.Position(tagLine.Slash), "+marshal slice directive has empty identifier argument. Expecting '+marshal slice:identifier'")
+			}
+
+			sa := &sliceAPI{
+				comment: tagLine,
+				ident:   tokens[1],
+			}
+			mt.slice = sa
+
+			if len(tokens) == 3 {
+				if tokens[2] != "inner" {
+					abortAt(fset.Position(tagLine.Slash), "+marshal slice directive has an invalid argument. Expecting '+marshal slice:<IDENTIFIER>[:inner]'")
+				}
+				sa.inner = true
+			}
+
+			continue
+		}
+
+		unhandledTags = append(unhandledTags, tag)
+	}
+
+	if len(unhandledTags) > 0 {
+		abortAt(fset.Position(tagLine.Slash), fmt.Sprintf("+marshal directive contained the following unknown clauses: %v", strings.Join(unhandledTags, " ")))
+	}
+
+	return mt
+}
+
+// collectMarshallableTypes walks the parsed AST and collects a list of type
 // declarations for which we need to generate the Marshallable interface.
-func (g *Generator) collectMarshallabeTypes(a *ast.File, f *token.FileSet) []*ast.TypeSpec {
-	var types []*ast.TypeSpec
+func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) []marshallableType {
+	var types []marshallableType
 	for _, decl := range a.Decls {
 		gdecl, ok := decl.(*ast.GenDecl)
 		// Type declaration?
@@ -209,9 +270,11 @@ func (g *Generator) collectMarshallabeTypes(a *ast.File, f *token.FileSet) []*as
 		}
 		// Does the comment contain a "+marshal" line?
 		marked := false
+		var tagLine *ast.Comment
 		for _, c := range gdecl.Doc.List {
-			if c.Text == "// +marshal" {
+			if strings.HasPrefix(c.Text, "// +marshal") {
 				marked = true
+				tagLine = c
 				break
 			}
 		}
@@ -220,14 +283,23 @@ func (g *Generator) collectMarshallabeTypes(a *ast.File, f *token.FileSet) []*as
 			continue
 		}
 		for _, spec := range gdecl.Specs {
-			// We already confirmed we're in a type declaration earlier.
+			// We already confirmed we're in a type declaration earlier, so this
+			// cast will succeed.
 			t := spec.(*ast.TypeSpec)
-			if _, ok := t.Type.(*ast.StructType); ok {
-				debugfAt(f.Position(t.Pos()), "Collected marshallable type %s.\n", t.Name.Name)
-				types = append(types, t)
-				continue
+			switch t.Type.(type) {
+			case *ast.StructType:
+				debugfAt(f.Position(t.Pos()), "Collected marshallable struct %s.\n", t.Name.Name)
+			case *ast.Ident: // Newtype on primitive.
+				debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on primitive %s.\n", t.Name.Name)
+			case *ast.ArrayType: // Newtype on array.
+				debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on array %s.\n", t.Name.Name)
+			default:
+				// A user specifically requested marshalling on this type, but we
+				// don't support it.
+				abortAt(f.Position(t.Pos()), fmt.Sprintf("Marshalling codegen was requested on type '%s', but go-marshal doesn't support this kind of declaration.\n", t.Name))
 			}
-			debugf("Skipping declaration %v since it's not a struct declaration.\n", gdecl)
+			types = append(types, newMarshallableType(f, tagLine, t))
+
 		}
 	}
 	return types
@@ -254,7 +326,7 @@ func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]imp
 
 			// Make sure we have an import that doesn't use any local names that
 			// would conflict with identifiers in the generated code.
-			if len(i.name) == 1 {
+			if len(i.name) == 1 && i.name != "_" {
 				abortAt(f.Position(spec.Pos()), fmt.Sprintf("Import has a single character local name '%s'; this may conflict with code generated by go_marshal, use a multi-character import alias", i.name))
 			}
 			if _, ok := badIdentsMap[i.name]; ok {
@@ -266,20 +338,40 @@ func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]imp
 
 }
 
-func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator {
-	// We're guaranteed to have only struct type specs by now. See
-	// Generator.collectMarshallabeTypes.
-	i := newInterfaceGenerator(t, fset)
-	i.validate()
-	i.emitMarshallable()
+func (g *Generator) generateOne(t marshallableType, fset *token.FileSet) *interfaceGenerator {
+	i := newInterfaceGenerator(t.spec, fset)
+	switch ty := t.spec.Type.(type) {
+	case *ast.StructType:
+		i.validateStruct(t.spec, ty)
+		i.emitMarshallableForStruct(ty)
+		if t.slice != nil {
+			i.emitMarshallableSliceForStruct(ty, t.slice)
+		}
+	case *ast.Ident:
+		i.validatePrimitiveNewtype(ty)
+		i.emitMarshallableForPrimitiveNewtype(ty)
+		if t.slice != nil {
+			i.emitMarshallableSliceForPrimitiveNewtype(ty, t.slice)
+		}
+	case *ast.ArrayType:
+		i.validateArrayNewtype(t.spec.Name, ty)
+		// After validate, we can safely call arrayLen.
+		i.emitMarshallableForArrayNewtype(t.spec.Name, ty, ty.Elt.(*ast.Ident))
+		if t.slice != nil {
+			abortAt(fset.Position(t.slice.comment.Slash), fmt.Sprintf("Array type marked as '+marshal slice:...', but this is not supported. Perhaps fold one of the dimensions?"))
+		}
+	default:
+		// This should've been filtered out by collectMarshallabeTypes.
+		panic(fmt.Sprintf("Unexpected type %+v", ty))
+	}
 	return i
 }
 
 // generateOneTestSuite generates a test suite for the automatically generated
 // implementations type t.
-func (g *Generator) generateOneTestSuite(t *ast.TypeSpec) *testGenerator {
-	i := newTestGenerator(t)
-	i.emitTests()
+func (g *Generator) generateOneTestSuite(t marshallableType) *testGenerator {
+	i := newTestGenerator(t.spec)
+	i.emitTests(t.slice)
 	return i
 }
 
@@ -318,7 +410,7 @@ func (g *Generator) Run() error {
 	for i, a := range asts {
 		// Collect type declarations marked for code generation and generate
 		// Marshallable interfaces.
-		for _, t := range g.collectMarshallabeTypes(a, fsets[i]) {
+		for _, t := range g.collectMarshallableTypes(a, fsets[i]) {
 			impl := g.generateOne(t, fsets[i])
 			// Collect Marshallable types referenced by the generated code.
 			for ref, _ := range impl.ms {
@@ -329,24 +421,13 @@ func (g *Generator) Run() error {
 			// the list of imports we need to copy to the generated code.
 			for name, _ := range impl.is {
 				if !g.imports.markUsed(name) {
-					panic(fmt.Sprintf("Generated code for '%s' referenced a non-existent import with local name '%s'", impl.typeName(), name))
+					panic(fmt.Sprintf("Generated code for '%s' referenced a non-existent import with local name '%s'. Either go-marshal needs to add an import to the generated file, or a package in an input source file has a package name differ from the final component of its path, which go-marshal doesn't know how to detect; use an import alias to work around this limitation.", impl.typeName(), name))
 				}
 			}
 			ts = append(ts, g.generateOneTestSuite(t))
 		}
 	}
 
-	// Tool was invoked with input files with no data structures marked for code
-	// generation. This is probably not what the user intended.
-	if len(impls) == 0 {
-		var buf bytes.Buffer
-		fmt.Fprintf(&buf, "go_marshal invoked on these files, but they don't contain any types requiring code generation. Perhaps mark some with \"// +marshal\"?:\n")
-		for _, i := range g.inputs {
-			fmt.Fprintf(&buf, "  %s\n", i)
-		}
-		abort(buf.String())
-	}
-
 	// Write output file header. These include things like package name and
 	// import statements.
 	if err := g.writeHeader(); err != nil {
@@ -389,6 +470,26 @@ func (g *Generator) writeTests(ts []*testGenerator) error {
 	}
 
 	// Write test functions.
+
+	// If we didn't generate any Marshallable implementations, we can't just
+	// emit an empty test file, since that causes the build to fail with "no
+	// tests/benchmarks/examples found". Unfortunately we can't signal bazel to
+	// omit the entire package since the outputs are already defined before
+	// go-marshal is called. If we'd otherwise emit an empty test suite, emit an
+	// empty example instead.
+	if len(ts) == 0 {
+		b.reset()
+		b.emit("func Example() {\n")
+		b.inIndent(func() {
+			b.emit("// This example is intentionally empty to ensure this file contains at least\n")
+			b.emit("// one testable entity. go-marshal is forced to emit a test file if a package\n")
+			b.emit("// is marked marshallable, but emitting a test file with no entities results\n")
+			b.emit("// in a build failure.\n")
+		})
+		b.emit("}\n")
+		return b.write(g.outputTest)
+	}
+
 	for _, t := range ts {
 		if err := t.write(g.outputTest); err != nil {
 			return err
diff --git a/tools/go_marshal/gomarshal/generator_interfaces.go b/tools/go_marshal/gomarshal/generator_interfaces.go
index f25331ac5..e3c3dac63 100644
--- a/tools/go_marshal/gomarshal/generator_interfaces.go
+++ b/tools/go_marshal/gomarshal/generator_interfaces.go
@@ -55,9 +55,6 @@ func (g *interfaceGenerator) typeName() string {
 
 // newinterfaceGenerator creates a new interface generator.
 func newInterfaceGenerator(t *ast.TypeSpec, fset *token.FileSet) *interfaceGenerator {
-	if _, ok := t.Type.(*ast.StructType); !ok {
-		panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t))
-	}
 	g := &interfaceGenerator{
 		t:  t,
 		r:  receiverName(t),
@@ -77,25 +74,12 @@ func (g *interfaceGenerator) recordUsedMarshallable(m string) {
 
 func (g *interfaceGenerator) recordUsedImport(i string) {
 	g.is[i] = struct{}{}
-
 }
 
 func (g *interfaceGenerator) recordPotentiallyNonPackedField(fieldName string) {
 	g.as[fieldName] = struct{}{}
 }
 
-func (g *interfaceGenerator) forEachField(fn func(f *ast.Field)) {
-	// This is guaranteed to succeed because g.t is always a struct.
-	st := g.t.Type.(*ast.StructType)
-	for _, field := range st.Fields.List {
-		fn(field)
-	}
-}
-
-func (g *interfaceGenerator) fieldAccessor(n *ast.Ident) string {
-	return fmt.Sprintf("%s.%s", g.r, n.Name)
-}
-
 // abortAt aborts the go_marshal tool with the given error message, with a
 // reference position to the input source. Same as abortAt, but uses g to
 // resolve p to position.
@@ -103,67 +87,6 @@ func (g *interfaceGenerator) abortAt(p token.Pos, msg string) {
 	abortAt(g.f.Position(p), msg)
 }
 
-// validate ensures the type we're working with can be marshalled. These checks
-// are done ahead of time and in one place so we can make assumptions later.
-func (g *interfaceGenerator) validate() {
-	g.forEachField(func(f *ast.Field) {
-		if len(f.Names) == 0 {
-			g.abortAt(f.Pos(), "Cannot marshal structs with embedded fields, give the field a name; use '_' for anonymous fields such as padding fields")
-		}
-	})
-
-	g.forEachField(func(f *ast.Field) {
-		fieldDispatcher{
-			primitive: func(_, t *ast.Ident) {
-				switch t.Name {
-				case "int8", "uint8", "byte", "int16", "uint16", "int32", "uint32", "int64", "uint64":
-					// These are the only primitive types we're allow. Below, we
-					// provide suggestions for some disallowed types and reject
-					// them, then attempt to marshal any remaining types by
-					// invoking the marshal.Marshallable interface on them. If
-					// these types don't actually implement
-					// marshal.Marshallable, compilation of the generated code
-					// will fail with an appropriate error message.
-					return
-				case "int":
-					g.abortAt(f.Pos(), "Type 'int' has ambiguous width, use int32 or int64")
-				case "uint":
-					g.abortAt(f.Pos(), "Type 'uint' has ambiguous width, use uint32 or uint64")
-				case "string":
-					g.abortAt(f.Pos(), "Type 'string' is dynamically-sized and cannot be marshalled, use a fixed size byte array '[...]byte' instead")
-				default:
-					debugfAt(g.f.Position(f.Pos()), fmt.Sprintf("Found derived type '%s', will attempt dispatch via marshal.Marshallable.\n", t.Name))
-				}
-			},
-			selector: func(_, _, _ *ast.Ident) {
-				// No validation to perform on selector fields. However this
-				// callback must still be provided.
-			},
-			array: func(n, _ *ast.Ident, len int) {
-				a := f.Type.(*ast.ArrayType)
-				if a.Len == nil {
-					g.abortAt(f.Pos(), fmt.Sprintf("Dynamically sized slice '%s' cannot be marshalled, arrays must be statically sized", n.Name))
-				}
-
-				if _, ok := a.Len.(*ast.BasicLit); !ok {
-					g.abortAt(a.Len.Pos(), fmt.Sprintf("Array size must be a literal, don's use consts or expressions"))
-				}
-
-				if _, ok := a.Elt.(*ast.Ident); !ok {
-					g.abortAt(a.Elt.Pos(), fmt.Sprintf("Marshalling not supported for arrays with %s elements, array elements must be primitive types", kindString(a.Elt)))
-				}
-
-				if len <= 0 {
-					g.abortAt(a.Len.Pos(), fmt.Sprintf("Marshalling not supported for zero length arrays, why does an ABI struct have one?"))
-				}
-			},
-			unhandled: func(_ *ast.Ident) {
-				g.abortAt(f.Pos(), fmt.Sprintf("Marshalling not supported for %s fields", kindString(f.Type)))
-			},
-		}.dispatch(f)
-	})
-}
-
 // scalarSize returns the size of type identified by t. If t isn't a primitive
 // type, the size isn't known at code generation time, and must be resolved via
 // the marshal.Marshallable interface.
@@ -190,7 +113,8 @@ func (g *interfaceGenerator) shiftDynamic(bufVar, name string) {
 	g.emit("%s = %s[%s.SizeBytes():]\n", bufVar, bufVar, name)
 }
 
-func (g *interfaceGenerator) marshalScalar(accessor, typ string, bufVar string) {
+// marshalScalar writes a single scalar to a byte slice.
+func (g *interfaceGenerator) marshalScalar(accessor, typ, bufVar string) {
 	switch typ {
 	case "int8", "uint8", "byte":
 		g.emit("%s[0] = byte(%s)\n", bufVar, accessor)
@@ -213,43 +137,26 @@ func (g *interfaceGenerator) marshalScalar(accessor, typ string, bufVar string)
 	}
 }
 
-func (g *interfaceGenerator) unmarshalScalar(accessor, typ string, bufVar string) {
+// unmarshalScalar reads a single scalar from a byte slice.
+func (g *interfaceGenerator) unmarshalScalar(accessor, typ, bufVar string) {
 	switch typ {
-	case "int8":
-		g.emit("%s = int8(%s[0])\n", accessor, bufVar)
-		g.shift(bufVar, 1)
-	case "uint8":
-		g.emit("%s = uint8(%s[0])\n", accessor, bufVar)
-		g.shift(bufVar, 1)
 	case "byte":
 		g.emit("%s = %s[0]\n", accessor, bufVar)
 		g.shift(bufVar, 1)
-
-	case "int16":
-		g.recordUsedImport("usermem")
-		g.emit("%s = int16(usermem.ByteOrder.Uint16(%s[:2]))\n", accessor, bufVar)
-		g.shift(bufVar, 2)
-	case "uint16":
+	case "int8", "uint8":
+		g.emit("%s = %s(%s[0])\n", accessor, typ, bufVar)
+		g.shift(bufVar, 1)
+	case "int16", "uint16":
 		g.recordUsedImport("usermem")
-		g.emit("%s = usermem.ByteOrder.Uint16(%s[:2])\n", accessor, bufVar)
+		g.emit("%s = %s(usermem.ByteOrder.Uint16(%s[:2]))\n", accessor, typ, bufVar)
 		g.shift(bufVar, 2)
-
-	case "int32":
-		g.recordUsedImport("usermem")
-		g.emit("%s = int32(usermem.ByteOrder.Uint32(%s[:4]))\n", accessor, bufVar)
-		g.shift(bufVar, 4)
-	case "uint32":
+	case "int32", "uint32":
 		g.recordUsedImport("usermem")
-		g.emit("%s = usermem.ByteOrder.Uint32(%s[:4])\n", accessor, bufVar)
+		g.emit("%s = %s(usermem.ByteOrder.Uint32(%s[:4]))\n", accessor, typ, bufVar)
 		g.shift(bufVar, 4)
-
-	case "int64":
-		g.recordUsedImport("usermem")
-		g.emit("%s = int64(usermem.ByteOrder.Uint64(%s[:8]))\n", accessor, bufVar)
-		g.shift(bufVar, 8)
-	case "uint64":
+	case "int64", "uint64":
 		g.recordUsedImport("usermem")
-		g.emit("%s = usermem.ByteOrder.Uint64(%s[:8])\n", accessor, bufVar)
+		g.emit("%s = %s(usermem.ByteOrder.Uint64(%s[:8]))\n", accessor, typ, bufVar)
 		g.shift(bufVar, 8)
 	default:
 		g.emit("%s.UnmarshalBytes(%s[:%s.SizeBytes()])\n", accessor, bufVar, accessor)
@@ -258,348 +165,112 @@ func (g *interfaceGenerator) unmarshalScalar(accessor, typ string, bufVar string
 	}
 }
 
-// areFieldsPackedExpression returns a go expression checking whether g.t's fields are
-// packed. Returns "", false if g.t has no fields that may be potentially
-// packed, otherwise returns <clause>, true, where <clause> is an expression
-// like "t.a.Packed() && t.b.Packed() && t.c.Packed()".
-func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) {
-	if len(g.as) == 0 {
-		return "", false
-	}
-
-	cs := make([]string, 0, len(g.as))
-	for accessor, _ := range g.as {
-		cs = append(cs, fmt.Sprintf("%s.Packed()", accessor))
-	}
-	return strings.Join(cs, " && "), true
+// emitCastToByteSlice unsafely casts an arbitrary type's underlying memory to a
+// byte slice, bypassing escape analysis. The caller is responsible for ensuring
+// srcPtr lives until they're done with dstVar, the runtime does not consider
+// dstVar dependent on srcPtr due to the escape analysis bypass.
+//
+// srcPtr must be a pointer.
+//
+// This function uses internally uses the identifier "hdr", and cannot be used
+// in a context where it is already bound.
+func (g *interfaceGenerator) emitCastToByteSlice(srcPtr, dstVar, lenExpr string) {
+	g.recordUsedImport("gohacks")
+	g.emit("// Construct a slice backed by dst's underlying memory.\n")
+	g.emit("var %s []byte\n", dstVar)
+	g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&%s))\n", dstVar)
+	g.emit("hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(%s)))\n", srcPtr)
+	g.emit("hdr.Len = %s\n", lenExpr)
+	g.emit("hdr.Cap = %s\n\n", lenExpr)
 }
 
-func (g *interfaceGenerator) emitMarshallable() {
-	// Is g.t a packed struct without consideing field types?
-	thisPacked := true
-	g.forEachField(func(f *ast.Field) {
-		if f.Tag != nil {
-			if f.Tag.Value == "`marshal:\"unaligned\"`" {
-				if thisPacked {
-					debugfAt(g.f.Position(g.t.Pos()),
-						fmt.Sprintf("Marking type '%s' as not packed due to tag `marshal:\"unaligned\"`.\n", g.t.Name))
-					thisPacked = false
-				}
-			}
-		}
-	})
-
-	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
-	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		primitiveSize := 0
-		var dynamicSizeTerms []string
-
-		g.forEachField(fieldDispatcher{
-			primitive: func(n, t *ast.Ident) {
-				if size, dynamic := g.scalarSize(t); !dynamic {
-					primitiveSize += size
-				} else {
-					g.recordUsedMarshallable(t.Name)
-					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("%s.SizeBytes()", g.fieldAccessor(n)))
-				}
-			},
-			selector: func(n, tX, tSel *ast.Ident) {
-				tName := fmt.Sprintf("%s.%s", tX.Name, tSel.Name)
-				g.recordUsedImport(tX.Name)
-				g.recordUsedMarshallable(tName)
-				dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", tName))
-			},
-			array: func(n, t *ast.Ident, len int) {
-				if len < 1 {
-					// Zero-length arrays should've been rejected by validate().
-					panic("unreachable")
-				}
-				if size, dynamic := g.scalarSize(t); !dynamic {
-					primitiveSize += size * len
-				} else {
-					g.recordUsedMarshallable(t.Name)
-					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()*%d", t.Name, len))
-				}
-			},
-		}.dispatch)
-		g.emit("return %d", primitiveSize)
-		if len(dynamicSizeTerms) > 0 {
-			g.incIndent()
-		}
-		{
-			for _, d := range dynamicSizeTerms {
-				g.emitNoIndent(" +\n")
-				g.emit(d)
-			}
-		}
-		if len(dynamicSizeTerms) > 0 {
-			g.decIndent()
-		}
-	})
-	g.emit("\n}\n\n")
-
-	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
-	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.forEachField(fieldDispatcher{
-			primitive: func(n, t *ast.Ident) {
-				if n.Name == "_" {
-					g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", t.Name, t.Name)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("dst", len)
-					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can referece here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("dst = dst[(*%s)(nil).SizeBytes():]\n", t.Name)
-					}
-					return
-				}
-				g.marshalScalar(g.fieldAccessor(n), t.Name, "dst")
-			},
-			selector: func(n, tX, tSel *ast.Ident) {
-				g.marshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
-			},
-			array: func(n, t *ast.Ident, size int) {
-				if n.Name == "_" {
-					g.emit("// Padding: dst[:sizeof(%s)*%d] ~= [%d]%s{0}\n", t.Name, size, size, t.Name)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("dst", len*size)
-					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can reference here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("dst = dst[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
-					}
-					return
-				}
-
-				g.emit("for i := 0; i < %d; i++ {\n", size)
-				g.inIndent(func() {
-					g.marshalScalar(fmt.Sprintf("%s[i]", g.fieldAccessor(n)), t.Name, "dst")
-				})
-				g.emit("}\n")
-			},
-		}.dispatch)
-	})
-	g.emit("}\n\n")
-
-	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
-	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		g.forEachField(fieldDispatcher{
-			primitive: func(n, t *ast.Ident) {
-				if n.Name == "_" {
-					g.emit("// Padding: var _ %s ~= src[:sizeof(%s)]\n", t.Name, t.Name)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("src", len)
-					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can reference here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("src = src[(*%s)(nil).SizeBytes():]\n", t.Name)
-						g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s)(nil)", t.Name))
-					}
-					return
-				}
-				g.unmarshalScalar(g.fieldAccessor(n), t.Name, "src")
-			},
-			selector: func(n, tX, tSel *ast.Ident) {
-				g.unmarshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
-			},
-			array: func(n, t *ast.Ident, size int) {
-				if n.Name == "_" {
-					g.emit("// Padding: ~ copy([%d]%s(%s), src[:sizeof(%s)*%d])\n", size, t.Name, g.fieldAccessor(n), t.Name, size)
-					if len, dynamic := g.scalarSize(t); !dynamic {
-						g.shift("src", len*size)
-					} else {
-						// We can't use shiftDynamic here because we don't have
-						// an instance of the dynamic type we can referece here
-						// (since the version in this struct is anonymous). Use
-						// a typed nil pointer to call SizeBytes() instead.
-						g.emit("src = src[(*%s)(nil).SizeBytes()*%d:]\n", t.Name, size)
-					}
-					return
-				}
-
-				g.emit("for i := 0; i < %d; i++ {\n", size)
-				g.inIndent(func() {
-					g.unmarshalScalar(fmt.Sprintf("%s[i]", g.fieldAccessor(n)), t.Name, "src")
-				})
-				g.emit("}\n")
-			},
-		}.dispatch)
-	})
-	g.emit("}\n\n")
-
-	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
-	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		expr, fieldsMaybePacked := g.areFieldsPackedExpression()
-		switch {
-		case !thisPacked:
-			g.emit("return false\n")
-		case fieldsMaybePacked:
-			g.emit("return %s\n", expr)
-		default:
-			g.emit("return true\n")
-
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
-	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		if thisPacked {
-			g.recordUsedImport("safecopy")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if %s {\n", cond)
-				g.inIndent(func() {
-					g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
-				})
-				g.emit("} else {\n")
-				g.inIndent(func() {
-					g.emit("%s.MarshalBytes(dst)\n", g.r)
-				})
-				g.emit("}\n")
-			} else {
-				g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
-			}
-		} else {
-			g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName())
-			g.emit("%s.MarshalBytes(dst)\n", g.r)
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
-	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		if thisPacked {
-			g.recordUsedImport("safecopy")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if %s {\n", cond)
-				g.inIndent(func() {
-					g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
-				})
-				g.emit("} else {\n")
-				g.inIndent(func() {
-					g.emit("%s.UnmarshalBytes(src)\n", g.r)
-				})
-				g.emit("}\n")
-			} else {
-				g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
-			}
-		} else {
-			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
-			g.emit("%s.UnmarshalBytes(src)\n", g.r)
-		}
-	})
-	g.emit("}\n\n")
-
-	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
-	g.recordUsedImport("marshal")
-	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		fallback := func() {
-			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("%s.MarshalBytes(buf)\n", g.r)
-			g.emit("return task.CopyOutBytes(addr, buf)\n")
-		}
-		if thisPacked {
-			g.recordUsedImport("reflect")
-			g.recordUsedImport("runtime")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if !%s {\n", cond)
-				g.inIndent(fallback)
-				g.emit("}\n\n")
-			}
-			// Fast serialization.
-			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-			g.emit("val := uintptr(ptr)\n")
-			g.emit("val = val^0\n\n")
+// emitCastToByteSlice unsafely casts a slice with elements of an abitrary type
+// to a byte slice. As part of the cast, the byte slice is made to look
+// independent of the src slice by bypassing escape analysis. This means the
+// byte slice can be used without causing the source to escape. The caller is
+// responsible for ensuring srcPtr lives until they're done with dstVar, as the
+// runtime no longer considers dstVar dependent on srcPtr and is free to GC it.
+//
+// srcPtr must be a pointer.
+//
+// This function uses internally uses the identifiers "ptr", "val" and "hdr",
+// and cannot be used in a context where these identifiers are already bound.
+func (g *interfaceGenerator) emitCastSliceToByteSlice(srcPtr, dstVar, lenExpr string) {
+	g.emitNoEscapeSliceDataPointer(srcPtr, "val")
+
+	g.emit("// Construct a slice backed by dst's underlying memory.\n")
+	g.emit("var %s []byte\n", dstVar)
+	g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&%s))\n", dstVar)
+	g.emit("hdr.Data = uintptr(val)\n")
+	g.emit("hdr.Len = %s\n", lenExpr)
+	g.emit("hdr.Cap = %s\n\n", lenExpr)
+}
 
-			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-			g.emit("var buf []byte\n")
-			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-			g.emit("hdr.Data = val\n")
-			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+// emitNoEscapeSliceDataPointer unsafely casts a slice's data pointer to an
+// unsafe.Pointer, bypassing escape analysis. The caller is responsible for
+// ensuring srcPtr lives until they're done with dstVar, as the runtime no
+// longer considers dstVar dependent on srcPtr and is free to GC it.
+//
+// srcPtr must be a pointer.
+//
+// This function uses internally uses the identifier "ptr" cannot be used in a
+// context where this identifier is already bound.
+func (g *interfaceGenerator) emitNoEscapeSliceDataPointer(srcPtr, dstVar string) {
+	g.recordUsedImport("gohacks")
+	g.emit("ptr := unsafe.Pointer(%s)\n", srcPtr)
+	g.emit("%s := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data))\n\n", dstVar)
+}
 
-			g.emit("len, err := task.CopyOutBytes(addr, buf)\n")
-			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-			g.emit("// must live until after the CopyOutBytes.\n")
-			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return len, err\n")
-		} else {
-			fallback()
-		}
-	})
-	g.emit("}\n\n")
+func (g *interfaceGenerator) emitKeepAlive(ptrVar string) {
+	g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", ptrVar)
+	g.emit("// must live until the use above.\n")
+	g.emit("runtime.KeepAlive(%s)\n", ptrVar)
+}
 
-	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
-	g.recordUsedImport("marshal")
-	g.recordUsedImport("usermem")
-	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
-	g.inIndent(func() {
-		fallback := func() {
-			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
-			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes())\n", g.r)
-			g.emit("n, err := task.CopyInBytes(addr, buf)\n")
-			g.emit("if err != nil {\n")
-			g.inIndent(func() {
-				g.emit("return n, err\n")
-			})
-			g.emit("}\n")
+func (g *interfaceGenerator) expandBinaryExpr(b *strings.Builder, e *ast.BinaryExpr) {
+	switch x := e.X.(type) {
+	case *ast.BinaryExpr:
+		// Recursively expand sub-expression.
+		g.expandBinaryExpr(b, x)
+	case *ast.Ident:
+		fmt.Fprintf(b, "%s", x.Name)
+	case *ast.BasicLit:
+		fmt.Fprintf(b, "%s", x.Value)
+	default:
+		g.abortAt(e.Pos(), "Cannot convert binary expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers")
+	}
 
-			g.emit("%s.UnmarshalBytes(buf)\n", g.r)
-			g.emit("return n, nil\n")
-		}
-		if thisPacked {
-			g.recordUsedImport("reflect")
-			g.recordUsedImport("runtime")
-			g.recordUsedImport("unsafe")
-			if cond, ok := g.areFieldsPackedExpression(); ok {
-				g.emit("if !%s {\n", cond)
-				g.inIndent(fallback)
-				g.emit("}\n\n")
-			}
-			// Fast deserialization.
-			g.emit("// Bypass escape analysis on %s. The no-op arithmetic operation on the\n", g.r)
-			g.emit("// pointer makes the compiler think val doesn't depend on %s.\n", g.r)
-			g.emit("// See src/runtime/stubs.go:noescape() in the golang toolchain.\n")
-			g.emit("ptr := unsafe.Pointer(%s)\n", g.r)
-			g.emit("val := uintptr(ptr)\n")
-			g.emit("val = val^0\n\n")
+	fmt.Fprintf(b, "%s", e.Op)
 
-			g.emit("// Construct a slice backed by %s's underlying memory.\n", g.r)
-			g.emit("var buf []byte\n")
-			g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))\n")
-			g.emit("hdr.Data = val\n")
-			g.emit("hdr.Len = %s.SizeBytes()\n", g.r)
-			g.emit("hdr.Cap = %s.SizeBytes()\n\n", g.r)
+	switch y := e.Y.(type) {
+	case *ast.BinaryExpr:
+		// Recursively expand sub-expression.
+		g.expandBinaryExpr(b, y)
+	case *ast.Ident:
+		fmt.Fprintf(b, "%s", y.Name)
+	case *ast.BasicLit:
+		fmt.Fprintf(b, "%s", y.Value)
+	default:
+		g.abortAt(e.Pos(), "Cannot convert binary expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers")
+	}
+}
 
-			g.emit("len, err := task.CopyInBytes(addr, buf)\n")
-			g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", g.r)
-			g.emit("// must live until after the CopyInBytes.\n")
-			g.emit("runtime.KeepAlive(%s)\n", g.r)
-			g.emit("return len, err\n")
-		} else {
-			fallback()
-		}
-	})
-	g.emit("}\n\n")
+// arrayLenExpr returns a string containing a valid golang expression
+// representing the length of array a. The returned expression should be treated
+// as a single value, and will be already parenthesized as required.
+func (g *interfaceGenerator) arrayLenExpr(a *ast.ArrayType) string {
+	var b strings.Builder
+
+	switch l := a.Len.(type) {
+	case *ast.Ident:
+		fmt.Fprintf(&b, "%s", l.Name)
+	case *ast.BasicLit:
+		fmt.Fprintf(&b, "%s", l.Value)
+	case *ast.BinaryExpr:
+		g.expandBinaryExpr(&b, l)
+		return fmt.Sprintf("(%s)", b.String())
+	default:
+		g.abortAt(l.Pos(), "Cannot convert this array len expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers")
+	}
+	return b.String()
 }
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
new file mode 100644
index 000000000..72ef03a22
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go
@@ -0,0 +1,146 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains the bits of the code generator specific to marshalling
+// newtypes on arrays.
+
+package gomarshal
+
+import (
+	"fmt"
+	"go/ast"
+)
+
+func (g *interfaceGenerator) validateArrayNewtype(n *ast.Ident, a *ast.ArrayType) {
+	if a.Len == nil {
+		g.abortAt(a.Pos(), fmt.Sprintf("Dynamically sized slice '%s' cannot be marshalled, arrays must be statically sized", n.Name))
+	}
+
+	if _, ok := a.Elt.(*ast.Ident); !ok {
+		g.abortAt(a.Elt.Pos(), fmt.Sprintf("Marshalling not supported for arrays with %s elements, array elements must be primitive types", kindString(a.Elt)))
+	}
+}
+
+func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *ast.ArrayType, elt *ast.Ident) {
+	g.recordUsedImport("io")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("reflect")
+	g.recordUsedImport("runtime")
+	g.recordUsedImport("safecopy")
+	g.recordUsedImport("unsafe")
+	g.recordUsedImport("usermem")
+
+	lenExpr := g.arrayLenExpr(a)
+
+	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if size, dynamic := g.scalarSize(elt); !dynamic {
+			g.emit("return %d * %s\n", size, lenExpr)
+		} else {
+			g.emit("return (*%s)(nil).SizeBytes() * %s\n", n.Name, lenExpr)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
+	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr)
+		g.inIndent(func() {
+			g.marshalScalar(fmt.Sprintf("%s[idx]", g.r), elt.Name, "dst")
+		})
+		g.emit("}\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
+	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr)
+		g.inIndent(func() {
+			g.unmarshalScalar(fmt.Sprintf("%s[idx]", g.r), elt.Name, "src")
+		})
+		g.emit("}\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Array newtypes are always packed.\n")
+		g.emit("return true\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
+	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
+	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
+	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := w.Write(buf)\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return int64(length), err\n")
+
+	})
+	g.emit("}\n\n")
+}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
new file mode 100644
index 000000000..39f654ea8
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator_interfaces_primitive_newtype.go
@@ -0,0 +1,289 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains the bits of the code generator specific to marshalling
+// newtypes on primitives.
+
+package gomarshal
+
+import (
+	"fmt"
+	"go/ast"
+)
+
+// marshalPrimitiveScalar writes a single primitive variable to a byte
+// slice.
+func (g *interfaceGenerator) marshalPrimitiveScalar(accessor, typ, bufVar string) {
+	switch typ {
+	case "int8", "uint8", "byte":
+		g.emit("%s[0] = byte(*%s)\n", bufVar, accessor)
+	case "int16", "uint16":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint16(%s[:2], uint16(*%s))\n", bufVar, accessor)
+	case "int32", "uint32":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint32(%s[:4], uint32(*%s))\n", bufVar, accessor)
+	case "int64", "uint64":
+		g.recordUsedImport("usermem")
+		g.emit("usermem.ByteOrder.PutUint64(%s[:8], uint64(*%s))\n", bufVar, accessor)
+	default:
+		g.emit("// Explicilty cast to the underlying type before dispatching to\n")
+		g.emit("// MarshalBytes, so we don't recursively call %s.MarshalBytes\n", accessor)
+		g.emit("inner := (*%s)(%s)\n", typ, accessor)
+		g.emit("inner.MarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor)
+	}
+}
+
+// unmarshalPrimitiveScalar read a single primitive variable from a byte slice.
+func (g *interfaceGenerator) unmarshalPrimitiveScalar(accessor, typ, bufVar, typeCast string) {
+	switch typ {
+	case "byte":
+		g.emit("*%s = %s(%s[0])\n", accessor, typeCast, bufVar)
+	case "int8", "uint8":
+		g.emit("*%s = %s(%s(%s[0]))\n", accessor, typeCast, typ, bufVar)
+	case "int16", "uint16":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint16(%s[:2])))\n", accessor, typeCast, typ, bufVar)
+	case "int32", "uint32":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint32(%s[:4])))\n", accessor, typeCast, typ, bufVar)
+	case "int64", "uint64":
+		g.recordUsedImport("usermem")
+		g.emit("*%s = %s(%s(usermem.ByteOrder.Uint64(%s[:8])))\n", accessor, typeCast, typ, bufVar)
+	default:
+		g.emit("// Explicilty cast to the underlying type before dispatching to\n")
+		g.emit("// UnmarshalBytes, so we don't recursively call %s.UnmarshalBytes\n", accessor)
+		g.emit("inner := (*%s)(%s)\n", typ, accessor)
+		g.emit("inner.UnmarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor)
+	}
+}
+
+func (g *interfaceGenerator) validatePrimitiveNewtype(t *ast.Ident) {
+	switch t.Name {
+	case "int8", "uint8", "byte", "int16", "uint16", "int32", "uint32", "int64", "uint64":
+		// These are the only primitive types we're allow. Below, we provide
+		// suggestions for some disallowed types and reject them, then attempt
+		// to marshal any remaining types by invoking the marshal.Marshallable
+		// interface on them. If these types don't actually implement
+		// marshal.Marshallable, compilation of the generated code will fail
+		// with an appropriate error message.
+		return
+	case "int":
+		g.abortAt(t.Pos(), "Type 'int' has ambiguous width, use int32 or int64")
+	case "uint":
+		g.abortAt(t.Pos(), "Type 'uint' has ambiguous width, use uint32 or uint64")
+	case "string":
+		g.abortAt(t.Pos(), "Type 'string' is dynamically-sized and cannot be marshalled, use a fixed size byte array '[...]byte' instead")
+	default:
+		debugfAt(g.f.Position(t.Pos()), fmt.Sprintf("Found derived type '%s', will attempt dispatch via marshal.Marshallable.\n", t.Name))
+	}
+}
+
+// emitMarshallableForPrimitiveNewtype outputs code to implement the
+// marshal.Marshallable interface for a newtype on a primitive. Primitive
+// newtypes are always packed, so we can omit the various fallbacks required for
+// non-packed structs.
+func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident) {
+	g.recordUsedImport("io")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("reflect")
+	g.recordUsedImport("runtime")
+	g.recordUsedImport("safecopy")
+	g.recordUsedImport("unsafe")
+	g.recordUsedImport("usermem")
+
+	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if size, dynamic := g.scalarSize(nt); !dynamic {
+			g.emit("return %d\n", size)
+		} else {
+			g.emit("return (*%s)(nil).SizeBytes()\n", nt.Name)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
+	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.marshalPrimitiveScalar(g.r, nt.Name, "dst")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
+	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.unmarshalPrimitiveScalar(g.r, nt.Name, "src", g.typeName())
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("// Scalar newtypes are always packed.\n")
+		g.emit("return true\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
+	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
+	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
+	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+		g.emit("length, err := w.Write(buf)\n")
+		g.emitKeepAlive(g.r)
+		g.emit("return int64(length), err\n")
+
+	})
+	g.emit("}\n\n")
+}
+
+func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Ident, slice *sliceAPI) {
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.recordUsedImport("reflect")
+	g.recordUsedImport("runtime")
+	g.recordUsedImport("unsafe")
+
+	eltType := g.typeName()
+	if slice.inner {
+		eltType = nt.Name
+	}
+
+	g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, eltType)
+	g.emit("//go:nosplit\n")
+	g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, eltType)
+	g.inIndent(func() {
+		g.emit("count := len(dst)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emitCastSliceToByteSlice("&dst", "buf", "size * count")
+
+		g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+		g.emitKeepAlive("dst")
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, eltType)
+	g.emit("//go:nosplit\n")
+	g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, eltType)
+	g.inIndent(func() {
+		g.emit("count := len(src)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emitCastSliceToByteSlice("&src", "buf", "size * count")
+
+		g.emit("length, err := task.CopyOutBytes(addr, buf) // escapes: okay.\n")
+		g.emitKeepAlive("src")
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe%s is like %s.MarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName())
+	g.emit("func MarshalUnsafe%s(src []%s, dst []byte) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(src)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emitNoEscapeSliceDataPointer("&src", "val")
+
+		g.emit("length, err := safecopy.CopyIn(dst[:(size*count)], val)\n")
+		g.emitKeepAlive("src")
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe%s is like %s.UnmarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName())
+	g.emit("func UnmarshalUnsafe%s(dst []%s, src []byte) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(dst)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emitNoEscapeSliceDataPointer("&dst", "val")
+
+		g.emit("length, err := safecopy.CopyOut(val, src[:(size*count)])\n")
+		g.emitKeepAlive("dst")
+		g.emit("return length, err\n")
+	})
+	g.emit("}\n\n")
+}
diff --git a/tools/go_marshal/gomarshal/generator_interfaces_struct.go b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
new file mode 100644
index 000000000..9cd3c9579
--- /dev/null
+++ b/tools/go_marshal/gomarshal/generator_interfaces_struct.go
@@ -0,0 +1,618 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains the bits of the code generator specific to marshalling
+// structs.
+
+package gomarshal
+
+import (
+	"fmt"
+	"go/ast"
+	"strings"
+)
+
+func (g *interfaceGenerator) fieldAccessor(n *ast.Ident) string {
+	return fmt.Sprintf("%s.%s", g.r, n.Name)
+}
+
+// areFieldsPackedExpression returns a go expression checking whether g.t's fields are
+// packed. Returns "", false if g.t has no fields that may be potentially
+// packed, otherwise returns <clause>, true, where <clause> is an expression
+// like "t.a.Packed() && t.b.Packed() && t.c.Packed()".
+func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) {
+	if len(g.as) == 0 {
+		return "", false
+	}
+
+	cs := make([]string, 0, len(g.as))
+	for accessor, _ := range g.as {
+		cs = append(cs, fmt.Sprintf("%s.Packed()", accessor))
+	}
+	return strings.Join(cs, " && "), true
+}
+
+// validateStruct ensures the type we're working with can be marshalled. These
+// checks are done ahead of time and in one place so we can make assumptions
+// later.
+func (g *interfaceGenerator) validateStruct(ts *ast.TypeSpec, st *ast.StructType) {
+	forEachStructField(st, func(f *ast.Field) {
+		if len(f.Names) == 0 {
+			g.abortAt(f.Pos(), "Cannot marshal structs with embedded fields, give the field a name; use '_' for anonymous fields such as padding fields")
+		}
+	})
+
+	forEachStructField(st, func(f *ast.Field) {
+		fieldDispatcher{
+			primitive: func(_, t *ast.Ident) {
+				g.validatePrimitiveNewtype(t)
+			},
+			selector: func(_, _, _ *ast.Ident) {
+				// No validation to perform on selector fields. However this
+				// callback must still be provided.
+			},
+			array: func(n *ast.Ident, a *ast.ArrayType, _ *ast.Ident) {
+				g.validateArrayNewtype(n, a)
+			},
+			unhandled: func(_ *ast.Ident) {
+				g.abortAt(f.Pos(), fmt.Sprintf("Marshalling not supported for %s fields", kindString(f.Type)))
+			},
+		}.dispatch(f)
+	})
+}
+
+func (g *interfaceGenerator) isStructPacked(st *ast.StructType) bool {
+	packed := true
+	forEachStructField(st, func(f *ast.Field) {
+		if f.Tag != nil {
+			if f.Tag.Value == "`marshal:\"unaligned\"`" {
+				if packed {
+					debugfAt(g.f.Position(g.t.Pos()),
+						fmt.Sprintf("Marking type '%s' as not packed due to tag `marshal:\"unaligned\"`.\n", g.t.Name))
+					packed = false
+				}
+			}
+		}
+	})
+	return packed
+}
+
+func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) {
+	thisPacked := g.isStructPacked(st)
+
+	g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n")
+	g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		primitiveSize := 0
+		var dynamicSizeTerms []string
+
+		forEachStructField(st, fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if size, dynamic := g.scalarSize(t); !dynamic {
+					primitiveSize += size
+				} else {
+					g.recordUsedMarshallable(t.Name)
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", t.Name))
+				}
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				tName := fmt.Sprintf("%s.%s", tX.Name, tSel.Name)
+				g.recordUsedImport(tX.Name)
+				g.recordUsedMarshallable(tName)
+				dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", tName))
+			},
+			array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) {
+				lenExpr := g.arrayLenExpr(a)
+				if size, dynamic := g.scalarSize(t); !dynamic {
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("%d*%s", size, lenExpr))
+				} else {
+					g.recordUsedMarshallable(t.Name)
+					dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()*%s", t.Name, lenExpr))
+				}
+			},
+		}.dispatch)
+		g.emit("return %d", primitiveSize)
+		if len(dynamicSizeTerms) > 0 {
+			g.incIndent()
+		}
+		{
+			for _, d := range dynamicSizeTerms {
+				g.emitNoIndent(" +\n")
+				g.emit(d)
+			}
+		}
+		if len(dynamicSizeTerms) > 0 {
+			g.decIndent()
+		}
+	})
+	g.emit("\n}\n\n")
+
+	g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n")
+	g.emit("func (%s *%s) MarshalBytes(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		forEachStructField(st, fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", t.Name, t.Name)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("dst", len)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can reference here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("dst = dst[(*%s)(nil).SizeBytes():]\n", t.Name)
+					}
+					return
+				}
+				g.marshalScalar(g.fieldAccessor(n), t.Name, "dst")
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", tX.Name, tSel.Name)
+					g.emit("dst = dst[(*%s.%s)(nil).SizeBytes():]\n", tX.Name, tSel.Name)
+					return
+				}
+				g.marshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst")
+			},
+			array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) {
+				lenExpr := g.arrayLenExpr(a)
+				if n.Name == "_" {
+					g.emit("// Padding: dst[:sizeof(%s)*%s] ~= [%s]%s{0}\n", t.Name, lenExpr, lenExpr, t.Name)
+					if size, dynamic := g.scalarSize(t); !dynamic {
+						g.emit("dst = dst[%d*(%s):]\n", size, lenExpr)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can reference here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("dst = dst[(*%s)(nil).SizeBytes()*(%s):]\n", t.Name, lenExpr)
+					}
+					return
+				}
+
+				g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr)
+				g.inIndent(func() {
+					g.marshalScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "dst")
+				})
+				g.emit("}\n")
+			},
+		}.dispatch)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n")
+	g.emit("func (%s *%s) UnmarshalBytes(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		forEachStructField(st, fieldDispatcher{
+			primitive: func(n, t *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: var _ %s ~= src[:sizeof(%s)]\n", t.Name, t.Name)
+					if len, dynamic := g.scalarSize(t); !dynamic {
+						g.shift("src", len)
+					} else {
+						// We don't have an instance of the dynamic type we can
+						// reference here (since the version in this struct is
+						// anonymous). Use a typed nil pointer to call
+						// SizeBytes() instead.
+						g.shiftDynamic("src", fmt.Sprintf("(*%s)(nil)", t.Name))
+						g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s)(nil)", t.Name))
+					}
+					return
+				}
+				g.unmarshalScalar(g.fieldAccessor(n), t.Name, "src")
+			},
+			selector: func(n, tX, tSel *ast.Ident) {
+				if n.Name == "_" {
+					g.emit("// Padding: %s ~= src[:sizeof(%s.%s)]\n", g.fieldAccessor(n), tX.Name, tSel.Name)
+					g.emit("src = src[(*%s.%s)(nil).SizeBytes():]\n", tX.Name, tSel.Name)
+					g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s.%s)(nil)", tX.Name, tSel.Name))
+					return
+				}
+				g.unmarshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src")
+			},
+			array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) {
+				lenExpr := g.arrayLenExpr(a)
+				if n.Name == "_" {
+					g.emit("// Padding: ~ copy([%s]%s(%s), src[:sizeof(%s)*%s])\n", lenExpr, t.Name, g.fieldAccessor(n), t.Name, lenExpr)
+					if size, dynamic := g.scalarSize(t); !dynamic {
+						g.emit("src = src[%d*(%s):]\n", size, lenExpr)
+					} else {
+						// We can't use shiftDynamic here because we don't have
+						// an instance of the dynamic type we can referece here
+						// (since the version in this struct is anonymous). Use
+						// a typed nil pointer to call SizeBytes() instead.
+						g.emit("src = src[(*%s)(nil).SizeBytes()*(%s):]\n", t.Name, lenExpr)
+					}
+					return
+				}
+
+				g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr)
+				g.inIndent(func() {
+					g.unmarshalScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "src")
+				})
+				g.emit("}\n")
+			},
+		}.dispatch)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Packed implements marshal.Marshallable.Packed.\n")
+	g.emit("//go:nosplit\n")
+	g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		expr, fieldsMaybePacked := g.areFieldsPackedExpression()
+		switch {
+		case !thisPacked:
+			g.emit("return false\n")
+		case fieldsMaybePacked:
+			g.emit("return %s\n", expr)
+		default:
+			g.emit("return true\n")
+
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n")
+	g.emit("func (%s *%s) MarshalUnsafe(dst []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if thisPacked {
+			g.recordUsedImport("safecopy")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if %s {\n", cond)
+				g.inIndent(func() {
+					g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+				})
+				g.emit("} else {\n")
+				g.inIndent(func() {
+					g.emit("%s.MarshalBytes(dst)\n", g.r)
+				})
+				g.emit("}\n")
+			} else {
+				g.emit("safecopy.CopyIn(dst, unsafe.Pointer(%s))\n", g.r)
+			}
+		} else {
+			g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName())
+			g.emit("%s.MarshalBytes(dst)\n", g.r)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n")
+	g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		if thisPacked {
+			g.recordUsedImport("safecopy")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if %s {\n", cond)
+				g.inIndent(func() {
+					g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+				})
+				g.emit("} else {\n")
+				g.inIndent(func() {
+					g.emit("%s.UnmarshalBytes(src)\n", g.r)
+				})
+				g.emit("}\n")
+			} else {
+				g.emit("safecopy.CopyOut(unsafe.Pointer(%s), src)\n", g.r)
+			}
+		} else {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("%s.UnmarshalBytes(src)\n", g.r)
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n")
+	g.emit("//go:nosplit\n")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.emit("func (%s *%s) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r)
+			g.emit("%s.MarshalBytes(buf) // escapes: fallback.\n", g.r)
+			g.emit("return task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast serialization.
+			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+			g.emit("length, err := task.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n")
+			g.emitKeepAlive(g.r)
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n")
+	g.emit("//go:nosplit\n")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.emit("func (%s *%s) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		g.emit("return %s.CopyOutN(task, addr, %s.SizeBytes())\n", g.r, g.r)
+	})
+	g.emit("}\n\n")
+
+	g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n")
+	g.emit("//go:nosplit\n")
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+	g.emit("func (%s *%s) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r)
+			g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+			g.emit("// Unmarshal unconditionally. If we had a short copy-in, this results in a\n")
+			g.emit("// partially unmarshalled struct.\n")
+			g.emit("%s.UnmarshalBytes(buf) // escapes: fallback.\n", g.r)
+			g.emit("return length, err\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast deserialization.
+			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+			g.emit("length, err := task.CopyInBytes(addr, buf) // escapes: okay.\n")
+			g.emitKeepAlive(g.r)
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// WriteTo implements io.WriterTo.WriteTo.\n")
+	g.recordUsedImport("io")
+	g.emit("func (%s *%s) WriteTo(w io.Writer) (int64, error) {\n", g.r, g.typeName())
+	g.inIndent(func() {
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r)
+			g.emit("%s.MarshalBytes(buf)\n", g.r)
+			g.emit("length, err := w.Write(buf)\n")
+			g.emit("return int64(length), err\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if cond, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !%s {\n", cond)
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast serialization.
+			g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r))
+
+			g.emit("length, err := w.Write(buf)\n")
+			g.emitKeepAlive(g.r)
+			g.emit("return int64(length), err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+}
+
+func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType, slice *sliceAPI) {
+	thisPacked := g.isStructPacked(st)
+
+	if slice.inner {
+		abortAt(g.f.Position(slice.comment.Slash), fmt.Sprintf("The ':inner' argument to '+marshal slice:%s:inner' is only applicable to newtypes on primitives. Remove it from this struct declaration.", slice.ident))
+	}
+
+	g.recordUsedImport("marshal")
+	g.recordUsedImport("usermem")
+
+	g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, g.typeName())
+	g.emit("func Copy%sIn(task marshal.Task, addr usermem.Addr, dst []%s) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(dst)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(size * count)\n")
+			g.emit("length, err := task.CopyInBytes(addr, buf)\n\n")
+
+			g.emit("// Unmarshal as much as possible, even on error. First handle full objects.\n")
+			g.emit("limit := length/size\n")
+			g.emit("for idx := 0; idx < limit; idx++ {\n")
+			g.inIndent(func() {
+				g.emit("dst[idx].UnmarshalBytes(buf[size*idx:size*(idx+1)])\n")
+			})
+			g.emit("}\n\n")
+
+			g.emit("// Handle any final partial object.\n")
+			g.emit("if length < size*count && length%size != 0 {\n")
+			g.inIndent(func() {
+				g.emit("idx := limit\n")
+				g.emit("dst[idx].UnmarshalBytes(buf[size*idx:size*(idx+1)])\n")
+			})
+			g.emit("}\n\n")
+
+			g.emit("return length, err\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if _, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !dst[0].Packed() {\n")
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast deserialization.
+			g.emitCastSliceToByteSlice("&dst", "buf", "size * count")
+
+			g.emit("length, err := task.CopyInBytes(addr, buf)\n")
+			g.emitKeepAlive("dst")
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, g.typeName())
+	g.emit("func Copy%sOut(task marshal.Task, addr usermem.Addr, src []%s) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(src)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("buf := task.CopyScratchBuffer(size * count)\n")
+			g.emit("for idx := 0; idx < count; idx++ {\n")
+			g.inIndent(func() {
+				g.emit("src[idx].MarshalBytes(buf[size*idx:size*(idx+1)])\n")
+			})
+			g.emit("}\n")
+			g.emit("return task.CopyOutBytes(addr, buf)\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if _, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !src[0].Packed() {\n")
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			// Fast serialization.
+			g.emitCastSliceToByteSlice("&src", "buf", "size * count")
+
+			g.emit("length, err := task.CopyOutBytes(addr, buf)\n")
+			g.emitKeepAlive("src")
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// MarshalUnsafe%s is like %s.MarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName())
+	g.emit("func MarshalUnsafe%s(src []%s, dst []byte) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(src)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName())
+			g.emit("for idx := 0; idx < count; idx++ {\n")
+			g.inIndent(func() {
+				g.emit("src[idx].MarshalBytes(dst[size*idx:(size)*(idx+1)])\n")
+			})
+			g.emit("}\n")
+			g.emit("return size * count, nil\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if _, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !src[0].Packed() {\n")
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			g.emitNoEscapeSliceDataPointer("&src", "val")
+
+			g.emit("length, err := safecopy.CopyIn(dst[:(size*count)], val)\n")
+			g.emitKeepAlive("src")
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+
+	g.emit("// UnmarshalUnsafe%s is like %s.UnmarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName())
+	g.emit("func UnmarshalUnsafe%s(dst []%s, src []byte) (int, error) {\n", slice.ident, g.typeName())
+	g.inIndent(func() {
+		g.emit("count := len(dst)\n")
+		g.emit("if count == 0 {\n")
+		g.inIndent(func() {
+			g.emit("return 0, nil\n")
+		})
+		g.emit("}\n")
+		g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		fallback := func() {
+			g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName())
+			g.emit("for idx := 0; idx < count; idx++ {\n")
+			g.inIndent(func() {
+				g.emit("dst[idx].UnmarshalBytes(src[size*idx:size*(idx+1)])\n")
+			})
+			g.emit("}\n")
+			g.emit("return size * count, nil\n")
+		}
+		if thisPacked {
+			g.recordUsedImport("reflect")
+			g.recordUsedImport("runtime")
+			g.recordUsedImport("unsafe")
+			if _, ok := g.areFieldsPackedExpression(); ok {
+				g.emit("if !dst[0].Packed() {\n")
+				g.inIndent(fallback)
+				g.emit("}\n\n")
+			}
+			g.emitNoEscapeSliceDataPointer("&dst", "val")
+
+			g.emit("length, err := safecopy.CopyOut(val, src[:(size*count)])\n")
+			g.emitKeepAlive("dst")
+			g.emit("return length, err\n")
+		} else {
+			fallback()
+		}
+	})
+	g.emit("}\n\n")
+}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index cc760b6d0..631295373 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -22,6 +22,7 @@ import (
 )
 
 var standardImports = []string{
+	"bytes",
 	"fmt",
 	"reflect",
 	"testing",
@@ -29,6 +30,11 @@ var standardImports = []string{
 	"gvisor.dev/gvisor/tools/go_marshal/analysis",
 }
 
+var sliceAPIImports = []string{
+	"encoding/binary",
+	"gvisor.dev/gvisor/pkg/usermem",
+}
+
 type testGenerator struct {
 	sourceBuffer
 
@@ -48,9 +54,6 @@ type testGenerator struct {
 }
 
 func newTestGenerator(t *ast.TypeSpec) *testGenerator {
-	if _, ok := t.Type.(*ast.StructType); !ok {
-		panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t))
-	}
 	g := &testGenerator{
 		t:       t,
 		r:       receiverName(t),
@@ -60,6 +63,11 @@ func newTestGenerator(t *ast.TypeSpec) *testGenerator {
 	for _, i := range standardImports {
 		g.imports.add(i).markUsed()
 	}
+	// These imports are used if a type requests the slice API. Don't
+	// mark them as used by default.
+	for _, i := range sliceAPIImports {
+		g.imports.add(i)
+	}
 
 	return g
 }
@@ -68,14 +76,6 @@ func (g *testGenerator) typeName() string {
 	return g.t.Name.Name
 }
 
-func (g *testGenerator) forEachField(fn func(f *ast.Field)) {
-	// This is guaranteed to succeed because g.t is always a struct.
-	st := g.t.Type.(*ast.StructType)
-	for _, field := range st.Fields.List {
-		fn(field)
-	}
-}
-
 func (g *testGenerator) testFuncName(base string) string {
 	return fmt.Sprintf("%s%s", base, strings.Title(g.t.Name.Name))
 }
@@ -88,10 +88,10 @@ func (g *testGenerator) inTestFunction(name string, body func()) {
 
 func (g *testGenerator) emitTestNonZeroSize() {
 	g.inTestFunction("TestSizeNonZero", func() {
-		g.emit("x := &%s{}\n", g.typeName())
+		g.emit("var x %v\n", g.typeName())
 		g.emit("if x.SizeBytes() == 0 {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(\"Marshallable.Size() should not return zero\")\n")
+			g.emit("t.Fatal(\"Marshallable.SizeBytes() should not return zero\")\n")
 		})
 		g.emit("}\n")
 	})
@@ -99,7 +99,7 @@ func (g *testGenerator) emitTestNonZeroSize() {
 
 func (g *testGenerator) emitTestSuspectAlignment() {
 	g.inTestFunction("TestSuspectAlignment", func() {
-		g.emit("x := %s{}\n", g.typeName())
+		g.emit("var x %v\n", g.typeName())
 		g.emit("analysis.AlignmentCheck(t, reflect.TypeOf(x))\n")
 	})
 }
@@ -117,35 +117,115 @@ func (g *testGenerator) emitTestMarshalUnmarshalPreservesData() {
 		g.emit("y.UnmarshalBytes(buf)\n")
 		g.emit("if !reflect.DeepEqual(x, y) {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across Marshal/Unmarshal cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, y))\n")
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalBytes/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n")
 		})
 		g.emit("}\n")
 		g.emit("yUnsafe.UnmarshalBytes(bufUnsafe)\n")
 		g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/Unmarshal cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, yUnsafe))\n")
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n")
 		})
 		g.emit("}\n\n")
 
 		g.emit("z.UnmarshalUnsafe(buf)\n")
 		g.emit("if !reflect.DeepEqual(x, z) {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across Marshal/UnmarshalUnsafe cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, z))\n")
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalBytes/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, z))\n")
 		})
 		g.emit("}\n")
 		g.emit("zUnsafe.UnmarshalUnsafe(bufUnsafe)\n")
 		g.emit("if !reflect.DeepEqual(x, zUnsafe) {\n")
 		g.inIndent(func() {
-			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalUnsafe cycle:\\nBefore: %%+v\\nAfter: %%+v\\n\", x, zUnsafe))\n")
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, zUnsafe))\n")
 		})
 		g.emit("}\n")
 	})
 }
 
-func (g *testGenerator) emitTests() {
+func (g *testGenerator) emitTestMarshalUnmarshalSlicePreservesData(slice *sliceAPI) {
+	for _, name := range []string{"binary", "usermem"} {
+		if !g.imports.markUsed(name) {
+			panic(fmt.Sprintf("Generated test for '%s' referenced a non-existent import with local name '%s'", g.typeName(), name))
+		}
+	}
+
+	g.inTestFunction("TestSafeMarshalUnmarshalSlicePreservesData", func() {
+		g.emit("var x, y, yUnsafe [8]%s\n", g.typeName())
+		g.emit("analysis.RandomizeValue(&x)\n\n")
+		g.emit("size := (*%s)(nil).SizeBytes() * len(x)\n", g.typeName())
+		g.emit("buf := bytes.NewBuffer(make([]byte, size))\n")
+		g.emit("buf.Reset()\n")
+		g.emit("if err := binary.Write(buf, usermem.ByteOrder, x[:]); err != nil {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"binary.Write failed: %v\", err))\n")
+		})
+		g.emit("}\n")
+		g.emit("bufUnsafe := make([]byte, size)\n")
+		g.emit("MarshalUnsafe%s(x[:], bufUnsafe)\n\n", slice.ident)
+
+		g.emit("UnmarshalUnsafe%s(y[:], buf.Bytes())\n", slice.ident)
+		g.emit("if !reflect.DeepEqual(x, y) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across binary.Write/UnmarshalUnsafeSlice cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n")
+		})
+		g.emit("}\n")
+		g.emit("UnmarshalUnsafe%s(yUnsafe[:], bufUnsafe)\n", slice.ident)
+		g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafeSlice/UnmarshalUnsafeSlice cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n")
+		})
+		g.emit("}\n\n")
+	})
+}
+
+func (g *testGenerator) emitTestWriteToUnmarshalPreservesData() {
+	g.inTestFunction("TestWriteToUnmarshalPreservesData", func() {
+		g.emit("var x, y, yUnsafe %s\n", g.typeName())
+		g.emit("analysis.RandomizeValue(&x)\n\n")
+
+		g.emit("var buf bytes.Buffer\n\n")
+
+		g.emit("x.WriteTo(&buf)\n")
+		g.emit("y.UnmarshalBytes(buf.Bytes())\n\n")
+		g.emit("yUnsafe.UnmarshalUnsafe(buf.Bytes())\n\n")
+
+		g.emit("if !reflect.DeepEqual(x, y) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across WriteTo/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n")
+		})
+		g.emit("}\n")
+		g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across WriteTo/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n")
+		})
+		g.emit("}\n")
+	})
+}
+
+func (g *testGenerator) emitTestSizeBytesOnTypedNilPtr() {
+	g.inTestFunction("TestSizeBytesOnTypedNilPtr", func() {
+		g.emit("var x %s\n", g.typeName())
+		g.emit("sizeFromConcrete := x.SizeBytes()\n")
+		g.emit("sizeFromTypedNilPtr := (*%s)(nil).SizeBytes()\n\n", g.typeName())
+
+		g.emit("if sizeFromTypedNilPtr != sizeFromConcrete {\n")
+		g.inIndent(func() {
+			g.emit("t.Fatalf(\"SizeBytes() on typed nil pointer (%v) doesn't match size returned by a concrete object (%v).\\n\", sizeFromTypedNilPtr, sizeFromConcrete)\n")
+		})
+		g.emit("}\n")
+	})
+}
+
+func (g *testGenerator) emitTests(slice *sliceAPI) {
 	g.emitTestNonZeroSize()
 	g.emitTestSuspectAlignment()
 	g.emitTestMarshalUnmarshalPreservesData()
+	g.emitTestWriteToUnmarshalPreservesData()
+	g.emitTestSizeBytesOnTypedNilPtr()
+
+	if slice != nil {
+		g.emitTestMarshalUnmarshalSlicePreservesData(slice)
+	}
 }
 
 func (g *testGenerator) write(out io.Writer) error {
diff --git a/tools/go_marshal/gomarshal/util.go b/tools/go_marshal/gomarshal/util.go
index 3d86935b4..d94314302 100644
--- a/tools/go_marshal/gomarshal/util.go
+++ b/tools/go_marshal/gomarshal/util.go
@@ -25,7 +25,6 @@ import (
 	"path"
 	"reflect"
 	"sort"
-	"strconv"
 	"strings"
 )
 
@@ -64,12 +63,18 @@ func kindString(e ast.Expr) string {
 	}
 }
 
+func forEachStructField(st *ast.StructType, fn func(f *ast.Field)) {
+	for _, field := range st.Fields.List {
+		fn(field)
+	}
+}
+
 // fieldDispatcher is a collection of callbacks for handling different types of
 // fields in a struct declaration.
 type fieldDispatcher struct {
 	primitive func(n, t *ast.Ident)
 	selector  func(n, tX, tSel *ast.Ident)
-	array     func(n, t *ast.Ident, size int)
+	array     func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident)
 	unhandled func(n *ast.Ident)
 }
 
@@ -96,22 +101,12 @@ func (fd fieldDispatcher) dispatch(f *ast.Field) {
 		case *ast.SelectorExpr:
 			fd.selector(name, v.X.(*ast.Ident), v.Sel)
 		case *ast.ArrayType:
-			len := 0
-			if v.Len != nil {
-				// Non-literal array length is handled by generatorInterfaces.validate().
-				if lenLit, ok := v.Len.(*ast.BasicLit); ok {
-					var err error
-					len, err = strconv.Atoi(lenLit.Value)
-					if err != nil {
-						panic(err)
-					}
-				}
-			}
 			switch t := v.Elt.(type) {
 			case *ast.Ident:
-				fd.array(name, t, len)
+				fd.array(name, v, t)
 			default:
-				fd.array(name, nil, len)
+				// Should be handled with a better error message during validate.
+				panic(fmt.Sprintf("Array element type is of unsupported kind. Expected *ast.Ident, got %v", t))
 			}
 		default:
 			fd.unhandled(name)
@@ -270,6 +265,11 @@ type importStmt struct {
 	aliased bool
 	// Indicates whether this import was referenced by generated code.
 	used bool
+	// AST node and file set representing the import statement, if any. These
+	// are only non-nil if the import statement originates from an input source
+	// file.
+	spec *ast.ImportSpec
+	fset *token.FileSet
 }
 
 func newImport(p string) *importStmt {
@@ -295,14 +295,27 @@ func newImportFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt {
 		name:    name,
 		path:    p,
 		aliased: spec.Name != nil,
+		spec:    spec,
+		fset:    f,
 	}
 }
 
+// String implements fmt.Stringer.String. This generates a string for the import
+// statement appropriate for writing directly to generated code.
 func (i *importStmt) String() string {
 	if i.aliased {
-		return fmt.Sprintf("%s \"%s\"", i.name, i.path)
+		return fmt.Sprintf("%s %q", i.name, i.path)
+	}
+	return fmt.Sprintf("%q", i.path)
+}
+
+// debugString returns a debug string representing an import statement. This
+// representation is not valid golang code and is used for debugging output.
+func (i *importStmt) debugString() string {
+	if i.spec != nil && i.fset != nil {
+		return fmt.Sprintf("%s: %s", i.fset.Position(i.spec.Path.Pos()), i)
 	}
-	return fmt.Sprintf("\"%s\"", i.path)
+	return fmt.Sprintf("(go-marshal import): %s", i)
 }
 
 func (i *importStmt) markUsed() {
@@ -310,58 +323,111 @@ func (i *importStmt) markUsed() {
 }
 
 func (i *importStmt) equivalent(other *importStmt) bool {
-	return i == other
+	return i.name == other.name && i.path == other.path && i.aliased == other.aliased
 }
 
 // importTable represents a collection of importStmts.
+//
+// An importTable may contain multiple import statements referencing the same
+// local name. All import statements aliasing to the same local name are
+// technically ambiguous, as if such an import name is used in the generated
+// code, it's not clear which import statement it refers to. We ignore any
+// potential collisions until actually writing the import table to the generated
+// source file. See importTable.write.
+//
+// Given the following import statements across all the files comprising a
+// package marshalled:
+//
+// "sync"
+// "pkg/sync"
+// "pkg/sentry/kernel"
+// ktime "pkg/sentry/kernel/time"
+//
+// An importTable representing them would look like this:
+//
+// importTable {
+//     is: map[string][]*importStmt {
+//         "sync": []*importStmt{
+//             importStmt{name:"sync", path:"sync", aliased:false}
+//             importStmt{name:"sync", path:"pkg/sync", aliased:false}
+//         },
+//         "kernel": []*importStmt{importStmt{
+//            name: "kernel",
+//            path: "pkg/sentry/kernel",
+//            aliased: false
+//         }},
+//         "ktime": []*importStmt{importStmt{
+//             name: "ktime",
+//             path: "pkg/sentry/kernel/time",
+//             aliased: true,
+//         }},
+//     }
+// }
+//
+// Note that the local name "sync" is assigned to two different import
+// statements. This is possible if the import statements are from different
+// source files in the same package.
+//
+// Since go-marshal generates a single output file per package regardless of the
+// number of input files, if "sync" is referenced by any generated code, it's
+// unclear which import statement "sync" refers to. While it's theoretically
+// possible to resolve this by assigning a unique local alias to each instance
+// of the sync package, go-marshal currently aborts when it encounters such an
+// ambiguity.
+//
+// TODO(b/151478251): importTable considers the final component of an import
+// path to be the package name, but this is only a convention. The actual
+// package name is determined by the package statement in the source files for
+// the package.
 type importTable struct {
 	// Map of imports and whether they should be copied to the output.
-	is map[string]*importStmt
+	is map[string][]*importStmt
 }
 
 func newImportTable() *importTable {
 	return &importTable{
-		is: make(map[string]*importStmt),
+		is: make(map[string][]*importStmt),
 	}
 }
 
-// Merges import statements from other into i. Collisions in import statements
-// result in a panic.
+// Merges import statements from other into i.
 func (i *importTable) merge(other *importTable) {
-	for name, im := range other.is {
-		if dup, ok := i.is[name]; ok && dup.equivalent(im) {
-			panic(fmt.Sprintf("Found colliding import statements: ours: %+v, other's: %+v", dup, im))
-		}
-
-		i.is[name] = im
+	for name, ims := range other.is {
+		i.is[name] = append(i.is[name], ims...)
 	}
 }
 
+func (i *importTable) addStmt(s *importStmt) *importStmt {
+	i.is[s.name] = append(i.is[s.name], s)
+	return s
+}
+
 func (i *importTable) add(s string) *importStmt {
 	n := newImport(s)
-	i.is[n.name] = n
-	return n
+	return i.addStmt(n)
 }
 
 func (i *importTable) addFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt {
-	n := newImportFromSpec(spec, f)
-	i.is[n.name] = n
-	return n
+	return i.addStmt(newImportFromSpec(spec, f))
 }
 
 // Marks the import named n as used. If no such import is in the table, returns
 // false.
 func (i *importTable) markUsed(n string) bool {
-	if n, ok := i.is[n]; ok {
-		n.markUsed()
+	if ns, ok := i.is[n]; ok {
+		for _, n := range ns {
+			n.markUsed()
+		}
 		return true
 	}
 	return false
 }
 
 func (i *importTable) clear() {
-	for _, i := range i.is {
-		i.used = false
+	for _, is := range i.is {
+		for _, i := range is {
+			i.used = false
+		}
 	}
 }
 
@@ -372,9 +438,42 @@ func (i *importTable) write(out io.Writer) error {
 	}
 
 	imports := make([]string, 0, len(i.is))
-	for _, i := range i.is {
-		if i.used {
-			imports = append(imports, i.String())
+	for name, is := range i.is {
+		var lastUsed *importStmt
+		var ambiguous bool
+
+		for _, i := range is {
+			if i.used {
+				if lastUsed != nil {
+					if !i.equivalent(lastUsed) {
+						ambiguous = true
+					}
+				}
+				lastUsed = i
+			}
+		}
+
+		if ambiguous {
+			// We have two or more import statements across the different source
+			// files that share a local name, and at least one of these imports
+			// are used by the generated code. This ambiguity can't be resolved
+			// by go-marshal and requires the user intervention. Dump a list of
+			// the colliding import statements and let the user modify the input
+			// files as appropriate.
+			var b strings.Builder
+			fmt.Fprintf(&b, "The imported name %q is used by one of the types marked for marshalling, and which import statement the code refers to is ambiguous. Perhaps give the imports unique local names?\n\n", name)
+			fmt.Fprintf(&b, "The following %d import statements are ambiguous for the local name %q:\n", len(is), name)
+			// Note: len(is) is guaranteed to be 1 or greater or ambiguous can't
+			// be true. Therefore the slicing below is safe.
+			for _, i := range is[:len(is)-1] {
+				fmt.Fprintf(&b, "  %v\n", i.debugString())
+			}
+			fmt.Fprintf(&b, "  %v", is[len(is)-1].debugString())
+			panic(b.String())
+		}
+
+		if lastUsed != nil {
+			imports = append(imports, lastUsed.String())
 		}
 	}
 	sort.Strings(imports)
diff --git a/tools/go_marshal/marshal/marshal.go b/tools/go_marshal/marshal/marshal.go
index 10614ec4d..cb2166252 100644
--- a/tools/go_marshal/marshal/marshal.go
+++ b/tools/go_marshal/marshal/marshal.go
@@ -21,6 +21,8 @@
 package marshal
 
 import (
+	"io"
+
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -40,18 +42,34 @@ type Task interface {
 	CopyInBytes(addr usermem.Addr, b []byte) (int, error)
 }
 
-// Marshallable represents a type that can be marshalled to and from memory.
+// Marshallable represents operations on a type that can be marshalled to and
+// from memory.
+//
+// go-marshal automatically generates implementations for this interface for
+// types marked as '+marshal'.
 type Marshallable interface {
+	io.WriterTo
+
 	// SizeBytes is the size of the memory representation of a type in
 	// marshalled form.
+	//
+	// SizeBytes must handle a nil receiver. Practically, this means SizeBytes
+	// cannot deference any fields on the object implementing it (but will
+	// likely make use of the type of these fields).
 	SizeBytes() int
 
-	// MarshalBytes serializes a copy of a type to dst. dst must be at least
-	// SizeBytes() long.
+	// MarshalBytes serializes a copy of a type to dst. dst may be smaller than
+	// SizeBytes(), which results in a part of the struct being marshalled. Note
+	// that this may have unexpected results for non-packed types, as implicit
+	// padding needs to be taken into account when reasoning about how much of
+	// the type is serialized.
 	MarshalBytes(dst []byte)
 
-	// UnmarshalBytes deserializes a type from src. src must be at least
-	// SizeBytes() long.
+	// UnmarshalBytes deserializes a type from src. src may be smaller than
+	// SizeBytes(), which results in a partially deserialized struct. Note that
+	// this may have unexpected results for non-packed types, as implicit
+	// padding needs to be taken into account when reasoning about how much of
+	// the type is deserialized.
 	UnmarshalBytes(src []byte)
 
 	// Packed returns true if the marshalled size of the type is the same as the
@@ -59,13 +77,20 @@ type Marshallable interface {
 	// starting at unaligned addresses (should always be true by default for ABI
 	// structs, verified by automatically generated tests when using
 	// go_marshal), and has no fields marked `marshal:"unaligned"`.
+	//
+	// Packed must return the same result for all possible values of the type
+	// implementing it. Violating this constraint implies the type doesn't have
+	// a static memory layout, and will lead to memory corruption.
+	// Go-marshal-generated code reuses the result of Packed for multiple values
+	// of the same type.
 	Packed() bool
 
 	// MarshalUnsafe serializes a type by bulk copying its in-memory
 	// representation to the dst buffer. This is only safe to do when the type
 	// has no implicit padding, see Marshallable.Packed. When Packed would
 	// return false, MarshalUnsafe should fall back to the safer but slower
-	// MarshalBytes.
+	// MarshalBytes. dst may be smaller than SizeBytes(), see comment for
+	// MarshalBytes for implications.
 	MarshalUnsafe(dst []byte)
 
 	// UnmarshalUnsafe deserializes a type by directly copying to the underlying
@@ -74,7 +99,8 @@ type Marshallable interface {
 	// This allows much faster unmarshalling of types which have no implicit
 	// padding, see Marshallable.Packed. When Packed would return false,
 	// UnmarshalUnsafe should fall back to the safer but slower unmarshal
-	// mechanism implemented in UnmarshalBytes.
+	// mechanism implemented in UnmarshalBytes. src may be smaller than
+	// SizeBytes(), see comment for UnmarshalBytes for implications.
 	UnmarshalUnsafe(src []byte)
 
 	// CopyIn deserializes a Marshallable type from a task's memory. This may
@@ -83,6 +109,10 @@ type Marshallable interface {
 	// marshalled does not escape. The implementation should avoid creating
 	// extra copies in memory by directly deserializing to the object's
 	// underlying memory.
+	//
+	// If the copy-in from the task memory is only partially successful, CopyIn
+	// should still attempt to deserialize as much data as possible. See comment
+	// for UnmarshalBytes.
 	CopyIn(task Task, addr usermem.Addr) (int, error)
 
 	// CopyOut serializes a Marshallable type to a task's memory. This may only
@@ -90,5 +120,68 @@ type Marshallable interface {
 	// MarshalUnsafe on Marshallable.Packed types, as the type being serialized
 	// does not escape. The implementation should avoid creating extra copies in
 	// memory by directly serializing from the object's underlying memory.
+	//
+	// The copy-out to the task memory may be partially successful, in which
+	// case CopyOut returns how much data was serialized. See comment for
+	// MarshalBytes for implications.
 	CopyOut(task Task, addr usermem.Addr) (int, error)
+
+	// CopyOutN is like CopyOut, but explicitly requests a partial
+	// copy-out. Note that this may yield unexpected results for non-packed
+	// types and the caller may only want to allow this for packed types. See
+	// comment on MarshalBytes.
+	//
+	// The limit must be less than or equal to SizeBytes().
+	CopyOutN(task Task, addr usermem.Addr, limit int) (int, error)
 }
+
+// go-marshal generates additional functions for a type based on additional
+// clauses to the +marshal directive. They are documented below.
+//
+// Slice API
+// =========
+//
+// Adding a "slice" clause to the +marshal directive for structs or newtypes on
+// primitives like this:
+//
+// // +marshal slice:FooSlice
+// type Foo struct { ... }
+//
+// Generates four additional functions for marshalling slices of Foos like this:
+//
+// // MarshalUnsafeFooSlice is like Foo.MarshalUnsafe, buf for a []Foo. It's
+// // more efficient that repeatedly calling calling Foo.MarshalUnsafe over a
+// // []Foo in a loop.
+// func MarshalUnsafeFooSlice(src []Foo, dst []byte) (int, error) { ... }
+//
+// // UnmarshalUnsafeFooSlice is like Foo.UnmarshalUnsafe, buf for a []Foo. It's
+// // more efficient that repeatedly calling calling Foo.UnmarshalUnsafe over a
+// // []Foo in a loop.
+// func UnmarshalUnsafeFooSlice(dst []Foo, src []byte) (int, error) { ... }
+//
+// // CopyFooSliceIn copies in a slice of Foo objects from the task's memory.
+// func CopyFooSliceIn(task marshal.Task, addr usermem.Addr, dst []Foo) (int, error) { ... }
+//
+// // CopyFooSliceIn copies out a slice of Foo objects to the task's memory.
+// func CopyFooSliceOut(task marshal.Task, addr usermem.Addr, src []Foo) (int, error) { ... }
+//
+// The name of the functions are of the format "Copy%sIn" and "Copy%sOut", where
+// %s is the first argument to the slice clause. This directive is not supported
+// for newtypes on arrays.
+//
+// The slice clause also takes an optional second argument, which must be the
+// value "inner":
+//
+// // +marshal slice:Int32Slice:inner
+// type Int32 int32
+//
+// This is only valid on newtypes on primitives, and causes the generated
+// functions to accept slices of the inner type instead:
+//
+// func CopyInt32SliceIn(task marshal.Task, addr usermem.Addr, dst []int32) (int, error) { ... }
+//
+// Without "inner", they would instead be:
+//
+// func CopyInt32SliceIn(task marshal.Task, addr usermem.Addr, dst []Int32) (int, error) { ... }
+//
+// This may help avoid a cast depending on how the generated functions are used.
diff --git a/tools/go_marshal/primitive/BUILD b/tools/go_marshal/primitive/BUILD
new file mode 100644
index 000000000..cc08ba63a
--- /dev/null
+++ b/tools/go_marshal/primitive/BUILD
@@ -0,0 +1,18 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "primitive",
+    srcs = [
+        "primitive.go",
+    ],
+    marshal = True,
+    visibility = [
+        "//:sandbox",
+    ],
+    deps = [
+        "//pkg/usermem",
+        "//tools/go_marshal/marshal",
+    ],
+)
diff --git a/tools/go_marshal/primitive/primitive.go b/tools/go_marshal/primitive/primitive.go
new file mode 100644
index 000000000..ebcf130ae
--- /dev/null
+++ b/tools/go_marshal/primitive/primitive.go
@@ -0,0 +1,175 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package primitive defines marshal.Marshallable implementations for primitive
+// types.
+package primitive
+
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// Int16 is a marshal.Marshallable implementation for int16.
+//
+// +marshal slice:Int16Slice:inner
+type Int16 int16
+
+// Uint16 is a marshal.Marshallable implementation for uint16.
+//
+// +marshal slice:Uint16Slice:inner
+type Uint16 uint16
+
+// Int32 is a marshal.Marshallable implementation for int32.
+//
+// +marshal slice:Int32Slice:inner
+type Int32 int32
+
+// Uint32 is a marshal.Marshallable implementation for uint32.
+//
+// +marshal slice:Uint32Slice:inner
+type Uint32 uint32
+
+// Int64 is a marshal.Marshallable implementation for int64.
+//
+// +marshal slice:Int64Slice:inner
+type Int64 int64
+
+// Uint64 is a marshal.Marshallable implementation for uint64.
+//
+// +marshal slice:Uint64Slice:inner
+type Uint64 uint64
+
+// Below, we define some convenience functions for marshalling primitive types
+// using the newtypes above, without requiring superfluous casts.
+
+// 16-bit integers
+
+// CopyInt16In is a convenient wrapper for copying in an int16 from the task's
+// memory.
+func CopyInt16In(task marshal.Task, addr usermem.Addr, dst *int16) (int, error) {
+	var buf Int16
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = int16(buf)
+	return n, nil
+}
+
+// CopyInt16Out is a convenient wrapper for copying out an int16 to the task's
+// memory.
+func CopyInt16Out(task marshal.Task, addr usermem.Addr, src int16) (int, error) {
+	srcP := Int16(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// CopyUint16In is a convenient wrapper for copying in a uint16 from the task's
+// memory.
+func CopyUint16In(task marshal.Task, addr usermem.Addr, dst *uint16) (int, error) {
+	var buf Uint16
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = uint16(buf)
+	return n, nil
+}
+
+// CopyUint16Out is a convenient wrapper for copying out a uint16 to the task's
+// memory.
+func CopyUint16Out(task marshal.Task, addr usermem.Addr, src uint16) (int, error) {
+	srcP := Uint16(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// 32-bit integers
+
+// CopyInt32In is a convenient wrapper for copying in an int32 from the task's
+// memory.
+func CopyInt32In(task marshal.Task, addr usermem.Addr, dst *int32) (int, error) {
+	var buf Int32
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = int32(buf)
+	return n, nil
+}
+
+// CopyInt32Out is a convenient wrapper for copying out an int32 to the task's
+// memory.
+func CopyInt32Out(task marshal.Task, addr usermem.Addr, src int32) (int, error) {
+	srcP := Int32(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// CopyUint32In is a convenient wrapper for copying in a uint32 from the task's
+// memory.
+func CopyUint32In(task marshal.Task, addr usermem.Addr, dst *uint32) (int, error) {
+	var buf Uint32
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = uint32(buf)
+	return n, nil
+}
+
+// CopyUint32Out is a convenient wrapper for copying out a uint32 to the task's
+// memory.
+func CopyUint32Out(task marshal.Task, addr usermem.Addr, src uint32) (int, error) {
+	srcP := Uint32(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// 64-bit integers
+
+// CopyInt64In is a convenient wrapper for copying in an int64 from the task's
+// memory.
+func CopyInt64In(task marshal.Task, addr usermem.Addr, dst *int64) (int, error) {
+	var buf Int64
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = int64(buf)
+	return n, nil
+}
+
+// CopyInt64Out is a convenient wrapper for copying out an int64 to the task's
+// memory.
+func CopyInt64Out(task marshal.Task, addr usermem.Addr, src int64) (int, error) {
+	srcP := Int64(src)
+	return srcP.CopyOut(task, addr)
+}
+
+// CopyUint64In is a convenient wrapper for copying in a uint64 from the task's
+// memory.
+func CopyUint64In(task marshal.Task, addr usermem.Addr, dst *uint64) (int, error) {
+	var buf Uint64
+	n, err := buf.CopyIn(task, addr)
+	if err != nil {
+		return n, err
+	}
+	*dst = uint64(buf)
+	return n, nil
+}
+
+// CopyUint64Out is a convenient wrapper for copying out a uint64 to the task's
+// memory.
+func CopyUint64Out(task marshal.Task, addr usermem.Addr, src uint64) (int, error) {
+	srcP := Uint64(src)
+	return srcP.CopyOut(task, addr)
+}
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index e345e3a8e..2fbcc8a03 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -25,5 +25,20 @@ go_library(
     testonly = 1,
     srcs = ["test.go"],
     marshal = True,
+    visibility = ["//tools/go_marshal/test:__subpackages__"],
     deps = ["//tools/go_marshal/test/external"],
 )
+
+go_test(
+    name = "marshal_test",
+    size = "small",
+    srcs = ["marshal_test.go"],
+    deps = [
+        ":test",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//tools/go_marshal/analysis",
+        "//tools/go_marshal/marshal",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/tools/go_marshal/test/benchmark_test.go b/tools/go_marshal/test/benchmark_test.go
index e12403741..224d308c7 100644
--- a/tools/go_marshal/test/benchmark_test.go
+++ b/tools/go_marshal/test/benchmark_test.go
@@ -24,7 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/tools/go_marshal/analysis"
-	test "gvisor.dev/gvisor/tools/go_marshal/test"
+	"gvisor.dev/gvisor/tools/go_marshal/test"
 )
 
 // Marshalling using the standard encoding/binary package.
@@ -176,3 +176,45 @@ func BenchmarkGoMarshalUnsafe(b *testing.B) {
 		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
 	}
 }
+
+func BenchmarkBinarySlice(b *testing.B) {
+	var s1, s2 [64]test.Stat
+	analysis.RandomizeValue(&s1)
+
+	size := binary.Size(s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := make([]byte, 0, size)
+		buf = binary.Marshal(buf, usermem.ByteOrder, &s1)
+		binary.Unmarshal(buf, usermem.ByteOrder, &s2)
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
+
+func BenchmarkGoMarshalUnsafeSlice(b *testing.B) {
+	var s1, s2 [64]test.Stat
+	analysis.RandomizeValue(&s1)
+
+	b.ResetTimer()
+
+	for n := 0; n < b.N; n++ {
+		buf := make([]byte, (*test.Stat)(nil).SizeBytes()*len(s1))
+		test.MarshalUnsafeStatSlice(s1[:], buf)
+		test.UnmarshalUnsafeStatSlice(s2[:], buf)
+	}
+
+	b.StopTimer()
+
+	// Sanity check, make sure the values were preserved.
+	if !reflect.DeepEqual(s1, s2) {
+		panic(fmt.Sprintf("Data corruption across marshal/unmarshal cycle:\nBefore: %+v\nAfter: %+v\n", s1, s2))
+	}
+}
diff --git a/tools/go_marshal/test/escape/BUILD b/tools/go_marshal/test/escape/BUILD
new file mode 100644
index 000000000..f74e6ffae
--- /dev/null
+++ b/tools/go_marshal/test/escape/BUILD
@@ -0,0 +1,14 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "escape",
+    testonly = 1,
+    srcs = ["escape.go"],
+    deps = [
+        "//pkg/usermem",
+        "//tools/go_marshal/marshal",
+        "//tools/go_marshal/test",
+    ],
+)
diff --git a/tools/go_marshal/test/escape/escape.go b/tools/go_marshal/test/escape/escape.go
new file mode 100644
index 000000000..6a46ddbf8
--- /dev/null
+++ b/tools/go_marshal/test/escape/escape.go
@@ -0,0 +1,95 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package escape
+
+import (
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/test"
+)
+
+// dummyTask implements marshal.Task.
+type dummyTask struct {
+}
+
+func (*dummyTask) CopyScratchBuffer(size int) []byte {
+	return make([]byte, size)
+}
+
+func (*dummyTask) CopyOutBytes(addr usermem.Addr, b []byte) (int, error) {
+	return len(b), nil
+}
+
+func (*dummyTask) CopyInBytes(addr usermem.Addr, b []byte) (int, error) {
+	return len(b), nil
+}
+
+func (t *dummyTask) MarshalBytes(addr usermem.Addr, marshallable marshal.Marshallable) {
+	buf := t.CopyScratchBuffer(marshallable.SizeBytes())
+	marshallable.MarshalBytes(buf)
+	t.CopyOutBytes(addr, buf)
+}
+
+func (t *dummyTask) MarshalUnsafe(addr usermem.Addr, marshallable marshal.Marshallable) {
+	buf := t.CopyScratchBuffer(marshallable.SizeBytes())
+	marshallable.MarshalUnsafe(buf)
+	t.CopyOutBytes(addr, buf)
+}
+
+// +checkescape:all
+//go:nosplit
+func doCopyIn(t *dummyTask) {
+	var stat test.Stat
+	stat.CopyIn(t, usermem.Addr(0xf000ba12))
+}
+
+// +checkescape:all
+//go:nosplit
+func doCopyOut(t *dummyTask) {
+	var stat test.Stat
+	stat.CopyOut(t, usermem.Addr(0xf000ba12))
+}
+
+// +mustescape:builtin
+// +mustescape:stack
+func doMarshalBytesDirect(t *dummyTask) {
+	var stat test.Stat
+	buf := t.CopyScratchBuffer(stat.SizeBytes())
+	stat.MarshalBytes(buf)
+	t.CopyOutBytes(usermem.Addr(0xf000ba12), buf)
+}
+
+// +mustescape:builtin
+// +mustescape:stack
+func doMarshalUnsafeDirect(t *dummyTask) {
+	var stat test.Stat
+	buf := t.CopyScratchBuffer(stat.SizeBytes())
+	stat.MarshalUnsafe(buf)
+	t.CopyOutBytes(usermem.Addr(0xf000ba12), buf)
+}
+
+// +mustescape:local,heap
+// +mustescape:stack
+func doMarshalBytesViaMarshallable(t *dummyTask) {
+	var stat test.Stat
+	t.MarshalBytes(usermem.Addr(0xf000ba12), &stat)
+}
+
+// +mustescape:local,heap
+// +mustescape:stack
+func doMarshalUnsafeViaMarshallable(t *dummyTask) {
+	var stat test.Stat
+	t.MarshalUnsafe(usermem.Addr(0xf000ba12), &stat)
+}
diff --git a/tools/go_marshal/test/external/external.go b/tools/go_marshal/test/external/external.go
index 4be3722f3..26fe8e0c8 100644
--- a/tools/go_marshal/test/external/external.go
+++ b/tools/go_marshal/test/external/external.go
@@ -21,3 +21,11 @@ package external
 type External struct {
 	j int64
 }
+
+// NotPacked is an unaligned Marshallable type for use in testing.
+//
+// +marshal
+type NotPacked struct {
+	a int32
+	b byte `marshal:"unaligned"`
+}
diff --git a/tools/go_marshal/test/marshal_test.go b/tools/go_marshal/test/marshal_test.go
new file mode 100644
index 000000000..16829ee45
--- /dev/null
+++ b/tools/go_marshal/test/marshal_test.go
@@ -0,0 +1,515 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package marshal_test contains manual tests for the marshal interface. These
+// are intended to test behaviour not covered by the automatically generated
+// tests.
+package marshal_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"reflect"
+	"runtime"
+	"testing"
+	"unsafe"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/tools/go_marshal/analysis"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+	"gvisor.dev/gvisor/tools/go_marshal/test"
+)
+
+var simulatedErr error = syserror.EFAULT
+
+// mockTask implements marshal.Task.
+type mockTask struct {
+	taskMem usermem.BytesIO
+}
+
+// populate fills the task memory with the contents of val.
+func (t *mockTask) populate(val interface{}) {
+	var buf bytes.Buffer
+	// Use binary.Write so we aren't testing go-marshal against its own
+	// potentially buggy implementation.
+	if err := binary.Write(&buf, usermem.ByteOrder, val); err != nil {
+		panic(err)
+	}
+	t.taskMem.Bytes = buf.Bytes()
+}
+
+func (t *mockTask) setLimit(n int) {
+	if len(t.taskMem.Bytes) < n {
+		grown := make([]byte, n)
+		copy(grown, t.taskMem.Bytes)
+		t.taskMem.Bytes = grown
+		return
+	}
+	t.taskMem.Bytes = t.taskMem.Bytes[:n]
+}
+
+// CopyScratchBuffer implements marshal.Task.CopyScratchBuffer.
+func (t *mockTask) CopyScratchBuffer(size int) []byte {
+	return make([]byte, size)
+}
+
+// CopyOutBytes implements marshal.Task.CopyOutBytes. The implementation
+// completely ignores the target address and stores a copy of b in its
+// internally buffer, overriding any previous contents.
+func (t *mockTask) CopyOutBytes(_ usermem.Addr, b []byte) (int, error) {
+	return t.taskMem.CopyOut(nil, 0, b, usermem.IOOpts{})
+}
+
+// CopyInBytes implements marshal.Task.CopyInBytes. The implementation
+// completely ignores the source address and always fills b from the begining of
+// its internal buffer.
+func (t *mockTask) CopyInBytes(_ usermem.Addr, b []byte) (int, error) {
+	return t.taskMem.CopyIn(nil, 0, b, usermem.IOOpts{})
+}
+
+// unsafeMemory returns the underlying memory for m. The returned slice is only
+// valid for the lifetime for m. The garbage collector isn't aware that the
+// returned slice is related to m, the caller must ensure m lives long enough.
+func unsafeMemory(m marshal.Marshallable) []byte {
+	if !m.Packed() {
+		// We can't return a slice pointing to the underlying memory
+		// since the layout isn't packed. Allocate a temporary buffer
+		// and marshal instead.
+		var buf bytes.Buffer
+		if err := binary.Write(&buf, usermem.ByteOrder, m); err != nil {
+			panic(err)
+		}
+		return buf.Bytes()
+	}
+
+	// reflect.ValueOf(m)
+	//   .Elem() // Unwrap interface to inner concrete object
+	//   .Addr() // Pointer value to object
+	//   .Pointer() // Actual address from the pointer value
+	ptr := reflect.ValueOf(m).Elem().Addr().Pointer()
+
+	size := m.SizeBytes()
+
+	var mem []byte
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&mem))
+	hdr.Data = ptr
+	hdr.Len = size
+	hdr.Cap = size
+
+	return mem
+}
+
+// unsafeMemorySlice returns the underlying memory for m. The returned slice is
+// only valid for the lifetime for m. The garbage collector isn't aware that the
+// returned slice is related to m, the caller must ensure m lives long enough.
+//
+// Precondition: m must be a slice.
+func unsafeMemorySlice(m interface{}, elt marshal.Marshallable) []byte {
+	kind := reflect.TypeOf(m).Kind()
+	if kind != reflect.Slice {
+		panic("unsafeMemorySlice called on non-slice")
+	}
+
+	if !elt.Packed() {
+		// We can't return a slice pointing to the underlying memory
+		// since the layout isn't packed. Allocate a temporary buffer
+		// and marshal instead.
+		var buf bytes.Buffer
+		if err := binary.Write(&buf, usermem.ByteOrder, m); err != nil {
+			panic(err)
+		}
+		return buf.Bytes()
+	}
+
+	v := reflect.ValueOf(m)
+	length := v.Len() * elt.SizeBytes()
+
+	var mem []byte
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&mem))
+	hdr.Data = v.Pointer() // This is a pointer to the first elem for slices.
+	hdr.Len = length
+	hdr.Cap = length
+
+	return mem
+}
+
+func isZeroes(buf []byte) bool {
+	for _, b := range buf {
+		if b != 0 {
+			return false
+		}
+	}
+	return true
+}
+
+// compareMemory compares the first n bytes of two chuncks of memory represented
+// by expected and actual.
+func compareMemory(t *testing.T, expected, actual []byte, n int) {
+	t.Logf("Expected (%d): %v (%d) + (%d) %v\n", len(expected), expected[:n], n, len(expected)-n, expected[n:])
+	t.Logf("Actual   (%d): %v (%d) + (%d) %v\n", len(actual), actual[:n], n, len(actual)-n, actual[n:])
+
+	if diff := cmp.Diff(expected[:n], actual[:n]); diff != "" {
+		t.Errorf("Memory buffers don't match:\n--- expected only\n+++ actual only\n%v", diff)
+	}
+}
+
+// limitedCopyIn populates task memory with src, then unmarshals task memory to
+// dst. The task signals an error at limit bytes during copy-in, which should
+// result in a truncated unmarshalling.
+func limitedCopyIn(t *testing.T, src, dst marshal.Marshallable, limit int) {
+	var task mockTask
+	task.populate(src)
+	task.setLimit(limit)
+
+	n, err := dst.CopyIn(&task, usermem.Addr(0))
+	if n != limit {
+		t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
+	}
+	if err != simulatedErr {
+		t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err)
+	}
+
+	expectedMem := unsafeMemory(src)
+	defer runtime.KeepAlive(src)
+	actualMem := unsafeMemory(dst)
+	defer runtime.KeepAlive(dst)
+
+	compareMemory(t, expectedMem, actualMem, n)
+
+	// The last n bytes should be zero for actual, since actual was
+	// zero-initialized, and CopyIn shouldn't have touched those bytes. However
+	// we can only guarantee we didn't touch anything in the last n bytes if the
+	// layout is packed.
+	if dst.Packed() && !isZeroes(actualMem[n:]) {
+		t.Errorf("Expected the last %d bytes of copied in object to be zeroes, got %v\n", dst.SizeBytes()-n, actualMem)
+	}
+}
+
+// limitedCopyOut marshals src to task memory. The task signals an error at
+// limit bytes during copy-out, which should result in a truncated marshalling.
+func limitedCopyOut(t *testing.T, src marshal.Marshallable, limit int) {
+	var task mockTask
+	task.setLimit(limit)
+
+	n, err := src.CopyOut(&task, usermem.Addr(0))
+	if n != limit {
+		t.Errorf("CopyOut copied unexpected number of bytes, expected %d, got %d", limit, n)
+	}
+	if err != simulatedErr {
+		t.Errorf("CopyOut returned unexpected error, expected %v, got %v", simulatedErr, err)
+	}
+
+	expectedMem := unsafeMemory(src)
+	defer runtime.KeepAlive(src)
+	actualMem := task.taskMem.Bytes
+
+	compareMemory(t, expectedMem, actualMem, n)
+}
+
+// copyOutN marshals src to task memory, requesting the marshalling to be
+// limited to limit bytes.
+func copyOutN(t *testing.T, src marshal.Marshallable, limit int) {
+	var task mockTask
+	task.setLimit(limit)
+
+	n, err := src.CopyOutN(&task, usermem.Addr(0), limit)
+	if err != nil {
+		t.Errorf("CopyOut returned unexpected error: %v", err)
+	}
+	if n != limit {
+		t.Errorf("CopyOut copied unexpected number of bytes, expected %d, got %d", limit, n)
+	}
+
+	expectedMem := unsafeMemory(src)
+	defer runtime.KeepAlive(src)
+	actualMem := task.taskMem.Bytes
+
+	t.Logf("Expected: %v + %v\n", expectedMem[:n], expectedMem[n:])
+	t.Logf("Actual  : %v + %v\n", actualMem[:n], actualMem[n:])
+
+	compareMemory(t, expectedMem, actualMem, n)
+}
+
+// TestLimitedMarshalling verifies marshalling/unmarshalling succeeds when the
+// underyling copy in/out operations partially succeed.
+func TestLimitedMarshalling(t *testing.T) {
+	types := []reflect.Type{
+		// Packed types.
+		reflect.TypeOf((*test.Type2)(nil)),
+		reflect.TypeOf((*test.Type3)(nil)),
+		reflect.TypeOf((*test.Timespec)(nil)),
+		reflect.TypeOf((*test.Stat)(nil)),
+		reflect.TypeOf((*test.InetAddr)(nil)),
+		reflect.TypeOf((*test.SignalSet)(nil)),
+		reflect.TypeOf((*test.SignalSetAlias)(nil)),
+		// Non-packed types.
+		reflect.TypeOf((*test.Type1)(nil)),
+		reflect.TypeOf((*test.Type4)(nil)),
+		reflect.TypeOf((*test.Type5)(nil)),
+		reflect.TypeOf((*test.Type6)(nil)),
+		reflect.TypeOf((*test.Type7)(nil)),
+		reflect.TypeOf((*test.Type8)(nil)),
+	}
+
+	for _, tyPtr := range types {
+		// Remove one level of pointer-indirection from the type. We get this
+		// back when we pass the type to reflect.New.
+		ty := tyPtr.Elem()
+
+		// Partial copy-in.
+		t.Run(fmt.Sprintf("PartialCopyIn_%v", ty), func(t *testing.T) {
+			expected := reflect.New(ty).Interface().(marshal.Marshallable)
+			actual := reflect.New(ty).Interface().(marshal.Marshallable)
+			analysis.RandomizeValue(expected)
+
+			limitedCopyIn(t, expected, actual, expected.SizeBytes()/2)
+		})
+
+		// Partial copy-out.
+		t.Run(fmt.Sprintf("PartialCopyOut_%v", ty), func(t *testing.T) {
+			expected := reflect.New(ty).Interface().(marshal.Marshallable)
+			analysis.RandomizeValue(expected)
+
+			limitedCopyOut(t, expected, expected.SizeBytes()/2)
+		})
+
+		// Explicitly request partial copy-out.
+		t.Run(fmt.Sprintf("PartialCopyOutN_%v", ty), func(t *testing.T) {
+			expected := reflect.New(ty).Interface().(marshal.Marshallable)
+			analysis.RandomizeValue(expected)
+
+			copyOutN(t, expected, expected.SizeBytes()/2)
+		})
+	}
+}
+
+// TestLimitedMarshalling verifies marshalling/unmarshalling of slices of
+// marshallable types succeed when the underyling copy in/out operations
+// partially succeed.
+func TestLimitedSliceMarshalling(t *testing.T) {
+	types := []struct {
+		arrayPtrType reflect.Type
+		copySliceIn  func(task marshal.Task, addr usermem.Addr, dstSlice interface{}) (int, error)
+		copySliceOut func(task marshal.Task, addr usermem.Addr, srcSlice interface{}) (int, error)
+		unsafeMemory func(arrPtr interface{}) []byte
+	}{
+		// Packed types.
+		{
+			reflect.TypeOf((*[20]test.Stat)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[20]test.Stat)[:]
+				return test.CopyStatSliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[20]test.Stat)[:]
+				return test.CopyStatSliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[20]test.Stat)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		{
+			reflect.TypeOf((*[1]test.Stat)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[1]test.Stat)[:]
+				return test.CopyStatSliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[1]test.Stat)[:]
+				return test.CopyStatSliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[1]test.Stat)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		{
+			reflect.TypeOf((*[5]test.SignalSetAlias)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[5]test.SignalSetAlias)[:]
+				return test.CopySignalSetAliasSliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[5]test.SignalSetAlias)[:]
+				return test.CopySignalSetAliasSliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[5]test.SignalSetAlias)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		// Non-packed types.
+		{
+			reflect.TypeOf((*[20]test.Type1)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[20]test.Type1)[:]
+				return test.CopyType1SliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[20]test.Type1)[:]
+				return test.CopyType1SliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[20]test.Type1)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		{
+			reflect.TypeOf((*[1]test.Type1)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[1]test.Type1)[:]
+				return test.CopyType1SliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[1]test.Type1)[:]
+				return test.CopyType1SliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[1]test.Type1)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+		{
+			reflect.TypeOf((*[7]test.Type8)(nil)),
+			func(task marshal.Task, addr usermem.Addr, dst interface{}) (int, error) {
+				slice := dst.(*[7]test.Type8)[:]
+				return test.CopyType8SliceIn(task, addr, slice)
+			},
+			func(task marshal.Task, addr usermem.Addr, src interface{}) (int, error) {
+				slice := src.(*[7]test.Type8)[:]
+				return test.CopyType8SliceOut(task, addr, slice)
+			},
+			func(a interface{}) []byte {
+				slice := a.(*[7]test.Type8)[:]
+				return unsafeMemorySlice(slice, &slice[0])
+			},
+		},
+	}
+
+	for _, tt := range types {
+		// The body of this loop is generic over the type tt.arrayPtrType, with
+		// the help of reflection. To aid in readability, comments below show
+		// the equivalent go code assuming
+		// tt.arrayPtrType = typeof(*[20]test.Stat).
+
+		// Equivalent:
+		// var x *[20]test.Stat
+		// arrayTy := reflect.TypeOf(*x)
+		arrayTy := tt.arrayPtrType.Elem()
+
+		// Partial copy-in of slices.
+		t.Run(fmt.Sprintf("PartialCopySliceIn_%v", arrayTy), func(t *testing.T) {
+			// Equivalent:
+			// var x [20]test.Stat
+			// length := len(x)
+			length := arrayTy.Len()
+			if length < 1 {
+				panic("Test type can't be zero-length array")
+			}
+			// Equivalent:
+			// elem := new(test.Stat).(marshal.Marshallable)
+			elem := reflect.New(arrayTy.Elem()).Interface().(marshal.Marshallable)
+
+			// Equivalent:
+			// var expected, actual interface{}
+			// expected = new([20]test.Stat)
+			// actual = new([20]test.Stat)
+			expected := reflect.New(arrayTy).Interface()
+			actual := reflect.New(arrayTy).Interface()
+
+			analysis.RandomizeValue(expected)
+
+			limit := (length * elem.SizeBytes()) / 2
+			// Also make sure the limit is partially inside one of the elements.
+			limit += elem.SizeBytes() / 2
+			analysis.RandomizeValue(expected)
+
+			var task mockTask
+			task.populate(expected)
+			task.setLimit(limit)
+
+			n, err := tt.copySliceIn(&task, usermem.Addr(0), actual)
+			if n != limit {
+				t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
+			}
+			if n < length*elem.SizeBytes() && err != simulatedErr {
+				t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err)
+			}
+
+			expectedMem := tt.unsafeMemory(expected)
+			defer runtime.KeepAlive(expected)
+			actualMem := tt.unsafeMemory(actual)
+			defer runtime.KeepAlive(actual)
+
+			compareMemory(t, expectedMem, actualMem, n)
+
+			// The last n bytes should be zero for actual, since actual was
+			// zero-initialized, and CopyIn shouldn't have touched those bytes. However
+			// we can only guarantee we didn't touch anything in the last n bytes if the
+			// layout is packed.
+			if elem.Packed() && !isZeroes(actualMem[n:]) {
+				t.Errorf("Expected the last %d bytes of copied in object to be zeroes, got %v\n", (elem.SizeBytes()*length)-n, actualMem)
+			}
+		})
+
+		// Partial copy-out of slices.
+		t.Run(fmt.Sprintf("PartialCopySliceOut_%v", arrayTy), func(t *testing.T) {
+			// Equivalent:
+			// var x [20]test.Stat
+			// length := len(x)
+			length := arrayTy.Len()
+			if length < 1 {
+				panic("Test type can't be zero-length array")
+			}
+			// Equivalent:
+			// elem := new(test.Stat).(marshal.Marshallable)
+			elem := reflect.New(arrayTy.Elem()).Interface().(marshal.Marshallable)
+
+			// Equivalent:
+			// var expected, actual interface{}
+			// expected = new([20]test.Stat)
+			// actual = new([20]test.Stat)
+			expected := reflect.New(arrayTy).Interface()
+
+			analysis.RandomizeValue(expected)
+
+			limit := (length * elem.SizeBytes()) / 2
+			// Also make sure the limit is partially inside one of the elements.
+			limit += elem.SizeBytes() / 2
+			analysis.RandomizeValue(expected)
+
+			var task mockTask
+			task.populate(expected)
+			task.setLimit(limit)
+
+			n, err := tt.copySliceOut(&task, usermem.Addr(0), expected)
+			if n != limit {
+				t.Errorf("CopyIn copied unexpected number of bytes, expected %d, got %d", limit, n)
+			}
+			if n < length*elem.SizeBytes() && err != simulatedErr {
+				t.Errorf("CopyIn returned unexpected error, expected %v, got %v", simulatedErr, err)
+			}
+
+			expectedMem := tt.unsafeMemory(expected)
+			defer runtime.KeepAlive(expected)
+			actualMem := task.taskMem.Bytes
+
+			compareMemory(t, expectedMem, actualMem, n)
+		})
+	}
+}
diff --git a/tools/go_marshal/test/test.go b/tools/go_marshal/test/test.go
index 8de02d707..f75ca1b7f 100644
--- a/tools/go_marshal/test/test.go
+++ b/tools/go_marshal/test/test.go
@@ -23,7 +23,7 @@ import (
 
 // Type1 is a test data type.
 //
-// +marshal
+// +marshal slice:Type1Slice
 type Type1 struct {
 	a    Type2
 	x, y int64 // Multiple field names.
@@ -75,6 +75,34 @@ type Type5 struct {
 	m int64
 }
 
+// Type6 is a test data type ends mid-word.
+//
+// +marshal
+type Type6 struct {
+	a int64
+	b int64
+	// If c isn't marked unaligned, analysis fails (as it should, since
+	// the unsafe API corrupts Type7).
+	c byte `marshal:"unaligned"`
+}
+
+// Type7 is a test data type that contains a child struct that ends
+// mid-word.
+// +marshal
+type Type7 struct {
+	x Type6
+	y int64
+}
+
+// Type8 is a test data type which contains an external non-packed field.
+//
+// +marshal slice:Type8Slice
+type Type8 struct {
+	a  int64
+	np ex.NotPacked
+	b  int64
+}
+
 // Timespec represents struct timespec in <time.h>.
 //
 // +marshal
@@ -85,7 +113,7 @@ type Timespec struct {
 
 // Stat represents struct stat.
 //
-// +marshal
+// +marshal slice:StatSlice
 type Stat struct {
 	Dev     uint64
 	Ino     uint64
@@ -103,3 +131,46 @@ type Stat struct {
 	CTime   Timespec
 	_       [3]int64
 }
+
+// InetAddr is an example marshallable newtype on an array.
+//
+// +marshal
+type InetAddr [4]byte
+
+// SignalSet is an example marshallable newtype on a primitive.
+//
+// +marshal slice:SignalSetSlice:inner
+type SignalSet uint64
+
+// SignalSetAlias is an example newtype on another marshallable type.
+//
+// +marshal slice:SignalSetAliasSlice
+type SignalSetAlias SignalSet
+
+const sizeA = 64
+const sizeB = 8
+
+// TestArray is a test data structure on an array with a constant length.
+//
+// +marshal
+type TestArray [sizeA]int32
+
+// TestArray2 is a newtype on an array with a simple arithmetic expression of
+// constants for the array length.
+//
+// +marshal
+type TestArray2 [sizeA * sizeB]int32
+
+// TestArray2 is a newtype on an array with a simple arithmetic expression of
+// mixed constants and literals for the array length.
+//
+// +marshal
+type TestArray3 [sizeA*sizeB + 12]int32
+
+// Type9 is a test data type containing an array with a non-literal length.
+//
+// +marshal
+type Type9 struct {
+	x int64
+	y [sizeA]int32
+}
diff --git a/tools/go_mod.sh b/tools/go_mod.sh
new file mode 100755
index 000000000..84b779d6d
--- /dev/null
+++ b/tools/go_mod.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eo pipefail
+
+# Build the :gopath target.
+bazel build //:gopath
+declare -r gopathdir="bazel-bin/gopath/src/gvisor.dev/gvisor/"
+
+# Copy go.mod and execute the command.
+cp -a go.mod go.sum "${gopathdir}"
+(cd "${gopathdir}" && go mod "$@")
+cp -a "${gopathdir}/go.mod" "${gopathdir}/go.sum" .
+
+# Cleanup the WORKSPACE file.
+bazel run //:gazelle -- update-repos -from_file=go.mod
diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go
index 3437aa476..309ee9c21 100644
--- a/tools/go_stateify/main.go
+++ b/tools/go_stateify/main.go
@@ -206,7 +206,7 @@ func main() {
 		initCalls = append(initCalls, fmt.Sprintf("%sRegister(\"%s.%s\", (*%s)(nil), state.Fns{Save: (*%s).save, Load: (*%s).load})", statePrefix, *fullPkg, name, name, name, name))
 	}
 	emitZeroCheck := func(name string) {
-		fmt.Fprintf(outputFile, "	if !%sIsZeroValue(x.%s) { m.Failf(\"%s is %%v, expected zero\", x.%s) }\n", statePrefix, name, name, name)
+		fmt.Fprintf(outputFile, "	if !%sIsZeroValue(&x.%s) { m.Failf(\"%s is %%#v, expected zero\", &x.%s) }\n", statePrefix, name, name, name)
 	}
 	emitLoadValue := func(name, typName string) {
 		fmt.Fprintf(outputFile, "	m.LoadValue(\"%s\", new(%s), func(y interface{}) { x.load%s(y.(%s)) })\n", name, typName, camelCased(name), typName)
diff --git a/tools/image_build.sh b/tools/image_build.sh
deleted file mode 100755
index 9b20a740d..000000000
--- a/tools/image_build.sh
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script is responsible for building a new GCP image that: 1) has nested
-# virtualization enabled, and 2) has been completely set up with the
-# image_setup.sh script. This script should be idempotent, as we memoize the
-# setup script with a hash and check for that name.
-#
-# The GCP project name should be defined via a gcloud config.
-
-set -xeo pipefail
-
-# Parameters.
-declare -r ZONE=${ZONE:-us-central1-f}
-declare -r USERNAME=${USERNAME:-test}
-declare -r IMAGE_PROJECT=${IMAGE_PROJECT:-ubuntu-os-cloud}
-declare -r IMAGE_FAMILY=${IMAGE_FAMILY:-ubuntu-1604-lts}
-
-# Random names.
-declare -r DISK_NAME=$(mktemp -u disk-XXXXXX | tr A-Z a-z)
-declare -r SNAPSHOT_NAME=$(mktemp -u snapshot-XXXXXX | tr A-Z a-z)
-declare -r INSTANCE_NAME=$(mktemp -u build-XXXXXX | tr A-Z a-z)
-
-# Hashes inputs.
-declare -r SETUP_BLOB=$(echo ${ZONE} ${USERNAME} ${IMAGE_PROJECT} ${IMAGE_FAMILY} && sha256sum "$@")
-declare -r SETUP_HASH=$(echo ${SETUP_BLOB} | sha256sum - | cut -d' ' -f1 | cut -c 1-16)
-declare -r IMAGE_NAME=${IMAGE_NAME:-image-}${SETUP_HASH}
-
-# Does the image already exist? Skip the build.
-declare -r existing=$(gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
-if ! [[ -z "${existing}" ]]; then
-  echo "${existing}"
-  exit 0
-fi
-
-# Set the zone for all actions.
-gcloud config set compute/zone "${ZONE}"
-
-# Start a unique instance. Note that this instance will have a unique persistent
-# disk as it's boot disk with the same name as the instance.
-gcloud compute instances create \
-    --quiet \
-    --image-project "${IMAGE_PROJECT}" \
-    --image-family "${IMAGE_FAMILY}" \
-    --boot-disk-size "200GB" \
-    "${INSTANCE_NAME}"
-function cleanup {
-    gcloud compute instances delete --quiet "${INSTANCE_NAME}"
-}
-trap cleanup EXIT
-
-# Wait for the instance to become available.
-declare attempts=0
-while [[ "${attempts}" -lt 30 ]]; do
-  attempts=$((${attempts}+1))
-  if gcloud compute ssh "${USERNAME}"@"${INSTANCE_NAME}" -- true; then
-    break
-  fi
-done
-if [[ "${attempts}" -ge 30 ]]; then
-  echo "too many attempts: failed"
-  exit 1
-fi
-
-# Run the install scripts provided.
-for arg; do
-  gcloud compute ssh "${USERNAME}"@"${INSTANCE_NAME}" -- sudo bash - <"${arg}"
-done
-
-# Stop the instance; required before creating an image.
-gcloud compute instances stop --quiet "${INSTANCE_NAME}"
-
-# Create a snapshot of the instance disk.
-gcloud compute disks snapshot \
-    --quiet \
-    --zone="${ZONE}" \
-    --snapshot-names="${SNAPSHOT_NAME}" \
-    "${INSTANCE_NAME}"
-
-# Create the disk image.
-gcloud compute images create \
-    --quiet \
-    --source-snapshot="${SNAPSHOT_NAME}" \
-    --licenses="https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" \
-    "${IMAGE_NAME}"
diff --git a/tools/installers/BUILD b/tools/installers/BUILD
index d78a265ca..caa7b1983 100644
--- a/tools/installers/BUILD
+++ b/tools/installers/BUILD
@@ -17,6 +17,14 @@ sh_binary(
 )
 
 sh_binary(
+    name = "images",
+    srcs = ["images.sh"],
+    data = [
+        "//images",
+    ],
+)
+
+sh_binary(
     name = "master",
     srcs = ["master.sh"],
 )
diff --git a/tools/installers/head.sh b/tools/installers/head.sh
index 9de8f138c..7fc566ebd 100755
--- a/tools/installers/head.sh
+++ b/tools/installers/head.sh
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 # Install our runtime.
-$(dirname $0)/runsc install
+$(find . -executable -type f -name runsc) install
 
 # Restart docker.
 service docker restart || true
diff --git a/tools/installers/images.sh b/tools/installers/images.sh
new file mode 100755
index 000000000..52e750f57
--- /dev/null
+++ b/tools/installers/images.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeuo pipefail
+
+# Find the images directory.
+for images in $(find . -type d -name images); do
+  if [[ -f "${images}"/Makefile ]]; then
+    make -C "${images}" load-all-images
+  fi
+done
diff --git a/tools/installers/master.sh b/tools/installers/master.sh
index 7b1956454..2c6001c6c 100755
--- a/tools/installers/master.sh
+++ b/tools/installers/master.sh
@@ -15,6 +15,20 @@
 # limitations under the License.
 
 # Install runsc from the master branch.
+set -e
+
 curl -fsSL https://gvisor.dev/archive.key | sudo apt-key add -
 add-apt-repository "deb https://storage.googleapis.com/gvisor/releases release main"
-apt-get update && apt-get install -y runsc
+
+while true; do
+  if (apt-get update && apt-get install -y runsc); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+runsc install
+service docker restart
diff --git a/tools/issue_reviver/main.go b/tools/issue_reviver/main.go
index 4256f5a6c..47c796b8a 100644
--- a/tools/issue_reviver/main.go
+++ b/tools/issue_reviver/main.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"io/ioutil"
 	"os"
+	"strings"
 
 	"gvisor.dev/gvisor/tools/issue_reviver/github"
 	"gvisor.dev/gvisor/tools/issue_reviver/reviver"
@@ -35,14 +36,22 @@ var (
 
 // Keep the options simple for now. Supports only a single path and repo.
 func init() {
-	flag.StringVar(&owner, "owner", "google", "Github project org/owner to look for issues")
-	flag.StringVar(&repo, "repo", "gvisor", "Github repo to look for issues")
+	flag.StringVar(&owner, "owner", "", "Github project org/owner to look for issues")
+	flag.StringVar(&repo, "repo", "", "Github repo to look for issues")
 	flag.StringVar(&tokenFile, "oauth-token-file", "", "Path to file containing the OAUTH token to be used as credential to github")
-	flag.StringVar(&path, "path", "", "Path to scan for TODOs")
+	flag.StringVar(&path, "path", ".", "Path to scan for TODOs")
 	flag.BoolVar(&dryRun, "dry-run", false, "If set to true, no changes are made to issues")
 }
 
 func main() {
+	// Set defaults from the environment.
+	repository := os.Getenv("GITHUB_REPOSITORY")
+	if parts := strings.SplitN(repository, "/", 2); len(parts) == 2 {
+		owner = parts[0]
+		repo = parts[1]
+	}
+
+	// Parse flags.
 	flag.Parse()
 
 	// Check for mandatory parameters.
@@ -62,8 +71,10 @@ func main() {
 		os.Exit(1)
 	}
 
-	// Token is passed as a file so it doesn't show up in command line arguments.
-	var token string
+	// The access token may be passed as a file so it doesn't show up in
+	// command line arguments. It also may be provided through the
+	// environment to faciliate use through GitHub's CI system.
+	token := os.Getenv("GITHUB_TOKEN")
 	if len(tokenFile) != 0 {
 		bytes, err := ioutil.ReadFile(tokenFile)
 		if err != nil {
diff --git a/tools/make_apt.sh b/tools/make_apt.sh
new file mode 100755
index 000000000..3fb1066e5
--- /dev/null
+++ b/tools/make_apt.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [[ "$#" -le 3 ]]; then
+  echo "usage: $0 <private-key> <suite> <root> <packages...>"
+  exit 1
+fi
+declare -r private_key=$(readlink -e "$1"); shift
+declare -r suite="$1"; shift
+declare -r root="$1"; shift
+
+# Ensure that we have the correct packages installed.
+function apt_install() {
+  while true; do
+    sudo apt-get update &&
+      sudo apt-get install -y "$@" &&
+      true
+    result="${?}"
+    case $result in
+      0)
+        break
+        ;;
+      100)
+        # 100 is the error code that apt-get returns.
+        ;;
+      *)
+        exit $result
+        ;;
+    esac
+  done
+}
+dpkg-sig --help >/dev/null 2>&1       || apt_install dpkg-sig
+apt-ftparchive --help >/dev/null 2>&1 || apt_install apt-utils
+xz --help >/dev/null 2>&1             || apt_install xz-utils
+
+# Verbose from this point.
+set -xeo pipefail
+
+# Create a directory for the release.
+declare -r release="${root}/dists/${suite}"
+mkdir -p "${release}"
+
+# Create a temporary keyring, and ensure it is cleaned up.
+declare -r keyring=$(mktemp /tmp/keyringXXXXXX.gpg)
+cleanup() {
+  rm -f "${keyring}"
+}
+trap cleanup EXIT
+
+# We attempt the import twice because the first one will fail if the public key
+# is not found. This isn't actually a failure for us, because we don't require
+# the public (this may be stored separately). The second import will succeed
+# because, in reality, the first import succeeded and it's a no-op.
+gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}" || \
+  gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}"
+
+# Copy the packages into the root.
+for pkg in "$@"; do
+  ext=${pkg##*.}
+  name=$(basename "${pkg}" ".${ext}")
+  arch=${name##*_}
+  if [[ "${name}" == "${arch}" ]]; then
+    continue # Not a regular package.
+  fi
+  if [[ "${pkg}" =~ ^.*\.deb$ ]]; then
+    # Extract from the debian file.
+    version=$(dpkg --info "${pkg}" | grep -E 'Version:' | cut -d':' -f2)
+  elif [[ "${pkg}" =~ ^.*\.changes$ ]]; then
+    # Extract from the changes file.
+    version=$(grep -E 'Version:' "${pkg}" | cut -d':' -f2)
+  else
+    # Unsupported file type.
+    echo "Unknown file type: ${pkg}"
+    exit 1
+  fi
+
+  # The package may already exist, in which case we leave it alone.
+  version=${version// /} # Trim whitespace.
+  destdir="${root}/pool/${version}/binary-${arch}"
+  target="${destdir}/${name}.${ext}"
+  if [[ -f "${target}" ]]; then
+    continue
+  fi
+
+  # Copy & sign the package.
+  mkdir -p "${destdir}"
+  cp -a "${pkg}" "${target}"
+  chmod 0644 "${target}"
+  if [[ "${ext}" == "deb" ]]; then
+    dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${target}"
+  fi
+done
+
+# Build the package list.
+declare arches=()
+for dir in "${root}"/pool/*/binary-*; do
+  name=$(basename "${dir}")
+  arch=${name##binary-}
+  arches+=("${arch}")
+  repo_packages="${release}"/main/"${name}"
+  mkdir -p "${repo_packages}"
+  (cd "${root}" && apt-ftparchive --arch "${arch}" packages pool > "${repo_packages}"/Packages)
+  (cd "${repo_packages}" && cat Packages | gzip > Packages.gz)
+  (cd "${repo_packages}" && cat Packages | xz > Packages.xz)
+done
+
+# Build the release list.
+cat > "${release}"/apt.conf <<EOF
+APT {
+  FTPArchive {
+    Release {
+      Architectures "${arches[@]}";
+      Suite "${suite}";
+      Components "main";
+    };
+  };
+};
+EOF
+(cd "${release}" && apt-ftparchive -c=apt.conf release . > Release)
+rm "${release}"/apt.conf
+
+# Sign the release.
+declare -r digest_opts=("--digest-algo" "SHA512" "--cert-digest-algo" "SHA512")
+(cd "${release}" && rm -f Release.gpg InRelease)
+(cd "${release}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign "${digest_opts[@]}" -o InRelease Release)
+(cd "${release}" && gpg --no-default-keyring --keyring "${keyring}" -abs "${digest_opts[@]}" -o Release.gpg Release)
diff --git a/tools/make_release.sh b/tools/make_release.sh
new file mode 100755
index 000000000..b1cdd47b0
--- /dev/null
+++ b/tools/make_release.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [[ "$#" -le 2 ]]; then
+  echo "usage: $0 <private-key> <root> <binaries & packages...>"
+  echo "The environment variable NIGHTLY may be set to control"
+  echo "whether the nightly packages are produced or not."
+  exit 1
+fi
+
+set -xeo pipefail
+declare -r private_key="$1"; shift
+declare -r root="$1"; shift
+declare -a binaries
+declare -a pkgs
+
+# Collect binaries & packages.
+for arg in "$@"; do
+  if [[ "${arg}" == *.deb ]] || [[ "${arg}" == *.changes ]]; then
+    pkgs+=("${arg}")
+  else
+    binaries+=("${arg}")
+  fi
+done
+
+# install_raw installs raw artifacts.
+install_raw() {
+  mkdir -p "${root}/$1"
+  for binary in "${binaries[@]}"; do
+    # Copy the raw file & generate a sha512sum.
+    name=$(basename "${binary}")
+    cp -f "${binary}" "${root}/$1"
+    sha512sum "${root}/$1/${name}" | \
+        awk "{print $$1 \"  ${name}\"}" > "${root}/$1/${name}.sha512"
+  done
+}
+
+# install_apt installs an apt repository.
+install_apt() {
+  tools/make_apt.sh "${private_key}" "$1" "${root}" "${pkgs[@]}"
+}
+
+# If nightly, install only nightly artifacts.
+if [[ "${NIGHTLY:-false}" == "true" ]]; then
+  # The "latest" directory and current date.
+  stamp="$(date -Idate)"
+  install_raw "nightly/latest"
+  install_raw "nightly/${stamp}"
+  install_apt "nightly"
+else
+  # Is it a tagged release? Build that.
+  tags="$(git tag --points-at HEAD 2>/dev/null || true)"
+  if ! [[ -z "${tags}" ]]; then
+    # Note that a given commit can match any number of tags. We have to iterate
+    # through all possible tags and produce associated artifacts.
+    for tag in ${tags}; do
+      name=$(echo "${tag}" | cut -d'-' -f2)
+      base=$(echo "${name}" | cut -d'.' -f1)
+      install_raw "release/${name}"
+      install_raw "release/latest"
+      install_apt "release"
+      install_apt "${base}"
+    done
+  else
+    # Otherwise, assume it is a raw master commit.
+    install_raw "master/latest"
+    install_apt "master"
+  fi
+fi
diff --git a/tools/make_repository.sh b/tools/make_repository.sh
deleted file mode 100755
index 27ffbc9f3..000000000
--- a/tools/make_repository.sh
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Parse arguments. We require more than two arguments, which are the private
-# keyring, the e-mail associated with the signer, and the list of packages.
-if [ "$#" -le 3 ]; then
-  echo "usage: $0 <private-key> <signer-email> <component> <root> <packages...>"
-  exit 1
-fi
-declare -r private_key=$(readlink -e "$1"); shift
-declare -r signer="$1"; shift
-declare -r component="$1"; shift
-declare -r root="$1"; shift
-
-# Verbose from this point.
-set -xeo pipefail
-
-# Create a temporary working directory. We don't remove this, as we ultimately
-# print this result and allow the caller to copy wherever they would like.
-declare -r tmpdir=$(mktemp -d /tmp/repoXXXXXX)
-
-# Create a temporary keyring, and ensure it is cleaned up.
-declare -r keyring=$(mktemp /tmp/keyringXXXXXX.gpg)
-cleanup() {
-  rm -f "${keyring}"
-}
-trap cleanup EXIT
-gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}" >&2
-
-# Copy the packages into the root.
-for pkg in "$@"; do
-  name=$(basename "${pkg}" .deb)
-  name=$(basename "${name}" .changes)
-  arch=${name##*_}
-  if [[ "${name}" == "${arch}" ]]; then
-    continue # Not a regular package.
-  fi
-  if [[ "${pkg}" =~ ^.*\.deb$ ]]; then
-    # Extract from the debian file.
-    version=$(dpkg --info "${pkg}" | grep -E 'Version:' | cut -d':' -f2)
-  elif [[ "${pkg}" =~ ^.*\.changes$ ]]; then
-    # Extract from the changes file.
-    version=$(grep -E 'Version:' "${pkg}" | cut -d':' -f2)
-  else
-    # Unsupported file type.
-    echo "Unknown file type: ${pkg}"
-    exit 1
-  fi
-  version=${version// /} # Trim whitespace.
-  mkdir -p "${root}"/pool/"${version}"/binary-"${arch}"
-  cp -a "${pkg}" "${root}"/pool/"${version}"/binary-"${arch}"
-done
-
-# Ensure all permissions are correct.
-find "${root}"/pool -type f -exec chmod 0644 {} \;
-
-# Sign all packages.
-for file in "${root}"/pool/*/binary-*/*.deb; do
-  dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${file}" >&2
-done
-
-# Build the package list.
-declare arches=()
-for dir in "${root}"/pool/*/binary-*; do
-  name=$(basename "${dir}")
-  arch=${name##binary-}
-  arches+=("${arch}")
-  repo_packages="${tmpdir}"/"${component}"/"${name}"
-  mkdir -p "${repo_packages}"
-  (cd "${root}" && apt-ftparchive --arch "${arch}" packages pool > "${repo_packages}"/Packages)
-  (cd "${repo_packages}" && cat Packages | gzip > Packages.gz)
-  (cd "${repo_packages}" && cat Packages | xz > Packages.xz)
-done
-
-# Build the release list.
-cat > "${tmpdir}"/apt.conf <<EOF
-APT {
-  FTPArchive {
-    Release {
-      Architectures "${arches[@]}";
-      Components "${component}";
-    };
-  };
-};
-EOF
-(cd "${tmpdir}" && apt-ftparchive -c=apt.conf release . > Release)
-rm "${tmpdir}"/apt.conf
-
-# Sign the release.
-declare -r digest_opts=("--digest-algo" "SHA512" "--cert-digest-algo" "SHA512")
-(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign "${digest_opts[@]}" -o InRelease Release >&2)
-(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" -abs "${digest_opts[@]}" -o Release.gpg Release >&2)
-
-# Show the results.
-echo "${tmpdir}"
diff --git a/tools/nogo.js b/tools/nogo.js
deleted file mode 100644
index fc0a4d1f0..000000000
--- a/tools/nogo.js
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "checkunsafe": {
-    "exclude_files": {
-      "/external/": "not subject to constraint"
-    }
-  }
-}
diff --git a/tools/nogo/BUILD b/tools/nogo/BUILD
new file mode 100644
index 000000000..c21b09511
--- /dev/null
+++ b/tools/nogo/BUILD
@@ -0,0 +1,49 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "nogo",
+    srcs = [
+        "build.go",
+        "config.go",
+        "matchers.go",
+        "nogo.go",
+        "register.go",
+    ],
+    nogo = False,
+    visibility = ["//:sandbox"],
+    deps = [
+        "//tools/checkescape",
+        "//tools/checkunsafe",
+        "//tools/nogo/data",
+        "@org_golang_x_tools//go/analysis:go_tool_library",
+        "@org_golang_x_tools//go/analysis/internal/facts:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/asmdecl:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/assign:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/atomic:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/bools:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/buildtag:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/cgocall:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/composite:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/copylock:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/errorsas:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/httpresponse:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/loopclosure:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/lostcancel:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/nilfunc:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/nilness:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/printf:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/shadow:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/shift:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/stdmethods:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/stringintconv:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/structtag:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/tests:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unmarshal:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unreachable:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unsafeptr:go_tool_library",
+        "@org_golang_x_tools//go/analysis/passes/unusedresult:go_tool_library",
+        "@org_golang_x_tools//go/gcexportdata:go_tool_library",
+    ],
+)
diff --git a/tools/nogo/README.md b/tools/nogo/README.md
new file mode 100644
index 000000000..6e4db18de
--- /dev/null
+++ b/tools/nogo/README.md
@@ -0,0 +1,31 @@
+# Extended "nogo" analysis
+
+This package provides a build aspect that perform nogo analysis. This will be
+automatically injected to all relevant libraries when using the default
+`go_binary` and `go_library` rules.
+
+It exists for several reasons.
+
+*   The default `nogo` provided by bazel is insufficient with respect to the
+    possibility of binary analysis. This package allows us to analyze the
+    generated binary in addition to using the standard analyzers.
+
+*   The configuration provided in this package is much richer than the standard
+    `nogo` JSON blob. Specifically, it allows us to exclude specific structures
+    from the composite rules (such as the Ranges that are common with the set
+    types).
+
+*   The bazel version of `nogo` is run directly against the `go_library` and
+    `go_binary` targets, meaning that any change to the configuration requires a
+    rebuild from scratch (for some reason included all C++ source files in the
+    process). Using an aspect is more efficient in this regard.
+
+*   The checks supported by this package are exported as tests, which makes it
+    easier to reason about and plumb into the build system.
+
+*   For uninteresting reasons, it is impossible to integrate the default `nogo`
+    analyzer provided by bazel with internal Google tooling. To provide a
+    consistent experience, this package allows those systems to be unified.
+
+To use this package, import `nogo_test` from `defs.bzl` and add a single
+dependency which is a `go_binary` or `go_library` rule.
diff --git a/tools/nogo/build.go b/tools/nogo/build.go
new file mode 100644
index 000000000..1c0d08661
--- /dev/null
+++ b/tools/nogo/build.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"fmt"
+	"io"
+	"os"
+)
+
+var (
+	// internalPrefix is the internal path prefix. Note that this is not
+	// special, as paths should be passed relative to the repository root
+	// and should not have any special prefix applied.
+	internalPrefix = fmt.Sprintf("^")
+
+	// externalPrefix is external workspace packages.
+	externalPrefix = "^external/"
+)
+
+// findStdPkg needs to find the bundled standard library packages.
+func findStdPkg(path, GOOS, GOARCH string) (io.ReadCloser, error) {
+	return os.Open(fmt.Sprintf("external/go_sdk/pkg/%s_%s/%s.a", GOOS, GOARCH, path))
+}
diff --git a/tools/nogo/check/BUILD b/tools/nogo/check/BUILD
new file mode 100644
index 000000000..e2d76cd5c
--- /dev/null
+++ b/tools/nogo/check/BUILD
@@ -0,0 +1,12 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+# Note that the check binary must be public, since an aspect may be applied
+# across lots of different rules in different repositories.
+go_binary(
+    name = "check",
+    srcs = ["main.go"],
+    visibility = ["//visibility:public"],
+    deps = ["//tools/nogo"],
+)
diff --git a/runsc/boot/loader_amd64.go b/tools/nogo/check/main.go
index b9669f2ac..3828edf3a 100644
--- a/runsc/boot/loader_amd64.go
+++ b/tools/nogo/check/main.go
@@ -12,16 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build amd64
-
-package boot
+// Binary check is the nogo entrypoint.
+package main
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/tools/nogo"
 )
 
-func init() {
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(linux.AMD64)
+func main() {
+	nogo.Main()
 }
diff --git a/tools/nogo/config.go b/tools/nogo/config.go
new file mode 100644
index 000000000..6958fca69
--- /dev/null
+++ b/tools/nogo/config.go
@@ -0,0 +1,116 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"golang.org/x/tools/go/analysis"
+	"golang.org/x/tools/go/analysis/passes/asmdecl"
+	"golang.org/x/tools/go/analysis/passes/assign"
+	"golang.org/x/tools/go/analysis/passes/atomic"
+	"golang.org/x/tools/go/analysis/passes/bools"
+	"golang.org/x/tools/go/analysis/passes/buildtag"
+	"golang.org/x/tools/go/analysis/passes/cgocall"
+	"golang.org/x/tools/go/analysis/passes/composite"
+	"golang.org/x/tools/go/analysis/passes/copylock"
+	"golang.org/x/tools/go/analysis/passes/errorsas"
+	"golang.org/x/tools/go/analysis/passes/httpresponse"
+	"golang.org/x/tools/go/analysis/passes/loopclosure"
+	"golang.org/x/tools/go/analysis/passes/lostcancel"
+	"golang.org/x/tools/go/analysis/passes/nilfunc"
+	"golang.org/x/tools/go/analysis/passes/nilness"
+	"golang.org/x/tools/go/analysis/passes/printf"
+	"golang.org/x/tools/go/analysis/passes/shadow"
+	"golang.org/x/tools/go/analysis/passes/shift"
+	"golang.org/x/tools/go/analysis/passes/stdmethods"
+	"golang.org/x/tools/go/analysis/passes/stringintconv"
+	"golang.org/x/tools/go/analysis/passes/structtag"
+	"golang.org/x/tools/go/analysis/passes/tests"
+	"golang.org/x/tools/go/analysis/passes/unmarshal"
+	"golang.org/x/tools/go/analysis/passes/unreachable"
+	"golang.org/x/tools/go/analysis/passes/unsafeptr"
+	"golang.org/x/tools/go/analysis/passes/unusedresult"
+
+	"gvisor.dev/gvisor/tools/checkescape"
+	"gvisor.dev/gvisor/tools/checkunsafe"
+)
+
+var analyzerConfig = map[*analysis.Analyzer]matcher{
+	// Standard analyzers.
+	asmdecl.Analyzer: alwaysMatches(),
+	assign.Analyzer: externalExcluded(
+		".*gazelle/walk/walk.go", // False positive.
+	),
+	atomic.Analyzer:   alwaysMatches(),
+	bools.Analyzer:    alwaysMatches(),
+	buildtag.Analyzer: alwaysMatches(),
+	cgocall.Analyzer:  alwaysMatches(),
+	composite.Analyzer: and(
+		disableMatches(), // Disabled for now.
+		resultExcluded{
+			"Object_",
+			"Range{",
+		},
+	),
+	copylock.Analyzer:     internalMatches(), // Common external issues (e.g. protos).
+	errorsas.Analyzer:     alwaysMatches(),
+	httpresponse.Analyzer: alwaysMatches(),
+	loopclosure.Analyzer:  alwaysMatches(),
+	lostcancel.Analyzer:   internalMatches(), // Common external issues.
+	nilfunc.Analyzer:      alwaysMatches(),
+	nilness.Analyzer: and(
+		internalMatches(), // Common "tautological checks".
+		internalExcluded(
+			"pkg/sentry/platform/kvm/kvm_test.go", // Intentional.
+			"tools/bigquery/bigquery.go",          // False positive.
+		),
+	),
+	printf.Analyzer:     alwaysMatches(),
+	shift.Analyzer:      alwaysMatches(),
+	stdmethods.Analyzer: internalMatches(), // Common external issues (e.g. methods named "Write").
+	stringintconv.Analyzer: and(
+		internalExcluded(),
+		externalExcluded(
+			".*protobuf/.*.go",              // Bad conversions.
+			".*flate/huffman_bit_writer.go", // Bad conversion.
+		),
+	),
+	shadow.Analyzer:      disableMatches(),  // Disabled for now.
+	structtag.Analyzer:   internalMatches(), // External not subject to rules.
+	tests.Analyzer:       alwaysMatches(),
+	unmarshal.Analyzer:   alwaysMatches(),
+	unreachable.Analyzer: internalMatches(),
+	unsafeptr.Analyzer: and(
+		internalMatches(),
+		internalExcluded(
+			".*_test.go",                                               // Exclude tests.
+			"pkg/flipcall/.*_unsafe.go",                                // Special case.
+			"pkg/gohacks/gohacks_unsafe.go",                            // Special case.
+			"pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go",          // Special case.
+			"pkg/sentry/platform/kvm/bluepill_unsafe.go",               // Special case.
+			"pkg/sentry/platform/kvm/machine_unsafe.go",                // Special case.
+			"pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go", // Special case.
+			"pkg/sentry/platform/safecopy/safecopy_unsafe.go",          // Special case.
+			"pkg/sentry/vfs/mount_unsafe.go",                           // Special case.
+			"pkg/sentry/platform/systrap/stub_unsafe.go",               // Special case.
+			"pkg/sentry/platform/systrap/switchto_google_unsafe.go",    // Special case.
+			"pkg/sentry/platform/systrap/sysmsg_thread_unsafe.go",      // Special case.
+		),
+	),
+	unusedresult.Analyzer: alwaysMatches(),
+
+	// Internal analyzers: external packages not subject.
+	checkescape.Analyzer: internalMatches(),
+	checkunsafe.Analyzer: internalMatches(),
+}
diff --git a/tools/nogo/data/BUILD b/tools/nogo/data/BUILD
new file mode 100644
index 000000000..b7564cc44
--- /dev/null
+++ b/tools/nogo/data/BUILD
@@ -0,0 +1,10 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "data",
+    srcs = ["data.go"],
+    nogo = False,
+    visibility = ["//tools:__subpackages__"],
+)
diff --git a/pkg/fspath/builder_unsafe.go b/tools/nogo/data/data.go
index 75606808d..eb84d0d27 100644
--- a/pkg/fspath/builder_unsafe.go
+++ b/tools/nogo/data/data.go
@@ -12,16 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package fspath
-
-import (
-	"unsafe"
-)
+// Package data contains shared data for nogo analysis.
+//
+// This is used to break a dependency cycle.
+package data
 
-// String returns the accumulated string. No other methods should be called
-// after String.
-func (b *Builder) String() string {
-	bs := b.buf[b.start:]
-	// Compare strings.Builder.String().
-	return *(*string)(unsafe.Pointer(&bs))
-}
+// Objdump is the dumped binary under analysis.
+var Objdump string
diff --git a/tools/nogo/defs.bzl b/tools/nogo/defs.bzl
new file mode 100644
index 000000000..6560b57c8
--- /dev/null
+++ b/tools/nogo/defs.bzl
@@ -0,0 +1,172 @@
+"""Nogo rules."""
+
+load("//tools/bazeldefs:defs.bzl", "go_context", "go_importpath", "go_rule")
+
+# NogoInfo is the serialized set of package facts for a nogo analysis.
+#
+# Each go_library rule will generate a corresponding nogo rule, which will run
+# with the source files as input. Note however, that the individual nogo rules
+# are simply stubs that enter into the shadow dependency tree (the "aspect").
+NogoInfo = provider(
+    fields = {
+        "facts": "serialized package facts",
+        "importpath": "package import path",
+        "binaries": "package binary files",
+    },
+)
+
+def _nogo_aspect_impl(target, ctx):
+    # If this is a nogo rule itself (and not the shadow of a go_library or
+    # go_binary rule created by such a rule), then we simply return nothing.
+    # All work is done in the shadow properties for go rules. For a proto
+    # library, we simply skip the analysis portion but still need to return a
+    # valid NogoInfo to reference the generated binary.
+    if ctx.rule.kind == "go_library":
+        srcs = ctx.rule.files.srcs
+    elif ctx.rule.kind == "go_proto_library" or ctx.rule.kind == "go_wrap_cc":
+        srcs = []
+    else:
+        return [NogoInfo()]
+
+    # Construct the Go environment from the go_context.env dictionary.
+    env_prefix = " ".join(["%s=%s" % (key, value) for (key, value) in go_context(ctx).env.items()])
+
+    # Start with all target files and srcs as input.
+    inputs = target.files.to_list() + srcs
+
+    # Generate a shell script that dumps the binary. Annoyingly, this seems
+    # necessary as the context in which a run_shell command runs does not seem
+    # to cleanly allow us redirect stdout to the actual output file. Perhaps
+    # I'm missing something here, but the intermediate script does work.
+    binaries = target.files.to_list()
+    disasm_file = ctx.actions.declare_file(target.label.name + ".out")
+    dumper = ctx.actions.declare_file("%s-dumper" % ctx.label.name)
+    ctx.actions.write(dumper, "\n".join([
+        "#!/bin/bash",
+        "%s %s tool objdump %s > %s\n" % (
+            env_prefix,
+            go_context(ctx).go.path,
+            [f.path for f in binaries if f.path.endswith(".a")][0],
+            disasm_file.path,
+        ),
+    ]), is_executable = True)
+    ctx.actions.run(
+        inputs = binaries,
+        outputs = [disasm_file],
+        tools = go_context(ctx).runfiles,
+        mnemonic = "GoObjdump",
+        progress_message = "Objdump %s" % target.label,
+        executable = dumper,
+    )
+    inputs.append(disasm_file)
+
+    # Extract the importpath for this package.
+    importpath = go_importpath(target)
+
+    # The nogo tool requires a configfile serialized in JSON format to do its
+    # work. This must line up with the nogo.Config fields.
+    facts = ctx.actions.declare_file(target.label.name + ".facts")
+    config = struct(
+        ImportPath = importpath,
+        GoFiles = [src.path for src in srcs if src.path.endswith(".go")],
+        NonGoFiles = [src.path for src in srcs if not src.path.endswith(".go")],
+        GOOS = go_context(ctx).goos,
+        GOARCH = go_context(ctx).goarch,
+        Tags = go_context(ctx).tags,
+        FactMap = {},  # Constructed below.
+        ImportMap = {},  # Constructed below.
+        FactOutput = facts.path,
+        Objdump = disasm_file.path,
+    )
+
+    # Collect all info from shadow dependencies.
+    for dep in ctx.rule.attr.deps:
+        # There will be no file attribute set for all transitive dependencies
+        # that are not go_library or go_binary rules, such as a proto rules.
+        # This is handled by the ctx.rule.kind check above.
+        info = dep[NogoInfo]
+        if not hasattr(info, "facts"):
+            continue
+
+        # Configure where to find the binary & fact files. Note that this will
+        # use .x and .a regardless of whether this is a go_binary rule, since
+        # these dependencies must be go_library rules.
+        x_files = [f.path for f in info.binaries if f.path.endswith(".x")]
+        if not len(x_files):
+            x_files = [f.path for f in info.binaries if f.path.endswith(".a")]
+        config.ImportMap[info.importpath] = x_files[0]
+        config.FactMap[info.importpath] = info.facts.path
+
+        # Ensure the above are available as inputs.
+        inputs.append(info.facts)
+        inputs += info.binaries
+
+    # Write the configuration and run the tool.
+    config_file = ctx.actions.declare_file(target.label.name + ".cfg")
+    ctx.actions.write(config_file, config.to_json())
+    inputs.append(config_file)
+
+    # Run the nogo tool itself.
+    ctx.actions.run(
+        inputs = inputs,
+        outputs = [facts],
+        tools = go_context(ctx).runfiles,
+        executable = ctx.files._nogo[0],
+        mnemonic = "GoStaticAnalysis",
+        progress_message = "Analyzing %s" % target.label,
+        arguments = ["-config=%s" % config_file.path],
+    )
+
+    # Return the package facts as output.
+    return [NogoInfo(
+        facts = facts,
+        importpath = importpath,
+        binaries = binaries,
+    )]
+
+nogo_aspect = go_rule(
+    aspect,
+    implementation = _nogo_aspect_impl,
+    attr_aspects = ["deps"],
+    attrs = {
+        "_nogo": attr.label(
+            default = "//tools/nogo/check:check",
+            allow_single_file = True,
+        ),
+    },
+)
+
+def _nogo_test_impl(ctx):
+    """Check nogo findings."""
+
+    # Build a runner that checks for the existence of the facts file. Note that
+    # the actual build will fail in the case of a broken analysis. We things
+    # this way so that any test applied is effectively pushed down to all
+    # upstream dependencies through the aspect.
+    inputs = []
+    runner = ctx.actions.declare_file("%s-executer" % ctx.label.name)
+    runner_content = ["#!/bin/bash"]
+    for dep in ctx.attr.deps:
+        info = dep[NogoInfo]
+        inputs.append(info.facts)
+
+        # Draw a sweet unicode checkmark with the package name (in green).
+        runner_content.append("echo -e \"\\033[0;32m\\xE2\\x9C\\x94\\033[0;31m\\033[0m %s\"" % info.importpath)
+    runner_content.append("exit 0\n")
+    ctx.actions.write(runner, "\n".join(runner_content), is_executable = True)
+    return [DefaultInfo(
+        runfiles = ctx.runfiles(files = inputs),
+        executable = runner,
+    )]
+
+_nogo_test = rule(
+    implementation = _nogo_test_impl,
+    attrs = {
+        "deps": attr.label_list(aspects = [nogo_aspect]),
+    },
+    test = True,
+)
+
+def nogo_test(**kwargs):
+    tags = kwargs.pop("tags", []) + ["nogo"]
+    _nogo_test(tags = tags, **kwargs)
diff --git a/tools/nogo/io_bazel_rules_go-visibility.patch b/tools/nogo/io_bazel_rules_go-visibility.patch
new file mode 100644
index 000000000..6b64b2e85
--- /dev/null
+++ b/tools/nogo/io_bazel_rules_go-visibility.patch
@@ -0,0 +1,25 @@
+diff --git a/third_party/org_golang_x_tools-extras.patch b/third_party/org_golang_x_tools-extras.patch
+index 133fbccc..5f0d9a47 100644
+--- a/third_party/org_golang_x_tools-extras.patch
++++ b/third_party/org_golang_x_tools-extras.patch
+@@ -32,7 +32,7 @@ diff -urN c/go/analysis/internal/facts/BUILD.bazel d/go/analysis/internal/facts/
+  
+  go_library(
+      name = "go_default_library",
+-@@ -14,6 +14,23 @@
++@@ -14,6 +14,20 @@
+      ],
+  )
+  
+@@ -43,10 +43,7 @@ diff -urN c/go/analysis/internal/facts/BUILD.bazel d/go/analysis/internal/facts/
+ +        "imports.go",
+ +    ],
+ +    importpath = "golang.org/x/tools/go/analysis/internal/facts",
+-+    visibility = [
+-+        "//go/analysis:__subpackages__",
+-+        "@io_bazel_rules_go//go/tools/builders:__pkg__",
+-+    ],
+++    visibility = ["//visibility:public"],
+ +    deps = [
+ +        "//go/analysis:go_tool_library",
+ +        "//go/types/objectpath:go_tool_library",
diff --git a/tools/nogo/matchers.go b/tools/nogo/matchers.go
new file mode 100644
index 000000000..bc5772303
--- /dev/null
+++ b/tools/nogo/matchers.go
@@ -0,0 +1,138 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"go/token"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"golang.org/x/tools/go/analysis"
+)
+
+type matcher interface {
+	ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool
+}
+
+// pathRegexps excludes explicit paths.
+type pathRegexps struct {
+	expr      []*regexp.Regexp
+	whitelist bool
+}
+
+// buildRegexps builds a list of regular expressions.
+//
+// This will panic on error.
+func buildRegexps(prefix string, args ...string) []*regexp.Regexp {
+	result := make([]*regexp.Regexp, 0, len(args))
+	for _, arg := range args {
+		result = append(result, regexp.MustCompile(filepath.Join(prefix, arg)))
+	}
+	return result
+}
+
+// ShouldReport implements matcher.ShouldReport.
+func (p *pathRegexps) ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool {
+	fullPos := fs.Position(d.Pos).String()
+	for _, path := range p.expr {
+		if path.MatchString(fullPos) {
+			return p.whitelist
+		}
+	}
+	return !p.whitelist
+}
+
+// internalExcluded excludes specific internal paths.
+func internalExcluded(paths ...string) *pathRegexps {
+	return &pathRegexps{
+		expr:      buildRegexps(internalPrefix, paths...),
+		whitelist: false,
+	}
+}
+
+// excludedExcluded excludes specific external paths.
+func externalExcluded(paths ...string) *pathRegexps {
+	return &pathRegexps{
+		expr:      buildRegexps(externalPrefix, paths...),
+		whitelist: false,
+	}
+}
+
+// internalMatches returns a path matcher for internal packages.
+func internalMatches() *pathRegexps {
+	return &pathRegexps{
+		expr:      buildRegexps(internalPrefix, ".*"),
+		whitelist: true,
+	}
+}
+
+// resultExcluded excludes explicit message contents.
+type resultExcluded []string
+
+// ShouldReport implements matcher.ShouldReport.
+func (r resultExcluded) ShouldReport(d analysis.Diagnostic, _ *token.FileSet) bool {
+	for _, str := range r {
+		if strings.Contains(d.Message, str) {
+			return false
+		}
+	}
+	return true // Not blacklisted.
+}
+
+// andMatcher is a composite matcher.
+type andMatcher struct {
+	first  matcher
+	second matcher
+}
+
+// ShouldReport implements matcher.ShouldReport.
+func (a *andMatcher) ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool {
+	return a.first.ShouldReport(d, fs) && a.second.ShouldReport(d, fs)
+}
+
+// and is a syntactic convension for andMatcher.
+func and(first matcher, second matcher) *andMatcher {
+	return &andMatcher{
+		first:  first,
+		second: second,
+	}
+}
+
+// anyMatcher matches everything.
+type anyMatcher struct{}
+
+// ShouldReport implements matcher.ShouldReport.
+func (anyMatcher) ShouldReport(analysis.Diagnostic, *token.FileSet) bool {
+	return true
+}
+
+// alwaysMatches returns an anyMatcher instance.
+func alwaysMatches() anyMatcher {
+	return anyMatcher{}
+}
+
+// neverMatcher will never match.
+type neverMatcher struct{}
+
+// ShouldReport implements matcher.ShouldReport.
+func (neverMatcher) ShouldReport(analysis.Diagnostic, *token.FileSet) bool {
+	return false
+}
+
+// disableMatches returns a neverMatcher instance.
+func disableMatches() neverMatcher {
+	return neverMatcher{}
+}
diff --git a/tools/nogo/nogo.go b/tools/nogo/nogo.go
new file mode 100644
index 000000000..203cdf688
--- /dev/null
+++ b/tools/nogo/nogo.go
@@ -0,0 +1,316 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package nogo implements binary analysis similar to bazel's nogo,
+// or the unitchecker package. It exists in order to provide additional
+// facilities for analysis, namely plumbing through the output from
+// dumping the generated binary (to analyze actual produced code).
+package nogo
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"go/ast"
+	"go/build"
+	"go/parser"
+	"go/token"
+	"go/types"
+	"io"
+	"io/ioutil"
+	"log"
+	"os"
+	"path/filepath"
+	"reflect"
+
+	"golang.org/x/tools/go/analysis"
+	"golang.org/x/tools/go/analysis/internal/facts"
+	"golang.org/x/tools/go/gcexportdata"
+	"gvisor.dev/gvisor/tools/nogo/data"
+)
+
+// pkgConfig is serialized as the configuration.
+//
+// This contains everything required for the analysis.
+type pkgConfig struct {
+	ImportPath string
+	GoFiles    []string
+	NonGoFiles []string
+	Tags       []string
+	GOOS       string
+	GOARCH     string
+	ImportMap  map[string]string
+	FactMap    map[string]string
+	FactOutput string
+	Objdump    string
+}
+
+// loadFacts finds and loads facts per FactMap.
+func (c *pkgConfig) loadFacts(path string) ([]byte, error) {
+	realPath, ok := c.FactMap[path]
+	if !ok {
+		return nil, nil // No facts available.
+	}
+
+	// Read the files file.
+	data, err := ioutil.ReadFile(realPath)
+	if err != nil {
+		return nil, err
+	}
+	return data, nil
+}
+
+// shouldInclude indicates whether the file should be included.
+//
+// NOTE: This does only basic parsing of tags.
+func (c *pkgConfig) shouldInclude(path string) (bool, error) {
+	ctx := build.Default
+	ctx.GOOS = c.GOOS
+	ctx.GOARCH = c.GOARCH
+	ctx.BuildTags = c.Tags
+	return ctx.MatchFile(filepath.Dir(path), filepath.Base(path))
+}
+
+// importer is an implementation of go/types.Importer.
+//
+// This wraps a configuration, which provides the map of package names to
+// files, and the facts. Note that this importer implementation will always
+// pass when a given package is not available.
+type importer struct {
+	pkgConfig
+	fset  *token.FileSet
+	cache map[string]*types.Package
+}
+
+// Import implements types.Importer.Import.
+func (i *importer) Import(path string) (*types.Package, error) {
+	if path == "unsafe" {
+		// Special case: go/types has pre-defined type information for
+		// unsafe. We ensure that this package is correct, in case any
+		// analyzers are specifically looking for this.
+		return types.Unsafe, nil
+	}
+	realPath, ok := i.ImportMap[path]
+	var (
+		rc  io.ReadCloser
+		err error
+	)
+	if !ok {
+		// Not found in the import path. Attempt to find the package
+		// via the standard library.
+		rc, err = findStdPkg(path, i.GOOS, i.GOARCH)
+	} else {
+		// Open the file.
+		rc, err = os.Open(realPath)
+	}
+	if err != nil {
+		return nil, err
+	}
+	defer rc.Close()
+
+	// Load all exported data.
+	r, err := gcexportdata.NewReader(rc)
+	if err != nil {
+		return nil, err
+	}
+
+	return gcexportdata.Read(r, i.fset, i.cache, path)
+}
+
+// checkPackage runs all analyzers.
+//
+// The implementation was adapted from [1], which was in turn adpated from [2].
+// This returns a list of matching analysis issues, or an error if the analysis
+// could not be completed.
+//
+// [1] bazelbuid/rules_go/tools/builders/nogo_main.go
+// [2] golang.org/x/tools/go/checker/internal/checker
+func checkPackage(config pkgConfig) ([]string, error) {
+	imp := &importer{
+		pkgConfig: config,
+		fset:      token.NewFileSet(),
+		cache:     make(map[string]*types.Package),
+	}
+
+	// Load all source files.
+	var syntax []*ast.File
+	for _, file := range config.GoFiles {
+		include, err := config.shouldInclude(file)
+		if err != nil {
+			return nil, fmt.Errorf("error evaluating file %q: %v", file, err)
+		}
+		if !include {
+			continue
+		}
+		s, err := parser.ParseFile(imp.fset, file, nil, parser.ParseComments)
+		if err != nil {
+			return nil, fmt.Errorf("error parsing file %q: %v", file, err)
+		}
+		syntax = append(syntax, s)
+	}
+
+	// Check type information.
+	typesSizes := types.SizesFor("gc", config.GOARCH)
+	typeConfig := types.Config{Importer: imp}
+	typesInfo := &types.Info{
+		Types:      make(map[ast.Expr]types.TypeAndValue),
+		Uses:       make(map[*ast.Ident]types.Object),
+		Defs:       make(map[*ast.Ident]types.Object),
+		Implicits:  make(map[ast.Node]types.Object),
+		Scopes:     make(map[ast.Node]*types.Scope),
+		Selections: make(map[*ast.SelectorExpr]*types.Selection),
+	}
+	types, err := typeConfig.Check(config.ImportPath, imp.fset, syntax, typesInfo)
+	if err != nil {
+		return nil, fmt.Errorf("error checking types: %v", err)
+	}
+
+	// Load all package facts.
+	facts, err := facts.Decode(types, config.loadFacts)
+	if err != nil {
+		return nil, fmt.Errorf("error decoding facts: %v", err)
+	}
+
+	// Set the binary global for use.
+	data.Objdump = config.Objdump
+
+	// Register fact types and establish dependencies between analyzers.
+	// The visit closure will execute recursively, and populate results
+	// will all required analysis results.
+	diagnostics := make(map[*analysis.Analyzer][]analysis.Diagnostic)
+	results := make(map[*analysis.Analyzer]interface{})
+	var visit func(*analysis.Analyzer) error // For recursion.
+	visit = func(a *analysis.Analyzer) error {
+		if _, ok := results[a]; ok {
+			return nil
+		}
+
+		// Run recursively for all dependencies.
+		for _, req := range a.Requires {
+			if err := visit(req); err != nil {
+				return err
+			}
+		}
+
+		// Prepare the matcher.
+		m := analyzerConfig[a]
+		report := func(d analysis.Diagnostic) {
+			if m.ShouldReport(d, imp.fset) {
+				diagnostics[a] = append(diagnostics[a], d)
+			}
+		}
+
+		// Run the analysis.
+		factFilter := make(map[reflect.Type]bool)
+		for _, f := range a.FactTypes {
+			factFilter[reflect.TypeOf(f)] = true
+		}
+		p := &analysis.Pass{
+			Analyzer:          a,
+			Fset:              imp.fset,
+			Files:             syntax,
+			Pkg:               types,
+			TypesInfo:         typesInfo,
+			ResultOf:          results, // All results.
+			Report:            report,
+			ImportPackageFact: facts.ImportPackageFact,
+			ExportPackageFact: facts.ExportPackageFact,
+			ImportObjectFact:  facts.ImportObjectFact,
+			ExportObjectFact:  facts.ExportObjectFact,
+			AllPackageFacts:   func() []analysis.PackageFact { return facts.AllPackageFacts(factFilter) },
+			AllObjectFacts:    func() []analysis.ObjectFact { return facts.AllObjectFacts(factFilter) },
+			TypesSizes:        typesSizes,
+		}
+		result, err := a.Run(p)
+		if err != nil {
+			return fmt.Errorf("error running analysis %s: %v", a, err)
+		}
+
+		// Sanity check & save the result.
+		if got, want := reflect.TypeOf(result), a.ResultType; got != want {
+			return fmt.Errorf("error: analyzer %s returned a result of type %v, but declared ResultType %v", a, got, want)
+		}
+		results[a] = result
+		return nil // Success.
+	}
+
+	// Visit all analysis recursively.
+	for a, _ := range analyzerConfig {
+		if err := visit(a); err != nil {
+			return nil, err // Already has context.
+		}
+	}
+
+	// Write the output file.
+	if config.FactOutput != "" {
+		factData := facts.Encode()
+		if err := ioutil.WriteFile(config.FactOutput, factData, 0644); err != nil {
+			return nil, fmt.Errorf("error: unable to open facts output %q: %v", config.FactOutput, err)
+		}
+	}
+
+	// Convert all diagnostics to strings.
+	findings := make([]string, 0, len(diagnostics))
+	for a, ds := range diagnostics {
+		for _, d := range ds {
+			// Include the anlyzer name for debugability and configuration.
+			findings = append(findings, fmt.Sprintf("%s: %s: %s", a.Name, imp.fset.Position(d.Pos), d.Message))
+		}
+	}
+
+	// Return all findings.
+	return findings, nil
+}
+
+var (
+	configFile = flag.String("config", "", "configuration file (in JSON format)")
+)
+
+// Main is the entrypoint; it should be called directly from main.
+//
+// N.B. This package registers it's own flags.
+func Main() {
+	// Parse all flags.
+	flag.Parse()
+
+	// Load the configuration.
+	f, err := os.Open(*configFile)
+	if err != nil {
+		log.Fatalf("unable to open configuration %q: %v", *configFile, err)
+	}
+	defer f.Close()
+	config := new(pkgConfig)
+	dec := json.NewDecoder(f)
+	dec.DisallowUnknownFields()
+	if err := dec.Decode(config); err != nil {
+		log.Fatalf("unable to decode configuration: %v", err)
+	}
+
+	// Process the package.
+	findings, err := checkPackage(*config)
+	if err != nil {
+		log.Fatalf("error checking package: %v", err)
+	}
+
+	// No findings?
+	if len(findings) == 0 {
+		os.Exit(0)
+	}
+
+	// Print findings and exit with non-zero code.
+	for _, finding := range findings {
+		fmt.Fprintf(os.Stdout, "%s\n", finding)
+	}
+	os.Exit(1)
+}
diff --git a/tools/nogo/register.go b/tools/nogo/register.go
new file mode 100644
index 000000000..62b499661
--- /dev/null
+++ b/tools/nogo/register.go
@@ -0,0 +1,64 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nogo
+
+import (
+	"encoding/gob"
+	"log"
+
+	"golang.org/x/tools/go/analysis"
+)
+
+// analyzers returns all configured analyzers.
+func analyzers() (all []*analysis.Analyzer) {
+	for a, _ := range analyzerConfig {
+		all = append(all, a)
+	}
+	return all
+}
+
+func init() {
+	// Validate basic configuration.
+	if err := analysis.Validate(analyzers()); err != nil {
+		log.Fatalf("unable to validate analyzer: %v", err)
+	}
+
+	// Register all fact types.
+	//
+	// N.B. This needs to be done recursively, because there may be
+	// analyzers in the Requires list that do not appear explicitly above.
+	registered := make(map[*analysis.Analyzer]struct{})
+	var register func(*analysis.Analyzer)
+	register = func(a *analysis.Analyzer) {
+		if _, ok := registered[a]; ok {
+			return
+		}
+
+		// Regsiter dependencies.
+		for _, da := range a.Requires {
+			register(da)
+		}
+
+		// Register local facts.
+		for _, f := range a.FactTypes {
+			gob.Register(f)
+		}
+
+		registered[a] = struct{}{} // Done.
+	}
+	for _, a := range analyzers() {
+		register(a)
+	}
+}
diff --git a/tools/tag_release.sh b/tools/tag_release.sh
index f33b902d6..b0bab74b4 100755
--- a/tools/tag_release.sh
+++ b/tools/tag_release.sh
@@ -18,16 +18,28 @@
 # validate a provided release name, create a tag and push it. It must be
 # run manually when a release is created.
 
-set -xeu
+set -xeuo pipefail
 
 # Check arguments.
-if [ "$#" -ne 2 ]; then
-  echo "usage: $0 <commit|revid> <release.rc>"
+if [[ "$#" -ne 3 ]]; then
+  echo "usage: $0 <commit|revid> <release.rc> <message-file>"
   exit 1
 fi
 
 declare -r target_commit="$1"
 declare -r release="$2"
+declare -r message_file="$3"
+
+if [[ -z "${target_commit}" ]]; then
+  echo "error: <commit|revid> is empty."
+fi
+if [[ -z "${release}" ]]; then
+  echo "error: <release.rc> is empty."
+fi
+if ! [[ -r "${message_file}" ]]; then
+  echo "error: message file '${message_file}' is not readable."
+  exit 1
+fi
 
 closest_commit() {
   while read line; do
@@ -62,8 +74,9 @@ if ! [[ "${release}" =~ ^20[0-9]{6}\.[0-9]+$ ]]; then
   exit 1
 fi
 
-# Tag the given commit (annotated, to record the committer).
+# Tag the given commit (annotated, to record the committer). Note that the tag
+# here is applied as a force, in case the tag already exists and is the same.
+# The push will fail in this case (because it is not forced).
 declare -r tag="release-${release}"
-(git tag -m "Release ${release}" -a "${tag}" "${commit}" && \
-  git push origin tag "${tag}") || \
-  (git tag -d "${tag}" && false)
+git tag -f -F "${message_file}" -a "${tag}" "${commit}" && \
+  git push origin tag "${tag}"
diff --git a/tools/images/BUILD b/tools/vm/BUILD
index fe11f08a3..f7160c627 100644
--- a/tools/images/BUILD
+++ b/tools/vm/BUILD
@@ -1,19 +1,14 @@
 load("//tools:defs.bzl", "cc_binary", "gtest")
-load("//tools/images:defs.bzl", "vm_image", "vm_test")
+load("//tools/vm:defs.bzl", "vm_image", "vm_test")
 
 package(
     default_visibility = ["//:sandbox"],
     licenses = ["notice"],
 )
 
-genrule(
+sh_binary(
     name = "zone",
-    outs = ["zone.txt"],
-    cmd = "gcloud config get-value compute/zone > $@",
-    tags = [
-        "local",
-        "manual",
-    ],
+    srcs = ["zone.sh"],
 )
 
 sh_binary(
@@ -42,27 +37,21 @@ vm_image(
     family = "ubuntu-1604-lts",
     project = "ubuntu-os-cloud",
     scripts = [
-        "//tools/images/ubuntu1604",
+        "//tools/vm/ubuntu1604",
     ],
 )
 
-vm_test(
-    name = "ubuntu1604_test",
-    image = ":ubuntu1604",
-    targets = [":test"],
-)
-
 vm_image(
     name = "ubuntu1804",
     family = "ubuntu-1804-lts",
     project = "ubuntu-os-cloud",
     scripts = [
-        "//tools/images/ubuntu1804",
+        "//tools/vm/ubuntu1804",
     ],
 )
 
 vm_test(
-    name = "ubuntu1804_test",
-    image = ":ubuntu1804",
+    name = "vm_test",
+    shard_count = 2,
     targets = [":test"],
 )
diff --git a/tools/vm/README.md b/tools/vm/README.md
new file mode 100644
index 000000000..898c95fca
--- /dev/null
+++ b/tools/vm/README.md
@@ -0,0 +1,42 @@
+# VM Images & Tests
+
+All commands in this directory require the `gcloud` project to be set.
+
+For example: `gcloud config set project gvisor-kokoro-testing`.
+
+Images can be generated by using the `vm_image` rule. This rule will generate a
+binary target that builds an image in an idempotent way, and can be referenced
+from other rules.
+
+For example:
+
+```
+vm_image(
+    name = "ubuntu",
+    project = "ubuntu-1604-lts",
+    family = "ubuntu-os-cloud",
+    scripts = [
+        "script.sh",
+        "other.sh",
+    ],
+)
+```
+
+These images can be built manually by executing the target. The output on
+`stdout` will be the image id (in the current project).
+
+Images are always named per the hash of all the hermetic input scripts. This
+allows images to be memoized quickly and easily.
+
+The `vm_test` rule can be used to execute a command remotely. This is still
+under development however, and will likely change over time.
+
+For example:
+
+```
+vm_test(
+    name = "mycommand",
+    image = ":ubuntu",
+    targets = [":test"],
+)
+```
diff --git a/tools/images/build.sh b/tools/vm/build.sh
index be462d556..752b2b77b 100755
--- a/tools/images/build.sh
+++ b/tools/vm/build.sh
@@ -19,7 +19,7 @@
 # image_setup.sh script. This script should be idempotent, as we memoize the
 # setup script with a hash and check for that name.
 
-set -xeou pipefail
+set -eou pipefail
 
 # Parameters.
 declare -r USERNAME=${USERNAME:-test}
@@ -34,68 +34,84 @@ declare -r INSTANCE_NAME=$(mktemp -u build-XXXXXX | tr A-Z a-z)
 
 # Hash inputs in order to memoize the produced image.
 declare -r SETUP_HASH=$( (echo ${USERNAME} ${IMAGE_PROJECT} ${IMAGE_FAMILY} && cat "$@") | sha256sum - | cut -d' ' -f1 | cut -c 1-16)
-declare -r IMAGE_NAME=${IMAGE_FAMILY:-image-}${SETUP_HASH}
+declare -r IMAGE_NAME=${IMAGE_FAMILY:-image}-${SETUP_HASH}
 
 # Does the image already exist? Skip the build.
-declare -r existing=$(gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
+declare -r existing=$(set -x; gcloud compute images list --filter="name=(${IMAGE_NAME})" --format="value(name)")
 if ! [[ -z "${existing}" ]]; then
   echo "${existing}"
   exit 0
 fi
 
+# Standard arguments (applies only on script execution).
+declare -ar SSH_ARGS=("-o" "ConnectTimeout=60" "--")
+
 # gcloud has path errors; is this a result of being a genrule?
 export PATH=${PATH:-/bin:/usr/bin:/usr/local/bin}
 
 # Start a unique instance. Note that this instance will have a unique persistent
 # disk as it's boot disk with the same name as the instance.
-gcloud compute instances create \
+(set -x; gcloud compute instances create \
     --quiet \
     --image-project "${IMAGE_PROJECT}" \
     --image-family "${IMAGE_FAMILY}" \
     --boot-disk-size "200GB" \
     --zone "${ZONE}" \
-    "${INSTANCE_NAME}" >/dev/null
+    "${INSTANCE_NAME}" >/dev/null)
 function cleanup {
-    gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}"
+  (set -x; gcloud compute instances delete --quiet --zone "${ZONE}" "${INSTANCE_NAME}")
 }
 trap cleanup EXIT
 
 # Wait for the instance to become available (up to 5 minutes).
+echo -n "Waiting for ${INSTANCE_NAME}" >&2
 declare timeout=300
 declare success=0
+declare internal=""
 declare -r start=$(date +%s)
 declare -r end=$((${start}+${timeout}))
 while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
-  if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- env - true 2>/dev/null; then
+  echo -n "." >&2
+  if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
+    success=$((${success}+1))
+  elif gcloud compute ssh --internal-ip --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
     success=$((${success}+1))
+    internal="--internal-ip"
   fi
 done
+
 if [[ "${success}" -eq "0" ]]; then
-  echo "connect timed out after ${timeout} seconds."
+  echo "connect timed out after ${timeout} seconds." >&2
   exit 1
+else
+  echo "done." >&2
 fi
 
 # Run the install scripts provided.
 for arg; do
-  gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- sudo bash - <"${arg}" >/dev/null
+  (set -x; gcloud compute ssh ${internal} \
+      --zone "${ZONE}" \
+      "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      "${SSH_ARGS[@]}" \
+      sudo bash - <"${arg}" >/dev/null)
 done
 
 # Stop the instance; required before creating an image.
-gcloud compute instances stop --quiet --zone "${ZONE}" "${INSTANCE_NAME}" >/dev/null
+(set -x; gcloud compute instances stop --quiet --zone "${ZONE}" "${INSTANCE_NAME}" >/dev/null)
 
 # Create a snapshot of the instance disk.
-gcloud compute disks snapshot \
+(set -x; gcloud compute disks snapshot \
     --quiet \
     --zone "${ZONE}" \
     --snapshot-names="${SNAPSHOT_NAME}" \
-    "${INSTANCE_NAME}" >/dev/null
+    "${INSTANCE_NAME}" >/dev/null)
 
 # Create the disk image.
-gcloud compute images create \
+(set -x; gcloud compute images create \
     --quiet \
     --source-snapshot="${SNAPSHOT_NAME}" \
     --licenses="https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" \
-    "${IMAGE_NAME}" >/dev/null
+    "${IMAGE_NAME}" >/dev/null)
 
 # Finish up.
 echo "${IMAGE_NAME}"
diff --git a/tools/images/defs.bzl b/tools/vm/defs.bzl
index de365d153..0f67cfa92 100644
--- a/tools/images/defs.bzl
+++ b/tools/vm/defs.bzl
@@ -1,96 +1,108 @@
-"""Image configuration.
-
-Images can be generated by using the vm_image rule. For example,
-
-  vm_image(
-      name = "ubuntu",
-      project = "...",
-      family = "...",
-      scripts = [
-          "script.sh",
-          "other.sh",
-      ],
-  )
-
-This will always create an vm_image in the current default gcloud project. The
-rule has a text file as its output containing the image name. This will enforce
-serialization for all dependent rules.
-
-Images are always named per the hash of all the hermetic input scripts. This
-allows images to be memoized quickly and easily.
-
-The vm_test rule can be used to execute a command remotely. For example,
-
-  vm_test(
-      name = "mycommand",
-      image = ":myimage",
-      targets = [":test"],
-  )
-"""
+"""Image configuration. See README.md."""
 
 load("//tools:defs.bzl", "default_installer")
 
-def _vm_image_impl(ctx):
+# vm_image_builder is a rule that will construct a shell script that actually
+# generates a given VM image. Note that this does not _run_ the shell script
+# (although it can be run manually). It will be run manually during generation
+# of the vm_image target itself. This level of indirection is used so that the
+# build system itself only runs the builder once when multiple targets depend
+# on it, avoiding a set of races and conflicts.
+def _vm_image_builder_impl(ctx):
+    # Generate a binary that actually builds the image.
+    builder = ctx.actions.declare_file(ctx.label.name)
     script_paths = []
     for script in ctx.files.scripts:
         script_paths.append(script.short_path)
+    builder_content = "\n".join([
+        "#!/bin/bash",
+        "export ZONE=$(%s)" % ctx.files.zone[0].short_path,
+        "export USERNAME=%s" % ctx.attr.username,
+        "export IMAGE_PROJECT=%s" % ctx.attr.project,
+        "export IMAGE_FAMILY=%s" % ctx.attr.family,
+        "%s %s" % (ctx.files._builder[0].short_path, " ".join(script_paths)),
+        "",
+    ])
+    ctx.actions.write(builder, builder_content, is_executable = True)
 
-    resolved_inputs, argv, runfiles_manifests = ctx.resolve_command(
-        command = "USERNAME=%s ZONE=$(cat %s) IMAGE_PROJECT=%s IMAGE_FAMILY=%s %s %s > %s" %
-                  (
-                      ctx.attr.username,
-                      ctx.files.zone[0].path,
-                      ctx.attr.project,
-                      ctx.attr.family,
-                      ctx.executable.builder.path,
-                      " ".join(script_paths),
-                      ctx.outputs.out.path,
-                  ),
-        tools = [ctx.attr.builder] + ctx.attr.scripts,
-    )
-
-    ctx.actions.run_shell(
-        tools = resolved_inputs,
-        outputs = [ctx.outputs.out],
-        progress_message = "Building image...",
-        execution_requirements = {"local": "true"},
-        command = argv,
-        input_manifests = runfiles_manifests,
-    )
+    # Note that the scripts should only be files, and should not include any
+    # indirect transitive dependencies. The build script wouldn't work.
     return [DefaultInfo(
-        files = depset([ctx.outputs.out]),
-        runfiles = ctx.runfiles(files = [ctx.outputs.out]),
+        executable = builder,
+        runfiles = ctx.runfiles(
+            files = ctx.files.scripts + ctx.files._builder + ctx.files.zone,
+        ),
     )]
 
-_vm_image = rule(
+vm_image_builder = rule(
     attrs = {
-        "builder": attr.label(
+        "_builder": attr.label(
             executable = True,
-            default = "//tools/images:builder",
+            default = "//tools/vm:builder",
             cfg = "host",
         ),
         "username": attr.string(default = "$(whoami)"),
         "zone": attr.label(
-            default = "//tools/images:zone",
+            executable = True,
+            default = "//tools/vm:zone",
             cfg = "host",
         ),
         "family": attr.string(mandatory = True),
         "project": attr.string(mandatory = True),
         "scripts": attr.label_list(allow_files = True),
     },
-    outputs = {
-        "out": "%{name}.txt",
+    executable = True,
+    implementation = _vm_image_builder_impl,
+)
+
+# See vm_image_builder above.
+def _vm_image_impl(ctx):
+    # Run the builder to generate our output.
+    echo = ctx.actions.declare_file(ctx.label.name)
+    resolved_inputs, argv, runfiles_manifests = ctx.resolve_command(
+        command = "echo -ne \"#!/bin/bash\\nset -e\\nimage=$(%s)\\necho ${image}\\n\" > %s && chmod 0755 %s" % (
+            ctx.files.builder[0].path,
+            echo.path,
+            echo.path,
+        ),
+        tools = [ctx.attr.builder],
+    )
+    ctx.actions.run_shell(
+        tools = resolved_inputs,
+        outputs = [echo],
+        progress_message = "Building image...",
+        execution_requirements = {"local": "true"},
+        command = argv,
+        input_manifests = runfiles_manifests,
+    )
+
+    # Return just the echo command. All of the builder runfiles have been
+    # resolved and consumed in the generation of the trivial echo script.
+    return [DefaultInfo(executable = echo)]
+
+_vm_image_test = rule(
+    attrs = {
+        "builder": attr.label(
+            executable = True,
+            cfg = "host",
+        ),
     },
+    test = True,
     implementation = _vm_image_impl,
 )
 
-def vm_image(**kwargs):
-    _vm_image(
+def vm_image(name, **kwargs):
+    vm_image_builder(
+        name = name + "_builder",
+        **kwargs
+    )
+    _vm_image_test(
+        name = name,
+        builder = ":" + name + "_builder",
         tags = [
             "local",
             "manual",
         ],
-        **kwargs
     )
 
 def _vm_test_impl(ctx):
@@ -101,9 +113,9 @@ def _vm_test_impl(ctx):
     # they can be copied over for remote execution.
     runner_content = "\n".join([
         "#!/bin/bash",
-        "export ZONE=$(cat %s)" % ctx.files.zone[0].short_path,
+        "export ZONE=$(%s)" % ctx.files.zone[0].short_path,
         "export USERNAME=%s" % ctx.attr.username,
-        "export IMAGE=$(cat %s)" % ctx.files.image[0].short_path,
+        "export IMAGE=$(%s)" % ctx.files.image[0].short_path,
         "export SUDO=%s" % "true" if ctx.attr.sudo else "false",
         "%s %s" % (
             ctx.executable.executer.short_path,
@@ -133,17 +145,19 @@ def _vm_test_impl(ctx):
 _vm_test = rule(
     attrs = {
         "image": attr.label(
-            mandatory = True,
+            executable = True,
+            default = "//tools/vm:ubuntu1804",
             cfg = "host",
         ),
         "executer": attr.label(
             executable = True,
-            default = "//tools/images:executer",
+            default = "//tools/vm:executer",
             cfg = "host",
         ),
         "username": attr.string(default = "$(whoami)"),
         "zone": attr.label(
-            default = "//tools/images:zone",
+            executable = True,
+            default = "//tools/vm:zone",
             cfg = "host",
         ),
         "sudo": attr.bool(default = True),
@@ -159,7 +173,7 @@ _vm_test = rule(
 )
 
 def vm_test(
-        installer = "//tools/installers:head",
+        installers = None,
         **kwargs):
     """Runs the given targets as a remote test.
 
@@ -168,8 +182,12 @@ def vm_test(
       **kwargs: All test arguments. Should include targets and image.
     """
     targets = kwargs.pop("targets", [])
-    if installer:
-        targets = [installer] + targets
+    if installers == None:
+        installers = [
+            "//tools/installers:head",
+            "//tools/installers:images",
+        ]
+    targets = installers + targets
     if default_installer():
         targets = [default_installer()] + targets
     _vm_test(
diff --git a/tools/images/execute.sh b/tools/vm/execute.sh
index ba4b1ac0e..1f1f3ce01 100755
--- a/tools/images/execute.sh
+++ b/tools/vm/execute.sh
@@ -31,6 +31,9 @@ declare -r MACHINE=${MACHINE:-n1-standard-1}
 declare -r ZONE=${ZONE:-us-central1-f}
 declare -r SUDO=${SUDO:-false}
 
+# Standard arguments (applies only on script execution).
+declare -ar SSH_ARGS=("-o" "ConnectTimeout=60" "--")
+
 # This script is executed as a test rule, which will reset the value of HOME.
 # Unfortunately, it is needed to load the gconfig credentials. We will reset
 # HOME when we actually execute in the remote environment, defined below.
@@ -81,7 +84,9 @@ tar czf - --dereference --exclude=.git . |
     gcloud compute ssh \
         --ssh-key-file="${KEYNAME}" \
         --zone "${ZONE}" \
-        "${USERNAME}"@"${INSTANCE_NAME}" -- tar xzf -
+        "${USERNAME}"@"${INSTANCE_NAME}" -- \
+        "${SSH_ARGS[@]}" \
+        tar xzf -
 
 # Execute the command remotely.
 for cmd; do
@@ -108,6 +113,7 @@ for cmd; do
       --ssh-key-file="${KEYNAME}" \
       --zone "${ZONE}" \
       "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      "${SSH_ARGS[@]}" \
       mkdir -p "/tmp/${REMOTE_TMPDIR}"
   fi
   if [[ -v XML_OUTPUT_FILE ]]; then
@@ -123,6 +129,7 @@ for cmd; do
     --ssh-key-file="${KEYNAME}" \
     --zone "${ZONE}" \
     "${USERNAME}"@"${INSTANCE_NAME}" -- \
+    "${SSH_ARGS[@]}" \
     "${PREFIX[@]}" "${cmd}"
 
   # Collect relevant results.
@@ -147,6 +154,7 @@ for cmd; do
       --ssh-key-file="${KEYNAME}" \
       --zone "${ZONE}" \
       "${USERNAME}"@"${INSTANCE_NAME}" -- \
+      "${SSH_ARGS[@]}" \
       rm -rf "/tmp/${REMOTE_TMPDIR}"
   fi
 done
diff --git a/tools/images/test.cc b/tools/vm/test.cc
index 4f31d93c5..c0ceacda1 100644
--- a/tools/images/test.cc
+++ b/tools/vm/test.cc
@@ -16,8 +16,12 @@
 
 namespace {
 
-TEST(Image, Sanity) {
-  // Do nothing.
+TEST(Image, Sanity0) {
+  // Do nothing (in shard 0).
+}
+
+TEST(Image, Sanity1) {
+  // Do nothing (in shard 1).
 }
 
 }  // namespace
diff --git a/tools/images/ubuntu1604/10_core.sh b/tools/vm/ubuntu1604/10_core.sh
index 46dda6bb1..629f7cf7a 100755
--- a/tools/images/ubuntu1604/10_core.sh
+++ b/tools/vm/ubuntu1604/10_core.sh
@@ -17,7 +17,20 @@
 set -xeo pipefail
 
 # Install all essential build tools.
-apt-get update && apt-get -y install make git-core build-essential linux-headers-$(uname -r) pkg-config
+while true; do
+  if (apt-get update && apt-get install -y \
+      make \
+      git-core \
+      build-essential \
+      linux-headers-$(uname -r) \
+      pkg-config); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # Install a recent go toolchain.
 if ! [[ -d /usr/local/go ]]; then
@@ -27,4 +40,4 @@ if ! [[ -d /usr/local/go ]]; then
 fi
 
 # Link the Go binary from /usr/bin; replacing anything there.
-(cd /usr/bin && rm -f go && sudo ln -fs /usr/local/go/bin/go go)
+(cd /usr/bin && rm -f go && ln -fs /usr/local/go/bin/go go)
diff --git a/tools/vm/ubuntu1604/15_gcloud.sh b/tools/vm/ubuntu1604/15_gcloud.sh
new file mode 100755
index 000000000..bc2e5eccc
--- /dev/null
+++ b/tools/vm/ubuntu1604/15_gcloud.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeo pipefail
+
+# Install all essential build tools.
+while true; do
+  if (apt-get update && apt-get install -y \
+      apt-transport-https \
+      ca-certificates \
+      gnupg); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
+
+# Add gcloud repositories.
+echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | \
+  tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
+
+# Add the appropriate key.
+curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \
+  apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
+
+# Install the gcloud SDK.
+while true; do
+  if (apt-get update && apt-get install -y google-cloud-sdk); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
diff --git a/tools/images/ubuntu1604/20_bazel.sh b/tools/vm/ubuntu1604/20_bazel.sh
index b33e1656c..bb7afa676 100755
--- a/tools/images/ubuntu1604/20_bazel.sh
+++ b/tools/vm/ubuntu1604/20_bazel.sh
@@ -19,7 +19,17 @@ set -xeo pipefail
 declare -r BAZEL_VERSION=2.0.0
 
 # Install bazel dependencies.
-apt-get update && apt-get install -y openjdk-8-jdk-headless unzip
+while true; do
+  if (apt-get update && apt-get install -y \
+      openjdk-8-jdk-headless \
+      unzip); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # Use the release installer.
 curl -L -o bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
diff --git a/tools/images/ubuntu1604/25_docker.sh b/tools/vm/ubuntu1604/25_docker.sh
index 1d3defcd3..11eea2d72 100755
--- a/tools/images/ubuntu1604/25_docker.sh
+++ b/tools/vm/ubuntu1604/25_docker.sh
@@ -15,12 +15,20 @@
 # limitations under the License.
 
 # Add dependencies.
-apt-get update && apt-get -y install \
-    apt-transport-https \
-    ca-certificates \
-    curl \
-    gnupg-agent \
-    software-properties-common
+while true; do
+  if (apt-get update && apt-get install -y \
+      apt-transport-https \
+      ca-certificates \
+      curl \
+      gnupg-agent \
+      software-properties-common); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # Install the key.
 curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
@@ -32,4 +40,15 @@ add-apt-repository \
    stable"
 
 # Install docker.
-apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io
+while true; do
+  if (apt-get update && apt-get install -y \
+      docker-ce \
+      docker-ce-cli \
+      containerd.io); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
diff --git a/tools/images/ubuntu1604/30_containerd.sh b/tools/vm/ubuntu1604/30_containerd.sh
index a7472bd1c..fb3699c12 100755
--- a/tools/images/ubuntu1604/30_containerd.sh
+++ b/tools/vm/ubuntu1604/30_containerd.sh
@@ -34,7 +34,17 @@ install_helper() {
 }
 
 # Install dependencies for the crictl tests.
-apt-get install -y btrfs-tools libseccomp-dev
+while true; do
+  if (apt-get update && apt-get install -y \
+      btrfs-tools \
+      libseccomp-dev); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # Install containerd & cri-tools.
 GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
diff --git a/tools/images/ubuntu1604/40_kokoro.sh b/tools/vm/ubuntu1604/40_kokoro.sh
index 5f2dfc858..2974f156c 100755
--- a/tools/images/ubuntu1604/40_kokoro.sh
+++ b/tools/vm/ubuntu1604/40_kokoro.sh
@@ -23,19 +23,34 @@ declare -r ssh_public_keys=(
 )
 
 # Install dependencies.
-apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip python3-pip zip
+while true; do
+  if (apt-get update && apt-get install -y \
+      rsync \
+      coreutils \
+      python-psutil \
+      qemu-kvm \
+      python-pip \
+      python3-pip \
+      zip); then
+    break
+  fi
+  result=$?
+  if [[ $result -ne 100 ]]; then
+    exit $result
+  fi
+done
 
 # junitparser is used to merge junit xml files.
 pip install junitparser
 
-# We need a kbuilder user.
-if useradd -c "kbuilder user" -m -s /bin/bash kbuilder; then
-    # User was added successfully; we add the relevant SSH keys here.
-    mkdir -p ~kbuilder/.ssh
-    (IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys
-    chmod 0600 ~kbuilder/.ssh/authorized_keys
-    chown -R kbuilder ~kbuilder/.ssh
-fi
+# We need a kbuilder user, which may already exist.
+useradd -c "kbuilder user" -m -s /bin/bash kbuilder || true
+
+# We need to provision appropriate keys.
+mkdir -p ~kbuilder/.ssh
+(IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys
+chmod 0600 ~kbuilder/.ssh/authorized_keys
+chown -R kbuilder ~kbuilder/.ssh
 
 # Give passwordless sudo access.
 cat > /etc/sudoers.d/kokoro <<EOF
diff --git a/tools/images/ubuntu1604/BUILD b/tools/vm/ubuntu1604/BUILD
index ab1df0c4c..ab1df0c4c 100644
--- a/tools/images/ubuntu1604/BUILD
+++ b/tools/vm/ubuntu1604/BUILD
diff --git a/tools/images/ubuntu1804/BUILD b/tools/vm/ubuntu1804/BUILD
index 7aa1ecdf7..0c8856dde 100644
--- a/tools/images/ubuntu1804/BUILD
+++ b/tools/vm/ubuntu1804/BUILD
@@ -2,6 +2,6 @@ package(licenses = ["notice"])
 
 alias(
     name = "ubuntu1804",
-    actual = "//tools/images/ubuntu1604",
+    actual = "//tools/vm/ubuntu1604",
     visibility = ["//:sandbox"],
 )
diff --git a/tools/vm/zone.sh b/tools/vm/zone.sh
new file mode 100755
index 000000000..79569fb19
--- /dev/null
+++ b/tools/vm/zone.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Copyright 2020 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exec gcloud config get-value compute/zone
diff --git a/vdso/BUILD b/vdso/BUILD
index d37d4266d..c70bb8218 100644
--- a/vdso/BUILD
+++ b/vdso/BUILD
@@ -3,7 +3,7 @@
 #   normal system VDSO (time, gettimeofday, clock_gettimeofday) but which uses
 #   timekeeping parameters managed by the sandbox kernel.
 
-load("//tools:defs.bzl", "cc_flags_supplier", "cc_toolchain", "select_arch")
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_toolchain", "select_arch", "vdso_linker_option")
 
 package(licenses = ["notice"])
 
@@ -33,7 +33,7 @@ genrule(
           # Some toolchains enable stack protector by default. Disable it, the
           # VDSO has no hooks to handle failures.
           "-fno-stack-protector " +
-          "-fuse-ld=gold " +
+          vdso_linker_option +
           select_arch(
               amd64 = "-m64 ",
               arm64 = "",
diff --git a/vdso/syscalls.h b/vdso/syscalls.h
index b6d15a7d3..0c6a922a0 100644
--- a/vdso/syscalls.h
+++ b/vdso/syscalls.h
@@ -26,6 +26,9 @@
 #include <stddef.h>
 #include <sys/types.h>
 
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
 namespace vdso {
 
 #if __x86_64__
@@ -51,20 +54,13 @@ static inline int sys_getcpu(unsigned* cpu, unsigned* node,
   return num;
 }
 
-#elif __aarch64__
-
-static inline int sys_rt_sigreturn(void) {
-  int num = __NR_rt_sigreturn;
-
-  asm volatile(
-      "mov x8, %0\n"
-      "svc #0    \n"
-      : "+r"(num)
-      :
-      :);
-  return num;
+static inline void sys_rt_sigreturn(void) {
+  asm volatile("movl $" __stringify(__NR_rt_sigreturn)", %eax \n"
+               "syscall \n");
 }
 
+#elif __aarch64__
+
 static inline int sys_clock_gettime(clockid_t _clkid, struct timespec* _ts) {
   register struct timespec* ts asm("x1") = _ts;
   register clockid_t clkid asm("x0") = _clkid;
@@ -91,6 +87,11 @@ static inline int sys_clock_getres(clockid_t _clkid, struct timespec* _ts) {
   return ret;
 }
 
+static inline void sys_rt_sigreturn(void) {
+  asm volatile("mov x8, #" __stringify(__NR_rt_sigreturn)" \n"
+               "svc #0 \n");
+}
+
 #else
 #error "unsupported architecture"
 #endif
diff --git a/vdso/vdso.cc b/vdso/vdso.cc
index 8bb80a7a4..3b6653b5d 100644
--- a/vdso/vdso.cc
+++ b/vdso/vdso.cc
@@ -69,6 +69,12 @@ int __common_gettimeofday(struct timeval* tv, struct timezone* tz) {
 }
 }  // namespace
 
+// __kernel_rt_sigreturn() implements rt_sigreturn()
+extern "C" void __kernel_rt_sigreturn(unsigned long unused) {
+  // No optimizations yet, just make the real system call.
+  sys_rt_sigreturn();
+}
+
 #if __x86_64__
 
 // __vdso_clock_gettime() implements clock_gettime()
@@ -126,6 +132,10 @@ extern "C" int __kernel_clock_getres(clockid_t clock, struct timespec* res) {
     case CLOCK_REALTIME:
     case CLOCK_MONOTONIC:
     case CLOCK_BOOTTIME: {
+      if (res == nullptr) {
+        return 0;
+      }
+
       res->tv_sec = 0;
       res->tv_nsec = 1;
       break;
@@ -139,12 +149,6 @@ extern "C" int __kernel_clock_getres(clockid_t clock, struct timespec* res) {
   return ret;
 }
 
-// __kernel_rt_sigreturn() implements gettimeofday()
-extern "C" int __kernel_rt_sigreturn(unsigned long unused) {
-  // No optimizations yet, just make the real system call.
-  return sys_rt_sigreturn();
-}
-
 #else
 #error "unsupported architecture"
 #endif
diff --git a/vdso/vdso_amd64.lds b/vdso/vdso_amd64.lds
index e2615ae9e..d114290da 100644
--- a/vdso/vdso_amd64.lds
+++ b/vdso/vdso_amd64.lds
@@ -95,6 +95,7 @@ VERSION {
     __vdso_getcpu;
     time;
     __vdso_time;
+    __kernel_rt_sigreturn;
 
   local: *;
   };
diff --git a/website/BUILD b/website/BUILD
new file mode 100644
index 000000000..c97b2560b
--- /dev/null
+++ b/website/BUILD
@@ -0,0 +1,180 @@
+load("//tools:defs.bzl", "pkg_tar")
+load("//website:defs.bzl", "doc", "docs")
+
+package(licenses = ["notice"])
+
+# website is the full container image. Note that this actually just collects
+# other dependendcies and runs Docker locally to import and tag the image.
+sh_binary(
+    name = "website",
+    srcs = ["import.sh"],
+    data = [":files"],
+    tags = [
+        "local",
+        "manual",
+    ],
+)
+
+# files is the full file system of the generated container.
+#
+# It must collect the all tarballs (produced by the rules below), and run it
+# through the Dockerfile to generate the site. Note that this checks all links,
+# and therefore requires all static content to be present as well.
+#
+# Note that this rule violates most aspects of hermetic builds. However, this
+# works much more reliably than depending on the container_image rules from
+# bazel itself, which are convoluted and seem to have a hard time even finding
+# the toolchain.
+genrule(
+    name = "files",
+    srcs = [
+        ":config",
+        ":css",
+        ":docs",
+        ":static",
+        ":syscallmd",
+        "//website/blog:posts",
+        "//website/cmd/server",
+    ],
+    outs = ["files.tgz"],
+    cmd = "set -x; " +
+          "T=$$(mktemp -d); " +
+          "mkdir -p $$T/input && " +
+          "mkdir -p $$T/output/_site && " +
+          "tar -xf $(location :config)    -C $$T/input && " +
+          "tar -xf $(location :css)       -C $$T/input && " +
+          "tar -xf $(location :docs)      -C $$T/input && " +
+          "tar -xf $(location :syscallmd) -C $$T/input && " +
+          "tar -xf $(location //website/blog:posts) -C $$T/input && " +
+          "find $$T/input -type f -exec chmod u+rw {} \\; && " +
+          "docker run -i --user $$(id -u):$$(id -g) " +
+          "-v $$(readlink -m $$T/input):/input " +
+          "-v $$(readlink -m $$T/output/_site):/output " +
+          "gvisor.dev/images/jekyll && " +
+          "tar -xf $(location :static) -C $$T/output/_site && " +
+          "docker run -i --user $$(id -u):$$(id -g) " +
+          "-v $$(readlink -m $$T/output/_site):/output " +
+          "gvisor.dev/images/jekyll " +
+          "/usr/gem/bin/htmlproofer " +
+          "--disable-external " +
+          "--check-html " +
+          "/output && " +
+          "cp $(location //website/cmd/server) $$T/output/server && " +
+          "tar -zcf $@ -C $$T/output . && " +
+          "rm -rf $$T",
+    tags = [
+        "local",
+        "manual",
+        "nosandbox",
+    ],
+)
+
+# static are the purely static parts of the site. These are effectively copied
+# in after jekyll generates all the dynamic content.
+pkg_tar(
+    name = "static",
+    srcs = [
+        "archive.key",
+    ] + glob([
+        "performance/**",
+    ]),
+    strip_prefix = "./",
+)
+
+# main.scss requires front-matter to be processed.
+genrule(
+    name = "css",
+    srcs = glob([
+        "css/**",
+    ]),
+    outs = [
+        "css.tar",
+    ],
+    cmd = "T=$$(mktemp -d); " +
+          "mkdir -p $$T/css && " +
+          "for file in $(SRCS); do " +
+          "echo -en '---\\n---\\n' > $$T/css/$$(basename $$file) && " +
+          "cat $$file >> $$T/css/$$(basename $$file); " +
+          "done && " +
+          "tar -C $$T -czf $@ . && " +
+          "rm -rf $$T",
+)
+
+# config is "mostly" static content. These are parts of the site that are
+# present when jekyll runs, but are not dynamically generated.
+pkg_tar(
+    name = "config",
+    srcs = [
+        ":css",
+        "_config.yml",
+        "//website/blog:index.html",
+    ] + glob([
+        "assets/**",
+        "_includes/**",
+        "_layouts/**",
+        "_plugins/**",
+        "_sass/**",
+    ]),
+    strip_prefix = "./",
+)
+
+# index is the index file.
+doc(
+    name = "index",
+    src = "index.md",
+    layout = "base",
+    permalink = "/",
+)
+
+# docs is the dynamic content of the site.
+docs(
+    name = "docs",
+    deps = [
+        ":index",
+        "//:code_of_conduct",
+        "//:contributing",
+        "//:governance",
+        "//:security",
+        "//g3doc:community",
+        "//g3doc:index",
+        "//g3doc:roadmap",
+        "//g3doc/architecture_guide:performance",
+        "//g3doc/architecture_guide:platforms",
+        "//g3doc/architecture_guide:resources",
+        "//g3doc/architecture_guide:security",
+        "//g3doc/user_guide:FAQ",
+        "//g3doc/user_guide:checkpoint_restore",
+        "//g3doc/user_guide:compatibility",
+        "//g3doc/user_guide:debugging",
+        "//g3doc/user_guide:filesystem",
+        "//g3doc/user_guide:install",
+        "//g3doc/user_guide:networking",
+        "//g3doc/user_guide:platforms",
+        "//g3doc/user_guide/quick_start:docker",
+        "//g3doc/user_guide/quick_start:kubernetes",
+        "//g3doc/user_guide/quick_start:oci",
+        "//g3doc/user_guide/tutorials:cni",
+        "//g3doc/user_guide/tutorials:docker",
+        "//g3doc/user_guide/tutorials:kubernetes",
+    ],
+)
+
+# Generate JSON for system call tables
+genrule(
+    name = "syscalljson",
+    outs = ["syscalls.json"],
+    cmd = "$(location //runsc) -- help syscalls -format json -filename $@",
+    tools = ["//runsc"],
+)
+
+# Generate markdown from the json dump.
+genrule(
+    name = "syscallmd",
+    srcs = [":syscalljson"],
+    outs = ["syscallsmd"],
+    cmd = "T=$$(mktemp -d) && " +
+          "$(location //website/cmd/syscalldocs) -in $< -out $$T && " +
+          "tar -C $$T -czf $@ . && " +
+          "rm -rf $$T",
+    tools = ["//website/cmd/syscalldocs"],
+)
diff --git a/website/_config.yml b/website/_config.yml
new file mode 100644
index 000000000..b08602970
--- /dev/null
+++ b/website/_config.yml
@@ -0,0 +1,36 @@
+destination: _site
+markdown: kramdown
+kramdown:
+  syntax_highlighter: rouge
+  toc_levels: "2,3"
+highlighter: rouge
+paginate: 5
+paginate_path: "/blog/page:num/"
+plugins:
+  - jekyll-paginate
+  - jekyll-autoprefixer
+  - jekyll-inline-svg
+  - jekyll-relative-links
+  - jekyll-feed
+  - jekyll-sitemap
+site_url: https://gvisor.dev
+feed:
+  path: blog/index.xml
+svg:
+  optimize: true
+defaults:
+  - scope:
+      path: ""
+    values:
+      layout: default
+analytics: "UA-150193582-1"
+authors:
+  jsprad:
+    name: Jeremiah Spradlin
+    email: jsprad@google.com
+  zkoopmans:
+    name: Zach Koopmans
+    email: zkoopmans@google.com
+  igudger:
+    name: Ian Gudger
+    email: igudger@google.com
diff --git a/website/_includes/byline.html b/website/_includes/byline.html
new file mode 100644
index 000000000..d8ae22cb0
--- /dev/null
+++ b/website/_includes/byline.html
@@ -0,0 +1,18 @@
+By
+{% assign last_pos=include.authors.size | minus: 1 %}
+{% assign and_pos=include.authors.size | minus: 2 %}
+{% for i in (0..last_pos) %}
+  {% assign author_id=include.authors[i] %}
+  {% assign author=site.authors[author_id] %}
+  {% if author %}
+    <a href="mailto:{{ author.email }}">{{ author.name }}</a>
+  {% else %}
+    {{ author_id }}
+  {% endif %}
+  {% if i == and_pos %}
+    and
+  {% elsif i < and_pos %}
+    ,
+  {% endif %}
+{% endfor %}
+on <span class="text-muted">{{ include.date | date_to_long_string }}</span>
diff --git a/website/_includes/footer-links.html b/website/_includes/footer-links.html
new file mode 100644
index 000000000..10c28ead4
--- /dev/null
+++ b/website/_includes/footer-links.html
@@ -0,0 +1,43 @@
+<div class="container">
+  <div class="row">
+    <div class="col-sm-3 col-md-2">
+      <p>About</p>
+      <ul class="list-unstyled">
+        <li><a href="/roadmap/">Roadmap</a></li>
+        <li><a href="/contributing/">Contributing</a></li>
+        <li><a href="/security/">Security</a></li>
+        <li><a href="/community/governance/">Governance</a></li>
+        <li><a href="https://policies.google.com/privacy">Privacy Policy</a></li>
+      </ul>
+    </div>
+    <div class="col-sm-3 col-md-2">
+      <p>Support</p>
+      <ul class="list-unstyled">
+        <li><a href="https://github.com/google/gvisor/issues">Issues</a></li>
+        <li><a href="/docs">Documentation</a></li>
+        <li><a href="/docs/user_guide/FAQ">FAQ</a></li>
+      </ul>
+    </div>
+    <div class="col-sm-3 col-md-2">
+      <p>Connect</p>
+      <ul class="list-unstyled">
+        <li><a href="https://github.com/google/gvisor">GitHub</a></li>
+        <li><a href="https://groups.google.com/forum/#!forum/gvisor-users">User Mailing List</a></li>
+        <li><a href="https://groups.google.com/forum/#!forum/gvisor-dev">Developer Mailing List</a></li>
+        <li><a href="https://gitter.im/gvisor/community">Gitter Chat</a></li>
+        <li><a href="/blog">Blog</a></li>
+      </ul>
+    </div>
+    <div class="col-sm-3 col-md-3"></div>
+    <div class="hidden-xs hidden-sm col-md-3">
+      <a href="https://cloud.google.com/run">
+        <img style="float: right;" src="/assets/logos/powered-gvisor.png" alt="Powered by gVisor"/>
+      </a>
+    </div>
+  </div>
+  <div class="row">
+    <div class="col-lg-12">
+      <p>&copy; {{ 'now' | date: "%Y" }} The gVisor Authors</p>
+    </div>
+  </div>
+</div>
diff --git a/website/_includes/footer.html b/website/_includes/footer.html
new file mode 100644
index 000000000..9cc8176f7
--- /dev/null
+++ b/website/_includes/footer.html
@@ -0,0 +1,72 @@
+<footer class="footer">
+  {% include footer-links.html %}
+</footer>
+
+<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.10.1/js/all.min.js" integrity="sha256-Z1Nvg/+y2+vRFhFgFij7Lv0r77yG3hOvWz2wI0SfTa0=" crossorigin="anonymous"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha256-U5ZEeKfGNOja007MMD3YBI0A3OSZOQbeG6z2f2Y0hu8=" crossorigin="anonymous"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/4.13.0/d3.min.js" integrity="sha256-hYXbQJK4qdJiAeDVjjQ9G0D6A0xLnDQ4eJI9dkm7Fpk=" crossorigin="anonymous"></script>
+
+{% if site.analytics %}
+<script type="application/javascript">
+var doNotTrack = false;
+if (!doNotTrack) {
+  window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;
+  ga('create', '{{ site.analytics }}', 'auto');
+  ga('send', 'pageview');
+}
+</script>
+<script async src='https://www.google-analytics.com/analytics.js'></script>
+{% endif %}
+
+<script>
+  var shiftWindow = function() {
+    if (location.hash.length !== 0) {
+      window.scrollBy(0, -50);
+    }
+  };
+  window.addEventListener("hashchange", shiftWindow);
+
+  var highlightCurrentSidebarNav = function() {
+    var href = location.pathname;
+    var item = $('#sidebar-nav [href$="' + href + '"]');
+    if (item) {
+      var li = item.parent();
+      li.addClass("active");
+
+      if (li.parent() && li.parent().is("ul")) {
+        do {
+          var ul = li.parent();
+          if (ul.hasClass("collapse")) {
+            ul.collapse("show");
+          }
+          li = ul.parent();
+        } while (li && li.is("li"));
+      }
+    }
+  };
+
+  $(document).ready(function() {
+    // Scroll to anchor of location hash, adjusted for fixed navbar.
+    window.setTimeout(function() {
+      shiftWindow();
+    }, 1);
+
+    // Flip the caret when submenu toggles are clicked.
+    $(".sidebar-submenu").on("show.bs.collapse", function() {
+      var toggle = $('[href$="#' + $(this).attr('id') + '"]');
+      if (toggle) {
+        toggle.addClass("dropup");
+      }
+    });
+    $(".sidebar-submenu").on("hide.bs.collapse", function() {
+      var toggle = $('[href$="#' + $(this).attr('id') + '"]');
+      if (toggle) {
+        toggle.removeClass("dropup");
+      }
+    });
+
+    // Highlight the current page on the sidebar nav.
+    highlightCurrentSidebarNav();
+  });
+</script>
diff --git a/website/_includes/graph.html b/website/_includes/graph.html
new file mode 100644
index 000000000..f3a999341
--- /dev/null
+++ b/website/_includes/graph.html
@@ -0,0 +1,205 @@
+{::nomarkdown}
+{% assign fn = include.id | remove: " " | remove: "-" | downcase %}
+<figure><a href="{{ include.url }}"><svg id="{{ include.id }}" width=500 height=200 onload="render_{{ fn }}()"><title>{{ include.title }}</title></svg></a></figure>
+<script type="text/javascript">
+function render_{{ fn }}() {
+d3.csv("{{ include.url }}", function(d, i, columns) {
+    return d; // Transformed below.
+}, function(error, data) {
+    if (error) throw(error);
+
+    // Create a new data that pivots on runtime.
+    //
+    // To start, we have:
+    //    runtime, ..., result
+    //    runc,    ..., 1
+    //    runsc,   ..., 2
+    //
+    // In the end we want:
+    //    ..., runsc, runc
+    //    ..., 1,     2
+
+    // Filter by metric, if required.
+    if ("{{ include.metric }}" != "") {
+      orig_columns = data.columns;
+      data = data.filter(d => d.metric == "{{ include.metric }}");
+      data.columns = orig_columns;
+    }
+
+    // Filter by method, if required.
+    if ("{{ include.method }}" != "") {
+      orig_columns = data.columns;
+      data = data.filter(d => d.method == "{{ include.method }}");
+      data.columns = orig_columns.filter(key => key != "method");
+    }
+
+    // Enumerate runtimes.
+    var runtimes = Array.from(new Set(data.map(d => d.runtime)));
+    var metrics = Array.from(new Set(data.map(d => d.metric)));
+    if (metrics.length < 1) {
+        console.log(data);
+        throw("need at least one metric");
+    } else if (metrics.length == 1) {
+        metric = metrics[0];
+        data.columns = data.columns.filter(key => key != "metric");
+    } else {
+        metric = ""; // Used for grouping.
+    }
+
+    var isSubset = function(a, sup) {
+        var ap = Object.getOwnPropertyNames(a);
+        for (var i = 0; i < ap.length; i++) {
+            if (a[ap[i]] !== sup[ap[i]]) {
+                return false;
+            }
+        }
+        return true;
+    };
+
+    // Execute a pivot to include runtimes as attributes.
+    var new_data = data.map(function(data_item) {
+        // Generate a prototype data item.
+        var proto_item = Object.assign({}, data_item);
+        delete proto_item.runtime;
+        delete proto_item.result;
+        var next_item = Object.assign({}, proto_item);
+
+        // Find all matching runtime items.
+        data.forEach(function(d) {
+            if (isSubset(proto_item, d)) {
+                // Add the result result.
+                next_item[d.runtime] = d.result;
+            }
+        });
+        return next_item;
+    });
+
+    // Remove any duplication.
+    new_data = Array.from(new Set(new_data));
+    new_data.columns = data.columns;
+    new_data.columns = new_data.columns.filter(key => key != "runtime" && key != "result");
+    new_data.columns = new_data.columns.concat(runtimes);
+    data = new_data;
+
+    // Slice based on the first key.
+    if (data.columns.length != runtimes.length) {
+        x0_key = new_data.columns[0];
+        var x1_domain = data.columns.slice(1);
+    } else {
+        x0_key = "runtime";
+        var x1_domain = runtimes;
+    }
+
+    // Determine varaible margins.
+    var x0_domain = data.map(d => d[x0_key]);
+    var margin_bottom_pad = 0;
+    if (x0_domain.length > 8) {
+        margin_bottom_pad = 50;
+    }
+
+    // Use log scale if required.
+    var y_min = 0;
+    if ({{ include.log | default: "false" }}) {
+        // Need to cap lower end of the domain at 1.
+        y_min = 1;
+    }
+
+    if ({{ include.y_min | default: "false" }}) {
+        y_min = "{{ include.y_min }}";
+    }
+
+    var svg = d3.select("#{{ include.id }}"),
+        margin = {top: 20, right: 20, bottom: 30 + margin_bottom_pad, left: 50},
+        width = +svg.attr("width") - margin.left - margin.right,
+        height = +svg.attr("height") - margin.top - margin.bottom,
+        g = svg.append("g").attr("transform", "translate(" + margin.left + "," + margin.top + ")");
+
+    var x0 = d3.scaleBand()
+        .rangeRound([margin.left / 2, width - (4 * margin.right)])
+        .paddingInner(0.1);
+
+    var x1 = d3.scaleBand()
+        .padding(0.05);
+
+    var y = d3.scaleLinear()
+        .rangeRound([height, 0]);
+    if ({{ include.log | default: "false" }}) {
+        y = d3.scaleLog()
+           .rangeRound([height, 0]);
+    }
+
+    var z = d3.scaleOrdinal()
+        .range(["#262362", "#FBB03B", "#286FD7", "#6b486b"]);
+
+    // Set all domains.
+    x0.domain(x0_domain);
+    x1.domain(x1_domain).rangeRound([0, x0.bandwidth()]);
+    y.domain([y_min, d3.max(data, d => d3.max(x1_domain, key => parseFloat(d[key])))]).nice();
+
+    // The data.
+    g.append("g")
+        .selectAll("g")
+        .data(data)
+        .enter().append("g")
+          .attr("transform", function(d) { return "translate(" + x0(d[x0_key]) + ",0)"; })
+        .selectAll("rect")
+        .data(d => x1_domain.map(key => ({key, value: d[key]})))
+        .enter().append("rect")
+          .attr("x", d => x1(d.key))
+          .attr("y", d => y(d.value))
+          .attr("width", x1.bandwidth())
+          .attr("height", d => y(y_min) - y(d.value))
+          .attr("fill", d => z(d.key));
+
+    // X0 ticks and labels.
+    var x0_axis = g.append("g")
+        .attr("class", "axis")
+        .attr("transform", "translate(0," + height + ")")
+        .call(d3.axisBottom(x0));
+    if (x0_domain.length > 8) {
+        x0_axis.selectAll("text")
+            .style("text-anchor", "end")
+            .attr("dx", "-.8em")
+            .attr("dy", ".15em")
+            .attr("transform", "rotate(-65)");
+    }
+
+    // Y ticks and top-label.
+    if (metric == "default") {
+        metric = ""; // Don't display.
+    }
+    g.append("g")
+        .attr("class", "axis")
+        .call(d3.axisLeft(y).ticks(null, "s"))
+        .append("text")
+        .attr("x", -30.0)
+        .attr("y", y(y.ticks().pop()) - 10.0)
+        .attr("dy", "0.32em")
+        .attr("fill", "#000")
+        .attr("font-weight", "bold")
+        .attr("text-anchor", "start")
+        .text(metric);
+
+    // The legend.
+    var legend = g.append("g")
+        .attr("font-family", "sans-serif")
+        .attr("font-size", 10)
+        .attr("text-anchor", "end")
+        .selectAll("g")
+        .data(x1_domain.slice().reverse())
+        .enter().append("g")
+        .attr("transform", function(d, i) { return "translate(0," + i * 20 + ")"; });
+    legend.append("rect")
+        .attr("x", width - 19)
+        .attr("width", 19)
+        .attr("height", 19)
+        .attr("fill", z);
+    legend.append("text")
+        .attr("x", width - 24)
+        .attr("y", 9.5)
+        .attr("dy", "0.32em")
+        .text(function(d) { return d; });
+});
+}
+</script>
+{:/}
diff --git a/website/_includes/header-links.html b/website/_includes/header-links.html
new file mode 100644
index 000000000..467bb1e72
--- /dev/null
+++ b/website/_includes/header-links.html
@@ -0,0 +1,19 @@
+<nav class="navbar navbar-expand-sm navbar-inverse navbar-fixed-top">
+  <div class="container">
+    <div class="navbar-brand">
+      <a href="/">
+        <img src="/assets/logos/logo_solo_on_dark.svg" height="25px" class="d-inline-block align-top" style="margin-right: 10px;" alt="logo"/>
+        gVisor
+      </a>
+    </div>
+
+    <div class="collapse navbar-collapse">
+      <ul class="nav navbar-nav navbar-right">
+        <li><a href="/docs">Documentation</a></li>
+        <li><a href="/blog">Blog</a></li>
+        <li><a href="/community/">Community</a></li>
+        <li><a href="https://github.com/google/gvisor">GitHub</a></li>
+      </ul>
+    </div>
+  </div>
+</nav>
diff --git a/website/_includes/header.html b/website/_includes/header.html
new file mode 100644
index 000000000..c80310069
--- /dev/null
+++ b/website/_includes/header.html
@@ -0,0 +1,30 @@
+  <head>
+    <meta charset="utf-8">
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    {% if page.title %}
+    <title>{{ page.title }} - gVisor</title>
+    {% else %}
+    <title>gVisor</title>
+    {% endif %}
+    <link rel="canonical" href="{{ page.url | replace:'index.html','' | prepend: site_root }}">
+
+    <!-- Dependencies. -->
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha256-916EbMg70RQy9LHiGkXzG8hSg9EdNy97GazNG/aiY1w=" crossorigin="anonymous" />
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.10.1/css/all.min.css" integrity="sha256-fdcFNFiBMrNfWL6OcAGQz6jDgNTRxnrLEd4vJYFWScE=" crossorigin="anonymous" />
+
+    <!-- Our own style sheet. -->
+    <link rel="stylesheet" type="text/css" href="/css/main.css">
+    <link rel="icon" type="image/png" href="/assets/favicons/favicon-32x32.png" sizes="32x32">
+    <link rel="icon" type="image/png" href="/assets/favicons/favicon-16x16.png" sizes="16x16">
+
+    {% if page.title %}
+    <meta name="og:title" content="{{ page.title }}">
+    {% else %}
+    <meta name="og:title" content="gVisor">
+    {% endif %}
+    {% if page.description %}
+    <meta name="og:description" content="{{ page.description }}">
+    {% endif %}
+    <meta name="og:image" content="{{ site.site_url }}/assets/logos/logo_solo_on_white_bordered.svg">
+  </head>
diff --git a/website/_includes/paginator.html b/website/_includes/paginator.html
new file mode 100644
index 000000000..b4ff4c3b1
--- /dev/null
+++ b/website/_includes/paginator.html
@@ -0,0 +1,10 @@
+<nav aria-label="...">
+  <ul class="pager">
+    {% if paginator.previous_page %}
+    <li class="previous"><a href="{{ paginator.previous_page_path }}"><span aria-hidden="true">&larr;</span> Newer</a></li>
+    {% endif %}
+    {% if paginator.next_page %}
+    <li class="next"><a href="{{ paginator.next_page_path }}">Older <span aria-hidden="true">&rarr;</span></a></li>
+    {% endif %}
+  </ul>
+</nav>
diff --git a/website/_includes/required_linux.html b/website/_includes/required_linux.html
new file mode 100644
index 000000000..e9d1b7548
--- /dev/null
+++ b/website/_includes/required_linux.html
@@ -0,0 +1,2 @@
+> Note: gVisor supports only x86\_64 and requires Linux 4.14.77+
+> ([older Linux](/docs/user_guide/networking/#gso)).
diff --git a/website/_layouts/base.html b/website/_layouts/base.html
new file mode 100644
index 000000000..b30bee0dc
--- /dev/null
+++ b/website/_layouts/base.html
@@ -0,0 +1,9 @@
+<!DOCTYPE html>
+<html lang="en" itemscope itemtype="https://schema.org/WebPage">
+  {% include header.html %}
+  <body>
+    {% include header-links.html %}
+    {{ content }}
+    {% include footer.html %}
+  </body>
+</html>
diff --git a/website/_layouts/blog.html b/website/_layouts/blog.html
new file mode 100644
index 000000000..6c371ab50
--- /dev/null
+++ b/website/_layouts/blog.html
@@ -0,0 +1,17 @@
+---
+layout: base
+---
+
+<div class="container">
+  <div class="row">
+    <div class="col-lg-2"></div>
+    <div class="col-lg-8">
+      <h1>{{ page.title }}</h1>
+      {% if page.feed %}
+      <a class="btn-inverse" href="/blog/index.xml">Feed&nbsp;<i class="fas fa-rss ml-2"></i></a>
+      {% endif %}
+      {{ content }}
+    </div>
+    <div class="col-lg-2"></div>
+  </div>
+</div>
diff --git a/website/_layouts/default.html b/website/_layouts/default.html
new file mode 100644
index 000000000..e5523e3fc
--- /dev/null
+++ b/website/_layouts/default.html
@@ -0,0 +1,14 @@
+---
+layout: base
+---
+{% if page.title %}
+<div class="container">
+  <div class="page-header">
+    <h1>{{ page.title }}</h1>
+  </div>
+</div>
+{% endif %}
+
+<div class="container">
+  {{ content }}
+</div>
diff --git a/website/_layouts/docs.html b/website/_layouts/docs.html
new file mode 100644
index 000000000..549305089
--- /dev/null
+++ b/website/_layouts/docs.html
@@ -0,0 +1,59 @@
+---
+layout: base
+categories:
+  - Project
+  - User Guide
+  - Architecture Guide
+  - Compatibility
+---
+
+<div class="container">
+  <div class="row">
+    <div class="col-md-3">
+      <nav class="sidebar" id="sidebar-nav">
+        {% for category in layout.categories %}
+          <h3>{{ category }}</h3>
+          <ul class="sidebar-nav">
+          {% assign sorted_pages = site.pages | where: 'layout', 'docs' | where: 'category', category | sort: 'weight' | sort: 'subcategory' %}
+          {% assign subcategory = nil %}
+          {% for p in sorted_pages %}
+            {% if p.subcategory != subcategory %}
+              {% if subcategory != nil %}
+                  </ul>
+                </li>
+              {% endif %}
+              {% assign subcategory = p.subcategory %}
+              {% if subcategory != nil %}
+                {% assign ac = "aria-controls" %}
+                {% assign cid = p.category | remove: " " | downcase %}
+                {% assign sid = p.subcategory | remove: " " | downcase %}
+                <li>
+                  <a class="sidebar-nav-heading" data-toggle="collapse" href="#{{ cid }}-{{ sid }}" aria-expanded="false" {{ ac }}="{{ cid }}-{{ sid }}">{{ subcategory }}<span class="caret"></span></a>
+                  <ul class="collapse sidebar-nav sidebar-submenu" id="{{ cid }}-{{ sid }}">
+              {% endif %}
+            {% endif %}
+            <li><a href="{{ p.url }}">{{ p.title }}</a></li>
+          {% endfor %}
+          {% if subcategory != nil %}
+                </ul>
+              </li>
+          {% endif %}
+          </ul>
+        {% endfor %}
+      </nav>
+    </div>
+
+    <div class="col-md-9">
+      <h1>{{ page.title }}</h1>
+      {% if page.editpath %}
+        <p>
+        <a href="https://github.com/google/gvisor/edit/master/{{page.editpath}}" target="_blank"><i class="fa fa-edit fa-fw"></i> Edit this page</a>
+        <a href="https://github.com/google/gvisor/issues/new?title={{page.title | url_encode}}" target="_blank"><i class="fab fa-github fa-fw"></i> Create issue</a>
+        </p>
+      {% endif %}
+      <div class="docs-content">
+      {{ content }}
+      </div>
+    </div>
+  </div>
+</div>
diff --git a/website/_layouts/post.html b/website/_layouts/post.html
new file mode 100644
index 000000000..640bee5af
--- /dev/null
+++ b/website/_layouts/post.html
@@ -0,0 +1,10 @@
+---
+layout: blog
+---
+
+<div class="blog-meta">
+  {% include byline.html authors=page.authors date=page.date %}
+</div>
+<div class="blog-content">
+  {{ content }}
+</div>
diff --git a/website/_plugins/svg_mime_type.rb b/website/_plugins/svg_mime_type.rb
new file mode 100644
index 000000000..ad6bb6480
--- /dev/null
+++ b/website/_plugins/svg_mime_type.rb
@@ -0,0 +1,3 @@
+require 'webrick'
+include WEBrick
+WEBrick::HTTPUtils::DefaultMimeTypes.store 'svg', 'image/svg+xml'
diff --git a/website/_sass/footer.scss b/website/_sass/footer.scss
new file mode 100644
index 000000000..ec2ba5e20
--- /dev/null
+++ b/website/_sass/footer.scss
@@ -0,0 +1,15 @@
+.footer {
+  margin-top: 40px;
+  background-color: #222;
+  color: #fff;
+  padding: 20px;
+
+  a {
+    color: $inverse-link-color;
+
+    &:hover,
+    &:focus {
+      color: $inverse-link-hover-color;
+    }
+  }
+}
diff --git a/website/_sass/front.scss b/website/_sass/front.scss
new file mode 100644
index 000000000..0e4208f3c
--- /dev/null
+++ b/website/_sass/front.scss
@@ -0,0 +1,17 @@
+.jumbotron {
+  background-image: url(/assets/images/background.jpg);
+  background-position: center;
+  background-repeat: no-repeat;
+  background-size: cover;
+  background-blend-mode: darken;
+  background-color: rgba(0, 0, 0, 0.3);
+
+  p {
+    color: #fff;
+    margin-top: 0;
+    margin-bottom: 0;
+    font-weight: 300;
+    font-size: 24px;
+    line-height: 30px;
+  }
+}
diff --git a/website/_sass/navbar.scss b/website/_sass/navbar.scss
new file mode 100644
index 000000000..65bc573ac
--- /dev/null
+++ b/website/_sass/navbar.scss
@@ -0,0 +1,26 @@
+.navbar-inverse {
+  background-color: $primary;
+  border-bottom: 1px solid $primary;
+
+  .navbar-brand > a {
+    color: #fff;
+
+    &:focus,
+    &:hover {
+      color: #fff;
+    }
+  }
+
+  .navbar-nav > li > a {
+    color: $inverse-link-color;
+
+    &:focus,
+    &:hover {
+      color: $inverse-link-hover-color;
+    }
+  }
+
+  .navbar-nav .nav-icon {
+    font-size: 18px;
+  }
+}
diff --git a/website/_sass/sidebar.scss b/website/_sass/sidebar.scss
new file mode 100644
index 000000000..f4ca05df9
--- /dev/null
+++ b/website/_sass/sidebar.scss
@@ -0,0 +1,61 @@
+$sidebar-border-color: #fff;
+$sidebar-hover-border-color: #66bb6a;
+
+.sidebar {
+  margin-top: 40px;
+
+  ul.sidebar-nav {
+    list-style-type: none;
+    padding: 0;
+    transition: height 0.01s;
+
+    li {
+      &.sidebar-nav-heading {
+        padding: 10px 0;
+        margin: 0;
+        display: block;
+        font-size: 16px;
+        font-weight: 300;
+      }
+
+      a {
+        padding: 4px 0;
+        display: block;
+        border-right: 2px solid $sidebar-border-color;
+
+        &:focus {
+          text-decoration: none;
+        }
+
+        .caret {
+          float: right;
+          margin-top: 8px;
+          margin-right: 10px;
+        }
+      }
+
+      &.active {
+        a {
+          border-left: 2px solid $sidebar-hover-border-color;
+          padding-left: 6px;
+        }
+      }
+    }
+
+    ul.sidebar-nav {
+      padding-left: 10px;
+    }
+  }
+}
+
+@media (min-width: 992px) {
+  .sidebar-toggle {
+    display: none;
+  }
+
+  .sidebar {
+    &.collapse {
+      display: block;
+    }
+  }
+}
diff --git a/website/_sass/style.scss b/website/_sass/style.scss
new file mode 100644
index 000000000..4deb945d4
--- /dev/null
+++ b/website/_sass/style.scss
@@ -0,0 +1,154 @@
+$primary: #262362;
+$secondary: #fff;
+$link-color: #286fd7;
+$inverse-link-color: #fff;
+
+$link-hover-color: darken($link-color, 10%);
+$inverse-link-hover-color: darken($inverse-link-color, 10%);
+
+$text-color: #444;
+
+$body-font-family: 'Roboto', 'Helvetica Neue', Helvetica, Arial, sans-serif;
+$code-font-family: 'Source Code Pro', monospace;
+
+html {
+  position: relative;
+  min-height: 100%;
+}
+
+body {
+  color: $text-color;
+  font-family: $body-font-family;
+  padding-top: 40px;
+}
+
+a {
+  color: $link-color;
+
+  &:hover,
+  &:focus {
+    color: $link-hover-color;
+    text-decoration: none;
+  }
+
+  code {
+    color: $link-color;
+  }
+}
+
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+  color: $text-color;
+  font-weight: 400;
+}
+
+h1 code,
+h2 code,
+h3 code,
+h4 code,
+h5 code,
+h6 code {
+  color: $text-color;
+  background: transparent;
+}
+
+h1 {
+  font-size: 30px;
+  margin-top: 40px;
+  margin-bottom: 40px;
+}
+
+h2 {
+  font-size: 24px;
+  margin-top: 30px;
+  margin-bottom: 30px;
+
+  code {
+    font-size: 24px;
+  }
+}
+
+h3 {
+  font-size: 20px;
+  margin-top: 24px;
+  margin-bottom: 24px;
+
+  code {
+    font-size: 20px;
+  }
+}
+
+h4 {
+  font-size: 18px;
+  margin-top: 20px;
+  margin-bottom: 20px;
+
+  code {
+    font-size: 18px;
+  }
+}
+
+p,
+li {
+  font-size: 14px;
+  line-height: 22px;
+}
+
+code {
+  font-family: $code-font-family;
+  font-size: 13px;
+}
+
+.btn {
+  color: $text-color;
+  background-color: $inverse-link-color;
+}
+
+.btn-inverse {
+  color: $text-color;
+  background-color: #fff;
+}
+
+.well {
+  box-shadow: none;
+}
+
+table {
+  width: 100%;
+}
+
+table td,
+table th {
+  border: 1px solid #ddd;
+  padding: 8px;
+}
+
+table tr:nth-child(even) {
+  background-color: #eee;
+}
+
+table th {
+  padding-top: 12px;
+  padding-bottom: 12px;
+  background-color: $primary;
+  color: $secondary;
+}
+
+.blog-meta {
+  margin-top: 10px;
+  margin-bottom: 20px;
+}
+
+.docs-content * img {
+  display: block;
+  margin: 20px auto;
+}
+
+.blog-content * img {
+  display: block;
+  margin: 20px auto;
+}
diff --git a/website/archive.key b/website/archive.key
new file mode 100644
index 000000000..1a91698bf
--- /dev/null
+++ b/website/archive.key
@@ -0,0 +1,29 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQINBF0meAYBEACcBYPOSBiKtid+qTQlbgKGPxUYt0cNZiQqWXylhYUT4PuNlNx5
+s+sBLFvNTpdTrXMmZ8NkekyjD1HardWvebvJT4u+Ho/9jUr4rP71cNwNtocz/w8G
+DsUXSLgH8SDkq6xw0L+5eGc78BBg9cOeBeFBm3UPgxTBXS9Zevoi2w1lzSxkXvjx
+cGzltzMZfPXERljgLzp9AAfhg/2ouqVQm37fY+P/NDzFMJ1XHPIIp9KJl/prBVud
+jJJteFZ5sgL6MwjBQq2kw+q2Jb8Zfjl0BeXDgGMN5M5lGhX2wTfiMbfo7KWyzRnB
+RpSP3BxlLqYeQUuLG5Yx8z3oA3uBkuKaFOKvXtiScxmGM/+Ri2YM3m66imwDhtmP
+AKwTPI3Re4gWWOffglMVSv2sUAY32XZ74yXjY1VhK3bN3WFUPGrgQx4X7GP0A1Te
+lzqkT3VSMXieImTASosK5L5Q8rryvgCeI9tQLn9EpYFCtU3LXvVgTreGNEEjMOnL
+dR7yOU+Fs775stn6ucqmdYarx7CvKUrNAhgEeHMonLe1cjYScF7NfLO1GIrQKJR2
+DE0f+uJZ52inOkO8ufh3WVQJSYszuS3HCY7w5oj1aP38k/y9zZdZvVvwAWZaiqBQ
+iwjVs6Kub76VVZZhRDf4iYs8k1Zh64nXdfQt250d8U5yMPF3wIJ+c1yhxwARAQAB
+tCpUaGUgZ1Zpc29yIEF1dGhvcnMgPGd2aXNvci1ib3RAZ29vZ2xlLmNvbT6JAlQE
+EwEKAD4WIQRvHfheOnHCSRjnJ9VvxtVU4yvZQwUCXSZ4BgIbAwUJA8JnAAULCQgH
+AgYVCgkICwIEFgIDAQIeAQIXgAAKCRBvxtVU4yvZQ5WFD/9VZXMW5I2rKV+2gTHT
+CsW74kZVi1VFdAVYiUJZXw2jJNtcg3xdgBcscYPyecyka/6TS2q7q2fOGAzCZkcR
+e3lLzkGAngMlZ7PdHAE0PDMNFaeMZW0dxNH68vn7AiA1y2XwENnxVec7iXQH6aX5
+xUNg2OCiv5f6DJItHc/Q4SvFUi8QK7TT/GYE1RJXVJlLqfO6y4V8SeqfM+FHpHZM
+gzrwdTgsNiEm4lMjWcgb2Ib4i2JUVAjIRPfcpysiV5E7c3SPXyu4bOovKKlbhiJ1
+Q1M9M0zHik34Kjf4YNO1EW936j7Msd181CJt5Bl9XvlhPb8gey/ygpIvcicLx6M5
+lRJTy4z1TtkmtZ7E8EbJZWoPTaHlA6hoMtGeE35j3vMZN1qZYaYt26eFOxxhh7PA
+J0h1lS7T2O8u1c2JKhKvajtdmbqbJgI8FRhVsMoVBnqDK5aE9MOAso36OibfweEL
+8iV2z8JnBpWtbbUEaWro4knPtbLJbQFvXVietm3cFsbGg+DMIwI6x6HcU91IEFYI
+Sv4orK7xgLuM+f6dxo/Wel3ht18dg3x3krBLALTYBidRfnQYYR3sTfLquB8b5WaY
+o829L2Bop9GBygdLevkHHN5It6q8CVpn0H5HEJMNaDOX1LcPbf0CKwkkAVCBd9YZ
+eAX38ds9LliK7XPXdC4c+zEkGA==
+=x8TG
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/website/assets/favicons/apple-touch-icon-180x180.png b/website/assets/favicons/apple-touch-icon-180x180.png
new file mode 100644
index 000000000..bf4b6ce9b
--- /dev/null
+++ b/website/assets/favicons/apple-touch-icon-180x180.png
diff --git a/website/assets/favicons/favicon-16x16.png b/website/assets/favicons/favicon-16x16.png
new file mode 100644
index 000000000..083264206
--- /dev/null
+++ b/website/assets/favicons/favicon-16x16.png
diff --git a/website/assets/favicons/favicon-32x32.png b/website/assets/favicons/favicon-32x32.png
new file mode 100644
index 000000000..b8e4caff1
--- /dev/null
+++ b/website/assets/favicons/favicon-32x32.png
diff --git a/website/assets/favicons/favicon.ico b/website/assets/favicons/favicon.ico
new file mode 100644
index 000000000..9238b79d9
--- /dev/null
+++ b/website/assets/favicons/favicon.ico
diff --git a/website/assets/favicons/pwa-192x192.png b/website/assets/favicons/pwa-192x192.png
new file mode 100644
index 000000000..5d2fab785
--- /dev/null
+++ b/website/assets/favicons/pwa-192x192.png
diff --git a/website/assets/favicons/pwa-512x512.png b/website/assets/favicons/pwa-512x512.png
new file mode 100644
index 000000000..23824439e
--- /dev/null
+++ b/website/assets/favicons/pwa-512x512.png
diff --git a/website/assets/favicons/tile150x150.png b/website/assets/favicons/tile150x150.png
new file mode 100644
index 000000000..f76fcffae
--- /dev/null
+++ b/website/assets/favicons/tile150x150.png
diff --git a/website/assets/favicons/tile310x150.png b/website/assets/favicons/tile310x150.png
new file mode 100644
index 000000000..4f87e4c12
--- /dev/null
+++ b/website/assets/favicons/tile310x150.png
diff --git a/website/assets/favicons/tile310x310.png b/website/assets/favicons/tile310x310.png
new file mode 100644
index 000000000..a2926d0bd
--- /dev/null
+++ b/website/assets/favicons/tile310x310.png
diff --git a/website/assets/favicons/tile70x70.png b/website/assets/favicons/tile70x70.png
new file mode 100644
index 000000000..96cc69fc4
--- /dev/null
+++ b/website/assets/favicons/tile70x70.png
diff --git a/website/assets/images/2019-11-18-security-basics-figure1.png b/website/assets/images/2019-11-18-security-basics-figure1.png
new file mode 100644
index 000000000..2a8134a7a
--- /dev/null
+++ b/website/assets/images/2019-11-18-security-basics-figure1.png
diff --git a/website/assets/images/2019-11-18-security-basics-figure2.png b/website/assets/images/2019-11-18-security-basics-figure2.png
new file mode 100644
index 000000000..f8b416e1d
--- /dev/null
+++ b/website/assets/images/2019-11-18-security-basics-figure2.png
diff --git a/website/assets/images/2019-11-18-security-basics-figure3.png b/website/assets/images/2019-11-18-security-basics-figure3.png
new file mode 100644
index 000000000..833e3e2b5
--- /dev/null
+++ b/website/assets/images/2019-11-18-security-basics-figure3.png
diff --git a/website/assets/images/2020-04-02-networking-security-figure1.png b/website/assets/images/2020-04-02-networking-security-figure1.png
new file mode 100644
index 000000000..b49cb0242
--- /dev/null
+++ b/website/assets/images/2020-04-02-networking-security-figure1.png
diff --git a/website/assets/images/background.jpg b/website/assets/images/background.jpg
new file mode 100644
index 000000000..81f8e332b
--- /dev/null
+++ b/website/assets/images/background.jpg
diff --git a/website/assets/logos/Makefile b/website/assets/logos/Makefile
new file mode 100644
index 000000000..49289ecc1
--- /dev/null
+++ b/website/assets/logos/Makefile
@@ -0,0 +1,13 @@
+#!/usr/bin/make -f
+
+srcs := $(wildcard *.svg)
+dsts := $(patsubst %.svg,%.png,$(srcs))
+
+all: $(dsts)
+.PHONY: all
+
+%.png %-16.png %-128.png %-1024.png: %.svg
+	@inkscape -z -e $*.png $<
+	@inkscape -z -w 16 -e $*-16.png $<
+	@inkscape -z -w 128 -e $*-128.png $<
+	@inkscape -z -w 1024 -e $*-1024.png $<
diff --git a/website/assets/logos/README.md b/website/assets/logos/README.md
new file mode 100644
index 000000000..2964982dd
--- /dev/null
+++ b/website/assets/logos/README.md
@@ -0,0 +1,10 @@
+# Logos
+
+This directory contains logo assets.
+
+The colors used are:
+
+*   Background (blue): #262262
+*   Highlight (yellow): #FBB03B
+
+Use `make` to generate sized PNGs from SVGs.
diff --git a/website/assets/logos/logo_solo_monochrome.png b/website/assets/logos/logo_solo_monochrome.png
new file mode 100644
index 000000000..e09c5ad5e
--- /dev/null
+++ b/website/assets/logos/logo_solo_monochrome.png
diff --git a/website/assets/logos/logo_solo_monochrome.svg b/website/assets/logos/logo_solo_monochrome.svg
new file mode 100644
index 000000000..73126fd8f
--- /dev/null
+++ b/website/assets/logos/logo_solo_monochrome.svg
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="175.35599"
+   height="193.20036"
+   viewBox="0 0 175.35599 193.20036"
+   sodipodi:docname="logo_solo_monochrome.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath20"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path18"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="640"
+     inkscape:window-height="480"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.43085925"
+     inkscape:cx="374.99057"
+     inkscape:cy="88.483321"
+     inkscape:window-x="0"
+     inkscape:window-y="9"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-614.45037,638.9628)"><g
+       id="g48"
+       transform="translate(548.2423,363.2485)"><path
+         d="m 0,0 c 16.655,21.121 22.696,44.433 18.328,70.995 3.068,0 5.743,-0.023 8.417,0.007 2.222,0.025 4.443,0.102 6.664,0.175 4.79,0.154 4.818,0.165 5.88,-4.582 3.145,-14.051 2.18,-28.09 -0.179,-42.118 -0.25,-1.492 -0.7,-2.956 -0.864,-4.454 C 37.05,9.081 30.089,3.645 20.165,1.097 13.787,-0.54 7.323,-0.829 0,0"
+         style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path50"
+         inkscape:connector-curvature="0" /></g><g
+       id="g52"
+       transform="translate(544.6891,396.1763)"><path
+         d="M 0,0 C 0,2.594 -3.457,4.322 -3.457,4.322 -0.864,5.187 0,8.644 0,8.644 0,8.644 0.865,5.187 3.458,4.322 3.458,4.322 0,2.594 0,0 m -17.099,6.454 c 0,6.742 -8.989,11.236 -8.989,11.236 6.742,2.248 8.989,11.238 8.989,11.238 0,0 2.247,-8.99 8.99,-11.238 0,0 -8.99,-4.494 -8.99,-11.236"
+         style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path54"
+         inkscape:connector-curvature="0" /></g><g
+       id="g56"
+       transform="translate(485.0861,429.7925)"><path
+         d="m 0,0 c 0,0 -2.214,3.359 -9.736,2.059 0,0 5.987,28.738 36.298,38.806 C 26.562,40.865 -2.82,24.275 0,0 M 0.583,-33.208 0.58,-33.211 c -1.297,-2.026 -2.821,-3.579 -4.53,-4.616 -1.515,-0.933 -3.178,-1.425 -4.743,-1.425 -0.528,0 -1.044,0.057 -1.539,0.17 l -2.209,0.507 2.184,0.603 c 1.63,0.451 3.063,1.347 4.259,2.664 1.014,1.108 1.856,2.485 2.577,4.213 1.175,2.82 1.784,6.162 1.81,9.936 -0.049,3.718 -0.683,7.054 -1.886,9.902 -0.703,1.654 -1.585,3.056 -2.621,4.163 -1.227,1.311 -2.671,2.178 -4.294,2.576 l -2.187,0.538 2.179,0.572 c 0.48,0.126 0.998,0.196 1.539,0.209 h 0.031 l 1.688,-0.153 c 1.045,-0.206 2.104,-0.616 3.061,-1.185 1.755,-1.031 3.302,-2.567 4.598,-4.565 2.155,-3.374 3.315,-7.536 3.357,-12.042 -0.028,-4.548 -1.159,-8.717 -3.271,-12.064 m 101.949,51.176 c 0,0 -0.075,0.05 -0.226,0.136 -0.541,0.327 -1.113,0.603 -1.715,0.815 -3.044,1.241 -9.881,3.186 -21.906,2.623 -0.029,0 -0.056,0 -0.085,-0.001 C 52.916,21.123 30.022,10.922 30.022,10.922 c 0,0 1.439,1.761 3.453,3.692 10e-4,0 10e-4,10e-4 0.002,10e-4 1.052,0.974 2.355,2.076 3.912,3.227 0.046,0.031 0.088,0.063 0.124,0.093 8.708,6.384 25.34,14.163 51.625,9.541 -0.989,1.124 -2.002,2.192 -3.036,3.215 -1.112,0.883 -2.231,1.693 -3.354,2.456 0.02,-0.012 0.039,-0.023 0.059,-0.036 0,0 -17.016,19.415 -48.683,15.891 C 30.19,48.622 25.983,47.867 21.66,46.564 21.653,46.563 21.646,46.562 21.64,46.56 L 21.638,46.558 C 11.48,43.492 0.683,37.387 -8.719,25.911 -9.29,25.193 -9.841,24.479 -10.376,23.77 c -0.065,-0.099 -0.141,-0.202 -0.226,-0.307 -1.182,-1.581 -2.271,-3.14 -3.279,-4.674 -3.266,-5.427 -5.631,-11.665 -6.311,-13.545 -10.58,-32.401 2.586,-57.55 5.144,-61.967 8.93,-15.158 24.565,-32.355 50.771,-37.327 0.197,-0.047 0.382,-0.101 0.582,-0.147 1.723,-0.367 4.864,-0.929 8.908,-1.196 1.524,-0.069 3.088,-0.094 4.699,-0.067 1.548,-0.009 2.999,0.017 4.335,0.064 0.396,0.028 0.74,0.041 1.044,0.044 5.102,0.238 8.272,0.775 8.272,0.775 -26.543,1.299 -39.847,13.409 -45.691,21.142 -1.325,1.648 -2.46,3.42 -3.377,5.316 -0.361,0.71 -0.523,1.115 -0.523,1.115 8.459,-7.181 20.294,-13.362 20.294,-13.362 10.611,-4.993 21.737,-7.451 33.524,-5.837 0,0 24.645,2.263 34.464,25.09 -0.423,0.322 -0.366,0.278 -0.79,0.6 -0.652,-0.526 -1.294,-1.023 -1.926,-1.496 -0.991,-0.651 -1.964,-1.357 -2.937,-2.07 -5.265,-3.485 -9.561,-5.128 -12.12,-5.879 -19.359,-4.887 -37.273,-1.252 -52.93,12.455 -7.253,6.349 -11.754,14.352 -11.944,24.291 -0.124,6.465 -0.19,12.935 -0.136,19.4 0.085,10.181 7.246,17.921 17.394,19.284 20.561,2.759 41.234,3.71 61.947,4.193 3.174,0.073 5.981,0.037 7.458,-3.356 0.025,-0.058 0.363,0.02 0.552,0.035 1.4,4.461 -0.617,9.27 -4.29,11.624"
+         style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path58"
+         inkscape:connector-curvature="0" /></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_solo_on_dark-1024.png b/website/assets/logos/logo_solo_on_dark-1024.png
new file mode 100644
index 000000000..6df428c65
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark-1024.png
diff --git a/website/assets/logos/logo_solo_on_dark-128.png b/website/assets/logos/logo_solo_on_dark-128.png
new file mode 100644
index 000000000..78a85475f
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark-128.png
diff --git a/website/assets/logos/logo_solo_on_dark-16.png b/website/assets/logos/logo_solo_on_dark-16.png
new file mode 100644
index 000000000..4f1e91c02
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark-16.png
diff --git a/website/assets/logos/logo_solo_on_dark.png b/website/assets/logos/logo_solo_on_dark.png
new file mode 100644
index 000000000..da20756f7
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark.png
diff --git a/website/assets/logos/logo_solo_on_dark.svg b/website/assets/logos/logo_solo_on_dark.svg
new file mode 100644
index 000000000..ae8d9e879
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark.svg
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="175.35599"
+   height="193.19984"
+   viewBox="0 0 175.35599 193.19985"
+   sodipodi:docname="logo_solo_on_dark.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title /></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath20"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path18"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="1278"
+     inkscape:window-height="699"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.8617185"
+     inkscape:cx="257.20407"
+     inkscape:cy="172.193"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-614.45037,638.96254)"><g
+       id="g48"
+       transform="translate(548.2423,363.2484)"><path
+         d="m 0,0 c 16.655,21.121 22.696,44.434 18.328,70.995 3.068,0 5.743,-0.023 8.417,0.007 2.222,0.025 4.443,0.102 6.664,0.175 4.79,0.154 4.818,0.165 5.88,-4.582 3.145,-14.05 2.18,-28.089 -0.179,-42.118 -0.25,-1.492 -0.7,-2.956 -0.864,-4.454 C 37.05,9.081 30.089,3.645 20.165,1.097 13.787,-0.54 7.323,-0.829 0,0"
+         style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path50"
+         inkscape:connector-curvature="0" /></g><g
+       id="g52"
+       transform="translate(544.6891,396.1771)"><path
+         d="M 0,0 C 0,2.593 -3.457,4.321 -3.457,4.321 -0.864,5.186 0,8.644 0,8.644 0,8.644 0.865,5.186 3.458,4.321 3.458,4.321 0,2.593 0,0 m -17.099,6.453 c 0,6.742 -8.989,11.237 -8.989,11.237 6.742,2.247 8.989,11.237 8.989,11.237 0,0 2.247,-8.99 8.99,-11.237 0,0 -8.99,-4.495 -8.99,-11.237"
+         style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path54"
+         inkscape:connector-curvature="0" /></g><g
+       id="g56"
+       transform="translate(485.0861,429.7923)"><path
+         d="m 0,0 c 0,0 -2.214,3.359 -9.736,2.059 0,0 5.987,28.738 36.298,38.806 C 26.562,40.865 -2.82,24.275 0,0 M 0.583,-33.208 0.58,-33.211 c -1.297,-2.026 -2.821,-3.578 -4.53,-4.615 -1.515,-0.934 -3.178,-1.425 -4.743,-1.425 -0.528,0 -1.044,0.056 -1.539,0.169 l -2.209,0.507 2.184,0.603 c 1.63,0.451 3.063,1.347 4.259,2.664 1.014,1.108 1.856,2.485 2.577,4.214 1.175,2.819 1.784,6.161 1.81,9.935 -0.049,3.719 -0.683,7.054 -1.886,9.902 -0.703,1.655 -1.585,3.057 -2.621,4.163 -1.227,1.311 -2.671,2.178 -4.294,2.576 l -2.187,0.538 2.179,0.572 c 0.48,0.127 0.998,0.196 1.539,0.209 l 0.031,10e-4 1.688,-0.154 c 1.045,-0.206 2.104,-0.615 3.061,-1.184 1.755,-1.032 3.302,-2.568 4.598,-4.565 2.155,-3.374 3.315,-7.537 3.357,-12.043 -0.028,-4.548 -1.159,-8.717 -3.271,-12.064 m 101.949,51.176 c 0,0 -0.075,0.051 -0.226,0.137 -0.541,0.326 -1.113,0.602 -1.715,0.814 -3.044,1.241 -9.881,3.187 -21.906,2.623 -0.029,0 -0.056,0 -0.085,-0.001 C 52.916,21.123 30.022,10.923 30.022,10.923 c 0,0 1.439,1.76 3.453,3.691 10e-4,10e-4 10e-4,10e-4 0.002,0.002 1.052,0.973 2.355,2.076 3.912,3.226 0.046,0.032 0.088,0.063 0.124,0.094 8.708,6.383 25.34,14.162 51.625,9.54 -0.989,1.124 -2.002,2.193 -3.036,3.215 -1.112,0.884 -2.231,1.694 -3.354,2.456 0.02,-0.012 0.039,-0.023 0.059,-0.036 0,0 -17.016,19.415 -48.683,15.891 C 30.19,48.622 25.983,47.868 21.66,46.564 21.653,46.563 21.646,46.562 21.64,46.56 L 21.638,46.558 C 11.48,43.492 0.683,37.387 -8.719,25.911 c -0.571,-0.718 -1.122,-1.431 -1.657,-2.14 -0.065,-0.1 -0.141,-0.202 -0.226,-0.307 -1.182,-1.582 -2.271,-3.141 -3.279,-4.674 -3.266,-5.427 -5.631,-11.666 -6.311,-13.546 -10.58,-32.401 2.586,-57.549 5.144,-61.967 8.93,-15.157 24.565,-32.355 50.771,-37.327 0.197,-0.046 0.382,-0.101 0.582,-0.146 1.723,-0.367 4.864,-0.929 8.908,-1.197 1.524,-0.069 3.088,-0.094 4.699,-0.066 1.548,-0.01 2.999,0.017 4.335,0.064 0.396,0.027 0.74,0.041 1.044,0.044 5.102,0.237 8.272,0.774 8.272,0.774 -26.543,1.3 -39.847,13.41 -45.691,21.142 -1.325,1.648 -2.46,3.421 -3.377,5.316 -0.361,0.711 -0.523,1.115 -0.523,1.115 8.459,-7.18 20.294,-13.361 20.294,-13.361 10.611,-4.993 21.737,-7.452 33.524,-5.838 0,0 24.645,2.264 34.464,25.09 -0.423,0.322 -0.366,0.278 -0.79,0.6 -0.652,-0.526 -1.294,-1.022 -1.926,-1.496 -0.991,-0.65 -1.964,-1.357 -2.937,-2.07 -5.265,-3.485 -9.561,-5.127 -12.12,-5.879 -19.359,-4.887 -37.273,-1.252 -52.93,12.455 -7.253,6.349 -11.754,14.353 -11.944,24.291 -0.124,6.466 -0.19,12.935 -0.136,19.4 0.085,10.181 7.246,17.921 17.394,19.284 20.561,2.759 41.234,3.71 61.947,4.193 3.174,0.074 5.981,0.037 7.458,-3.356 0.025,-0.058 0.363,0.02 0.552,0.035 1.4,4.462 -0.617,9.27 -4.29,11.624"
+         style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path58"
+         inkscape:connector-curvature="0" /></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_solo_on_dark_full-1024.png b/website/assets/logos/logo_solo_on_dark_full-1024.png
new file mode 100644
index 000000000..8d597dd3d
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark_full-1024.png
diff --git a/website/assets/logos/logo_solo_on_dark_full-128.png b/website/assets/logos/logo_solo_on_dark_full-128.png
new file mode 100644
index 000000000..fe6dd5dea
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark_full-128.png
diff --git a/website/assets/logos/logo_solo_on_dark_full-16.png b/website/assets/logos/logo_solo_on_dark_full-16.png
new file mode 100644
index 000000000..f9aa7dfdd
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark_full-16.png
diff --git a/website/assets/logos/logo_solo_on_dark_full.png b/website/assets/logos/logo_solo_on_dark_full.png
new file mode 100644
index 000000000..611b0565e
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark_full.png
diff --git a/website/assets/logos/logo_solo_on_dark_full.svg b/website/assets/logos/logo_solo_on_dark_full.svg
new file mode 100644
index 000000000..6440835b1
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_dark_full.svg
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="246.17325"
+   height="246.17325"
+   viewBox="0 0 246.17325 246.17326"
+   sodipodi:docname="logo_solo_on_dark_full.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title /></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath20"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path18"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="1278"
+     inkscape:window-height="699"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.8617185"
+     inkscape:cx="291.21659"
+     inkscape:cy="198.28704"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-580.43785,665.8419)"><circle
+       id="path83"
+       cx="527.64337"
+       cy="-407.06647"
+       r="92.314972"
+       transform="scale(1,-1)"
+       style="fill:#262262;fill-opacity:1;stroke-width:0.48076925" /><g
+       id="g48"
+       transform="translate(548.2423,363.2484)"><path
+         d="m 0,0 c 16.655,21.121 22.696,44.434 18.328,70.995 3.068,0 5.743,-0.023 8.417,0.007 2.222,0.025 4.443,0.102 6.664,0.175 4.79,0.154 4.818,0.165 5.88,-4.582 3.145,-14.05 2.18,-28.089 -0.179,-42.118 -0.25,-1.492 -0.7,-2.956 -0.864,-4.454 C 37.05,9.081 30.089,3.645 20.165,1.097 13.787,-0.54 7.323,-0.829 0,0"
+         style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path50"
+         inkscape:connector-curvature="0" /></g><g
+       id="g52"
+       transform="translate(544.6891,396.1771)"><path
+         d="M 0,0 C 0,2.593 -3.457,4.321 -3.457,4.321 -0.864,5.186 0,8.644 0,8.644 0,8.644 0.865,5.186 3.458,4.321 3.458,4.321 0,2.593 0,0 m -17.099,6.453 c 0,6.742 -8.989,11.237 -8.989,11.237 6.742,2.247 8.989,11.237 8.989,11.237 0,0 2.247,-8.99 8.99,-11.237 0,0 -8.99,-4.495 -8.99,-11.237"
+         style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path54"
+         inkscape:connector-curvature="0" /></g><g
+       id="g56"
+       transform="translate(485.0861,429.7923)"><path
+         d="m 0,0 c 0,0 -2.214,3.359 -9.736,2.059 0,0 5.987,28.738 36.298,38.806 C 26.562,40.865 -2.82,24.275 0,0 M 0.583,-33.208 0.58,-33.211 c -1.297,-2.026 -2.821,-3.578 -4.53,-4.615 -1.515,-0.934 -3.178,-1.425 -4.743,-1.425 -0.528,0 -1.044,0.056 -1.539,0.169 l -2.209,0.507 2.184,0.603 c 1.63,0.451 3.063,1.347 4.259,2.664 1.014,1.108 1.856,2.485 2.577,4.214 1.175,2.819 1.784,6.161 1.81,9.935 -0.049,3.719 -0.683,7.054 -1.886,9.902 -0.703,1.655 -1.585,3.057 -2.621,4.163 -1.227,1.311 -2.671,2.178 -4.294,2.576 l -2.187,0.538 2.179,0.572 c 0.48,0.127 0.998,0.196 1.539,0.209 l 0.031,10e-4 1.688,-0.154 c 1.045,-0.206 2.104,-0.615 3.061,-1.184 1.755,-1.032 3.302,-2.568 4.598,-4.565 2.155,-3.374 3.315,-7.537 3.357,-12.043 -0.028,-4.548 -1.159,-8.717 -3.271,-12.064 m 101.949,51.176 c 0,0 -0.075,0.051 -0.226,0.137 -0.541,0.326 -1.113,0.602 -1.715,0.814 -3.044,1.241 -9.881,3.187 -21.906,2.623 -0.029,0 -0.056,0 -0.085,-0.001 C 52.916,21.123 30.022,10.923 30.022,10.923 c 0,0 1.439,1.76 3.453,3.691 10e-4,10e-4 10e-4,10e-4 0.002,0.002 1.052,0.973 2.355,2.076 3.912,3.226 0.046,0.032 0.088,0.063 0.124,0.094 8.708,6.383 25.34,14.162 51.625,9.54 -0.989,1.124 -2.002,2.193 -3.036,3.215 -1.112,0.884 -2.231,1.694 -3.354,2.456 0.02,-0.012 0.039,-0.023 0.059,-0.036 0,0 -17.016,19.415 -48.683,15.891 C 30.19,48.622 25.983,47.868 21.66,46.564 21.653,46.563 21.646,46.562 21.64,46.56 L 21.638,46.558 C 11.48,43.492 0.683,37.387 -8.719,25.911 c -0.571,-0.718 -1.122,-1.431 -1.657,-2.14 -0.065,-0.1 -0.141,-0.202 -0.226,-0.307 -1.182,-1.582 -2.271,-3.141 -3.279,-4.674 -3.266,-5.427 -5.631,-11.666 -6.311,-13.546 -10.58,-32.401 2.586,-57.549 5.144,-61.967 8.93,-15.157 24.565,-32.355 50.771,-37.327 0.197,-0.046 0.382,-0.101 0.582,-0.146 1.723,-0.367 4.864,-0.929 8.908,-1.197 1.524,-0.069 3.088,-0.094 4.699,-0.066 1.548,-0.01 2.999,0.017 4.335,0.064 0.396,0.027 0.74,0.041 1.044,0.044 5.102,0.237 8.272,0.774 8.272,0.774 -26.543,1.3 -39.847,13.41 -45.691,21.142 -1.325,1.648 -2.46,3.421 -3.377,5.316 -0.361,0.711 -0.523,1.115 -0.523,1.115 8.459,-7.18 20.294,-13.361 20.294,-13.361 10.611,-4.993 21.737,-7.452 33.524,-5.838 0,0 24.645,2.264 34.464,25.09 -0.423,0.322 -0.366,0.278 -0.79,0.6 -0.652,-0.526 -1.294,-1.022 -1.926,-1.496 -0.991,-0.65 -1.964,-1.357 -2.937,-2.07 -5.265,-3.485 -9.561,-5.127 -12.12,-5.879 -19.359,-4.887 -37.273,-1.252 -52.93,12.455 -7.253,6.349 -11.754,14.353 -11.944,24.291 -0.124,6.466 -0.19,12.935 -0.136,19.4 0.085,10.181 7.246,17.921 17.394,19.284 20.561,2.759 41.234,3.71 61.947,4.193 3.174,0.074 5.981,0.037 7.458,-3.356 0.025,-0.058 0.363,0.02 0.552,0.035 1.4,4.462 -0.617,9.27 -4.29,11.624"
+         style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path58"
+         inkscape:connector-curvature="0" /></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_solo_on_white.png b/website/assets/logos/logo_solo_on_white.png
new file mode 100644
index 000000000..ca539cdff
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_white.png
diff --git a/website/assets/logos/logo_solo_on_white.svg b/website/assets/logos/logo_solo_on_white.svg
new file mode 100644
index 000000000..d794ad8e7
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_white.svg
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="175.35599"
+   height="193.20036"
+   viewBox="0 0 175.35599 193.20037"
+   sodipodi:docname="logo_solo_on_white.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath18"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path16"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="640"
+     inkscape:window-height="480"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.43085925"
+     inkscape:cx="370.53985"
+     inkscape:cy="50.91009"
+     inkscape:window-x="0"
+     inkscape:window-y="9"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-614.45037,638.9628)"><g
+       id="g46"
+       transform="translate(548.2428,363.2485)"><path
+         d="m 0,0 c 16.655,21.121 22.696,44.433 18.328,70.995 3.068,0 5.742,-0.023 8.417,0.007 2.221,0.025 4.442,0.102 6.663,0.175 4.79,0.154 4.818,0.165 5.881,-4.582 C 42.434,52.544 41.469,38.505 39.11,24.477 38.859,22.985 38.409,21.521 38.246,20.023 37.05,9.081 30.089,3.645 20.164,1.097 13.786,-0.54 7.323,-0.829 0,0"
+         style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path48"
+         inkscape:connector-curvature="0" /></g><g
+       id="g50"
+       transform="translate(544.6891,396.1763)"><path
+         d="M 0,0 C 0,2.594 -3.457,4.322 -3.457,4.322 -0.864,5.187 0,8.644 0,8.644 0,8.644 0.865,5.187 3.458,4.322 3.458,4.322 0,2.594 0,0 m -17.099,6.454 c 0,6.742 -8.989,11.236 -8.989,11.236 6.742,2.248 8.989,11.238 8.989,11.238 0,0 2.248,-8.99 8.99,-11.238 0,0 -8.99,-4.494 -8.99,-11.236"
+         style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path52"
+         inkscape:connector-curvature="0" /></g><g
+       id="g54"
+       transform="translate(485.0861,429.7925)"><path
+         d="m 0,0 c 0,0 -2.214,3.359 -9.736,2.059 0,0 5.987,28.738 36.298,38.806 C 26.562,40.865 -2.82,24.275 0,0 M 0.583,-33.208 0.58,-33.211 c -1.297,-2.026 -2.821,-3.579 -4.53,-4.616 -1.515,-0.933 -3.178,-1.425 -4.743,-1.425 -0.528,0 -1.044,0.057 -1.539,0.17 l -2.209,0.507 2.184,0.603 c 1.63,0.451 3.063,1.347 4.259,2.664 1.014,1.108 1.856,2.485 2.577,4.213 1.175,2.82 1.784,6.162 1.81,9.936 -0.049,3.718 -0.683,7.054 -1.886,9.902 -0.703,1.654 -1.585,3.056 -2.621,4.163 -1.227,1.311 -2.671,2.178 -4.294,2.576 l -2.187,0.538 2.179,0.572 c 0.48,0.126 0.998,0.196 1.539,0.209 h 0.031 l 1.688,-0.153 c 1.045,-0.206 2.104,-0.616 3.061,-1.185 1.755,-1.031 3.302,-2.567 4.598,-4.565 2.155,-3.374 3.315,-7.536 3.357,-12.042 -0.028,-4.548 -1.159,-8.717 -3.271,-12.064 m 101.949,51.176 c 0,0 -0.075,0.05 -0.226,0.136 -0.541,0.327 -1.113,0.603 -1.715,0.815 -3.044,1.241 -9.881,3.186 -21.906,2.623 -0.029,0 -0.056,0 -0.085,-0.001 C 52.916,21.123 30.022,10.922 30.022,10.922 c 0,0 1.439,1.761 3.453,3.692 10e-4,0 10e-4,10e-4 0.002,10e-4 1.052,0.974 2.355,2.076 3.912,3.227 0.046,0.031 0.088,0.063 0.124,0.093 8.708,6.384 25.341,14.163 51.625,9.541 -0.989,1.124 -2.002,2.192 -3.036,3.215 -1.112,0.883 -2.231,1.693 -3.354,2.456 0.02,-0.012 0.039,-0.023 0.059,-0.036 0,0 -17.016,19.415 -48.683,15.891 C 30.19,48.622 25.983,47.867 21.66,46.564 21.653,46.563 21.646,46.562 21.64,46.56 L 21.638,46.558 C 11.48,43.492 0.683,37.387 -8.719,25.911 -9.29,25.193 -9.841,24.479 -10.376,23.77 c -0.065,-0.099 -0.141,-0.202 -0.226,-0.307 -1.182,-1.581 -2.271,-3.14 -3.279,-4.674 -3.266,-5.427 -5.631,-11.665 -6.311,-13.545 -10.58,-32.401 2.586,-57.55 5.144,-61.967 8.93,-15.158 24.565,-32.355 50.771,-37.327 0.197,-0.047 0.382,-0.101 0.582,-0.147 1.723,-0.367 4.864,-0.929 8.908,-1.196 1.524,-0.069 3.088,-0.094 4.699,-0.067 1.548,-0.009 2.999,0.017 4.335,0.064 0.396,0.028 0.74,0.041 1.044,0.044 5.102,0.238 8.272,0.775 8.272,0.775 -26.542,1.299 -39.847,13.409 -45.691,21.142 -1.325,1.648 -2.46,3.42 -3.377,5.316 -0.361,0.71 -0.523,1.115 -0.523,1.115 8.459,-7.181 20.294,-13.362 20.294,-13.362 10.611,-4.993 21.737,-7.451 33.525,-5.837 0,0 24.644,2.263 34.463,25.09 -0.423,0.322 -0.366,0.278 -0.79,0.6 -0.651,-0.526 -1.294,-1.023 -1.926,-1.496 -0.991,-0.651 -1.964,-1.357 -2.937,-2.07 -5.265,-3.485 -9.561,-5.128 -12.12,-5.879 -19.359,-4.887 -37.273,-1.252 -52.93,12.455 -7.253,6.349 -11.754,14.352 -11.944,24.291 -0.124,6.465 -0.19,12.935 -0.136,19.4 0.085,10.181 7.246,17.921 17.394,19.284 20.561,2.759 41.234,3.71 61.948,4.193 3.173,0.073 5.98,0.037 7.457,-3.356 0.025,-0.058 0.363,0.02 0.552,0.035 1.4,4.461 -0.617,9.27 -4.29,11.624"
+         style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+         id="path56"
+         inkscape:connector-curvature="0" /></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_solo_on_white_bordered-1024.png b/website/assets/logos/logo_solo_on_white_bordered-1024.png
new file mode 100644
index 000000000..62bb88d50
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_white_bordered-1024.png
diff --git a/website/assets/logos/logo_solo_on_white_bordered-128.png b/website/assets/logos/logo_solo_on_white_bordered-128.png
new file mode 100644
index 000000000..a8988766c
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_white_bordered-128.png
diff --git a/website/assets/logos/logo_solo_on_white_bordered-16.png b/website/assets/logos/logo_solo_on_white_bordered-16.png
new file mode 100644
index 000000000..a545c49cf
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_white_bordered-16.png
diff --git a/website/assets/logos/logo_solo_on_white_bordered.png b/website/assets/logos/logo_solo_on_white_bordered.png
new file mode 100644
index 000000000..cc99b7c51
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_white_bordered.png
diff --git a/website/assets/logos/logo_solo_on_white_bordered.svg b/website/assets/logos/logo_solo_on_white_bordered.svg
new file mode 100644
index 000000000..2e26f144a
--- /dev/null
+++ b/website/assets/logos/logo_solo_on_white_bordered.svg
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="190.7361"
+   height="207.92123"
+   viewBox="0 0 190.7361 207.92124"
+   sodipodi:docname="logo_solo_on_white_bordered.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath18"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path16"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="640"
+     inkscape:window-height="480"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.609327"
+     inkscape:cx="298.55736"
+     inkscape:cy="108.65533"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="g14" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-612.10927,647.00852)"><g
+       id="g12"><g
+         id="g14"
+         clip-path="url(#clipPath18)"><g
+           id="g46"
+           transform="translate(594.4321,453.1439)"><path
+             d="m 0,0 c 0,0 -0.204,0.139 -0.45,0.277 -0.906,0.547 -1.856,1 -2.744,1.321 -1.685,0.679 -3.62,1.262 -5.761,1.738 l -2.063,2.533 c -1.793,2.283 -6.09,7.358 -13.132,13.334 -8.142,6.209 -24.212,15.045 -47.595,12.442 -4.578,-0.444 -9.077,-1.318 -13.368,-2.597 l -0.232,-0.068 c -12.978,-3.918 -24.155,-11.512 -33.24,-22.601 -0.6,-0.754 -1.179,-1.504 -1.783,-2.307 l -0.134,-0.191 -0.062,-0.074 c -1.194,-1.596 -2.36,-3.258 -3.485,-4.969 l -0.125,-0.198 c -3.559,-5.915 -6.126,-12.72 -6.85,-14.73 -11.284,-34.556 2.735,-61.502 5.669,-66.567 8.482,-14.4 23.945,-32.461 50.005,-38.975 1.42,-0.354 2.872,-0.676 4.356,-0.96 0.016,-0.004 0.036,-0.009 0.053,-0.013 6.537,-1.118 16.647,-1.928 29.969,-0.317 3.21,0.621 8.236,2.535 8.646,8.445 2.209,0.842 10.261,3.812 10.261,3.812 8.572,3.874 18.586,11.106 24.334,24.546 1.21,2.83 0.277,6.128 -2.171,7.994 l -0.202,0.152 c 0.639,1.557 1.125,3.209 1.488,4.93 l 0.019,0.009 c 0,0 0.063,0.325 0.164,0.854 0.013,0.073 0.028,0.144 0.041,0.215 0.402,2.114 1.294,6.91 1.719,10.035 0.02,0.149 0.029,0.268 0.033,0.371 2.136,14.686 2.099,26.608 -0.156,37.847 1.443,0.114 2.672,1.095 3.106,2.477 l 0.655,2.086 C 9.188,-12.066 6.243,-4.003 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path48"
+             inkscape:connector-curvature="0" /></g><g
+           id="g50"
+           transform="translate(551.9267,364.2689)"><path
+             d="m 0,0 c 16.398,20.796 22.346,43.748 18.045,69.9 3.02,0 5.654,-0.022 8.288,0.007 2.187,0.025 4.374,0.101 6.56,0.172 4.716,0.152 4.743,0.163 5.79,-4.512 C 41.779,51.734 40.83,37.911 38.507,24.1 38.26,22.631 37.817,21.189 37.656,19.715 36.479,8.941 29.625,3.588 19.853,1.08 13.574,-0.531 7.209,-0.816 0,0"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path52"
+             inkscape:connector-curvature="0" /></g><g
+           id="g54"
+           transform="translate(548.4282,396.6898)"><path
+             d="M 0,0 C 0,2.553 -3.404,4.255 -3.404,4.255 -0.851,5.105 0,8.51 0,8.51 0,8.51 0.851,5.105 3.404,4.255 3.404,4.255 0,2.553 0,0 m -16.835,6.354 c 0,6.638 -8.851,11.063 -8.851,11.063 6.638,2.213 8.851,11.063 8.851,11.063 0,0 2.212,-8.85 8.85,-11.063 0,0 -8.85,-4.425 -8.85,-11.063"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path56"
+             inkscape:connector-curvature="0" /></g><g
+           id="g58"
+           transform="translate(489.744,429.7865)"><path
+             d="m 0,0 c 0,0 -2.18,3.308 -9.585,2.026 0,0 5.894,28.296 35.737,38.209 C 26.152,40.235 -2.777,23.901 0,0 m 0.574,-32.696 -0.003,-0.003 c -1.277,-1.994 -2.778,-3.523 -4.46,-4.544 -1.492,-0.919 -3.13,-1.403 -4.67,-1.403 -0.519,0 -1.028,0.055 -1.516,0.167 l -2.174,0.5 2.15,0.593 c 1.605,0.443 3.016,1.325 4.194,2.623 0.997,1.09 1.826,2.447 2.536,4.149 1.158,2.775 1.757,6.066 1.783,9.781 -0.048,3.661 -0.673,6.945 -1.857,9.75 -0.693,1.629 -1.56,3.009 -2.58,4.099 -1.208,1.29 -2.63,2.143 -4.228,2.536 l -2.154,0.529 2.145,0.564 c 0.473,0.125 0.983,0.193 1.516,0.205 l 0.031,0.002 1.662,-0.152 c 1.029,-0.203 2.071,-0.605 3.014,-1.167 1.727,-1.015 3.25,-2.527 4.526,-4.493 2.122,-3.323 3.264,-7.421 3.305,-11.857 C 3.767,-25.296 2.653,-29.4 0.574,-32.696 M 100.951,17.69 c 0,0 -0.074,0.05 -0.223,0.136 -0.532,0.32 -1.095,0.593 -1.688,0.801 -2.997,1.223 -9.729,3.139 -21.568,2.583 -0.029,0 -0.055,0 -0.084,-0.001 C 52.1,20.798 29.559,10.754 29.559,10.754 c 0,0 1.417,1.733 3.4,3.636 0,0 10e-4,0 10e-4,10e-4 1.036,0.958 2.319,2.044 3.853,3.176 0.044,0.031 0.086,0.062 0.122,0.091 8.573,6.286 24.949,13.945 50.829,9.394 -0.615,0.698 -1.254,1.341 -1.887,1.998 l 0.057,-0.009 c 0,0 -5.858,6.886 -16.555,12.616 -0.613,0.331 -1.252,0.659 -1.91,0.985 -0.038,0.018 -0.073,0.038 -0.112,0.057 -0.067,0.033 -0.128,0.057 -0.195,0.089 -8.007,3.888 -19.263,7.05 -33.564,5.458 -3.873,-0.374 -8.015,-1.116 -12.272,-2.399 -0.007,-0.002 -0.013,-0.003 -0.02,-0.005 L 21.304,45.84 c -10,-3.018 -20.632,-9.029 -29.888,-20.328 -0.563,-0.707 -1.105,-1.409 -1.632,-2.109 -0.064,-0.096 -0.138,-0.198 -0.222,-0.301 -1.164,-1.557 -2.236,-3.091 -3.229,-4.602 -3.216,-5.343 -5.544,-11.486 -6.214,-13.337 -10.417,-31.901 2.546,-56.661 5.065,-61.011 8.793,-14.923 24.186,-31.856 49.988,-36.751 0.195,-0.046 0.377,-0.101 0.573,-0.145 1.697,-0.361 4.79,-0.914 8.771,-1.178 1.5,-0.067 3.04,-0.093 4.626,-0.065 1.525,-0.01 2.953,0.016 4.268,0.063 0.39,0.027 0.729,0.04 1.029,0.044 5.023,0.233 8.144,0.762 8.144,0.762 -26.133,1.279 -39.232,13.203 -44.987,20.816 -1.304,1.622 -2.422,3.367 -3.324,5.234 -0.356,0.699 -0.515,1.097 -0.515,1.097 8.328,-7.069 19.98,-13.155 19.98,-13.155 10.449,-4.917 21.402,-7.337 33.008,-5.747 0,0 24.264,2.227 33.932,24.702 -0.417,0.317 -0.361,0.275 -0.777,0.592 -0.642,-0.518 -1.274,-1.007 -1.898,-1.474 -0.975,-0.64 -1.933,-1.336 -2.89,-2.038 -5.184,-3.431 -9.414,-5.048 -11.934,-5.788 -19.06,-4.811 -36.698,-1.232 -52.114,12.263 -7.14,6.251 -11.572,14.131 -11.759,23.917 -0.122,6.365 -0.188,12.734 -0.135,19.101 0.084,10.023 7.135,17.645 17.126,18.986 20.245,2.716 40.598,3.652 60.993,4.129 3.125,0.072 5.888,0.036 7.342,-3.306 0.025,-0.057 0.358,0.021 0.543,0.035 1.379,4.393 -0.607,9.127 -4.223,11.444"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path60"
+             inkscape:connector-curvature="0" /></g></g></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_with_text_monochrome.png b/website/assets/logos/logo_with_text_monochrome.png
new file mode 100644
index 000000000..17442f55d
--- /dev/null
+++ b/website/assets/logos/logo_with_text_monochrome.png
diff --git a/website/assets/logos/logo_with_text_monochrome.svg b/website/assets/logos/logo_with_text_monochrome.svg
new file mode 100644
index 000000000..4648e06c0
--- /dev/null
+++ b/website/assets/logos/logo_with_text_monochrome.svg
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="607.97211"
+   height="193.20036"
+   viewBox="0 0 607.97212 193.20036"
+   sodipodi:docname="logo_with_text_monochrome.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath20"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path18"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="640"
+     inkscape:window-height="480"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.21542963"
+     inkscape:cx="296.21626"
+     inkscape:cy="101.98009"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-614.45037,638.9628)"><g
+       id="g14"><g
+         id="g16"
+         clip-path="url(#clipPath20)"><g
+           id="g22"
+           transform="translate(668.8995,400.2876)"><path
+             d="m 0,0 c -0.698,-5.234 -4.362,-8.375 -10.991,-9.421 -9.072,0.349 -13.783,5.407 -14.132,15.178 0.698,8.374 5.408,12.719 14.132,13.033 C -4.362,17.776 -0.698,14.341 0,8.479 Z m 0,26.117 c -2.442,2.826 -6.629,4.413 -12.561,4.763 -8.026,0 -14.219,-2.443 -18.581,-7.327 -4.361,-4.886 -6.542,-11.167 -6.542,-18.842 0,-8.026 2.006,-14.395 6.019,-19.105 4.012,-4.71 10.031,-7.066 18.057,-7.066 5.164,0 9.7,1.57 13.608,4.711 v -2.617 c 0,-3.141 -0.986,-6.019 -2.957,-8.636 -1.972,-2.617 -5.749,-3.925 -11.331,-3.925 -5.479,0.349 -10.137,1.919 -13.975,4.71 l -6.804,-7.85 c 5.582,-5.582 13.433,-8.48 23.553,-8.689 l 3.14,0.314 c 6.629,0.279 12.124,2.879 16.487,7.798 4.361,4.92 6.542,11.044 6.542,18.372 V 29.31 L 0,30.88 Z"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path24"
+             inkscape:connector-curvature="0" /></g><g
+           id="g26"
+           transform="translate(720.3033,399.9331)"><path
+             d="M 0,0 -19.986,51.176 H -37.513 L -8.457,-21.105 H 8.891 L 37.875,51.176 H 20.348 Z"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path28"
+             inkscape:connector-curvature="0" /></g><path
+           d="m 762.522,378.828 h 14.655 v 52.392 h -14.655 z"
+           style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+           id="path30"
+           inkscape:connector-curvature="0" /><g
+           id="g32"
+           transform="translate(769.7443,451.1089)"><path
+             d="m 0,0 c -2.373,0 -4.257,-0.707 -5.652,-2.12 -1.397,-1.413 -2.094,-3.166 -2.094,-5.26 0,-2.094 0.697,-3.839 2.094,-5.234 1.395,-1.396 3.315,-2.094 5.757,-2.094 2.442,0 4.361,0.698 5.757,2.094 1.395,1.395 2.094,3.14 2.094,5.234 0,2.094 -0.699,3.847 -2.094,5.26 C 4.466,-0.707 2.512,0 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path34"
+             inkscape:connector-curvature="0" /></g><g
+           id="g36"
+           transform="translate(816.9264,406.8296)"><path
+             d="m 0,0 c -3.315,2.477 -7.572,3.96 -12.771,4.449 -3.524,0.697 -5.879,1.578 -7.065,2.643 -1.187,1.064 -1.187,2.251 0,3.559 1.186,1.309 3.541,1.962 7.065,1.962 3.56,0 6.525,-1.412 8.898,-4.239 l 8.165,9.212 c -2.374,2.338 -4.92,4.1 -7.642,5.286 -2.721,1.186 -6.542,1.78 -11.462,1.78 -4.257,0 -8.4,-1.423 -12.43,-4.265 -4.03,-2.845 -6.046,-6.971 -6.046,-12.379 0,-5.898 1.771,-9.91 5.313,-12.038 3.541,-2.129 7.44,-3.437 11.697,-3.925 4.712,-0.454 7.476,-1.335 8.297,-2.645 0.819,-1.307 0.645,-2.615 -0.523,-3.924 -1.171,-1.309 -3.761,-1.963 -7.774,-1.963 -4.99,0.453 -8.67,2.093 -11.043,4.921 l -8.113,-9.161 c 2.373,-2.373 5.208,-4.265 8.505,-5.678 3.298,-1.413 7.424,-2.12 12.379,-2.12 5.199,0 9.752,1.36 13.66,4.083 3.908,2.721 5.862,6.558 5.862,11.514 C 4.972,-6.787 3.315,-2.479 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path38"
+             inkscape:connector-curvature="0" /></g><g
+           id="g40"
+           transform="translate(862.2008,394.9751)"><path
+             d="m 0,0 c -2.478,-2.67 -5.705,-4.004 -9.683,-4.004 -3.977,0 -7.205,1.334 -9.682,4.004 -2.478,2.669 -3.717,5.992 -3.717,9.97 0,3.978 1.239,7.31 3.717,9.997 2.477,2.686 5.686,4.03 9.63,4.03 4.012,0 7.257,-1.344 9.735,-4.03 C 2.477,17.28 3.716,13.948 3.716,9.97 3.716,5.992 2.477,2.669 0,0 m -9.63,36.716 c -7.608,0 -13.957,-2.548 -19.052,-7.642 -5.095,-5.095 -7.641,-11.445 -7.641,-19.051 0,-7.573 2.546,-13.915 7.641,-19.025 5.095,-5.113 11.444,-7.669 19.052,-7.669 7.606,0 13.956,2.547 19.051,7.642 5.095,5.093 7.641,11.445 7.641,19.052 0,7.606 -2.546,13.956 -7.641,19.051 -5.095,5.094 -11.445,7.642 -19.051,7.642"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path42"
+             inkscape:connector-curvature="0" /></g><g
+           id="g44"
+           transform="translate(911.9489,431.1675)"><path
+             d="m 0,0 c -5.374,0 -9.927,-1.92 -13.66,-5.758 v 5.81 L -28.315,-1.519 V -52.34 h 14.655 v 32.19 c 2.372,4.92 5.338,7.501 8.898,7.746 3.175,-0.105 5.617,-0.925 7.327,-2.46 L 4.868,-0.419 C 3.332,-0.14 1.709,0 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path46"
+             inkscape:connector-curvature="0" /></g><g
+           id="g48"
+           transform="translate(548.2423,363.2485)"><path
+             d="m 0,0 c 16.655,21.121 22.696,44.433 18.328,70.995 3.068,0 5.743,-0.023 8.417,0.007 2.222,0.025 4.443,0.102 6.664,0.175 4.79,0.154 4.818,0.165 5.88,-4.582 3.145,-14.051 2.18,-28.09 -0.179,-42.118 -0.25,-1.492 -0.7,-2.956 -0.864,-4.454 C 37.05,9.081 30.089,3.645 20.165,1.097 13.787,-0.54 7.323,-0.829 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path50"
+             inkscape:connector-curvature="0" /></g><g
+           id="g52"
+           transform="translate(544.6891,396.1763)"><path
+             d="M 0,0 C 0,2.594 -3.457,4.322 -3.457,4.322 -0.864,5.187 0,8.644 0,8.644 0,8.644 0.865,5.187 3.458,4.322 3.458,4.322 0,2.594 0,0 m -17.099,6.454 c 0,6.742 -8.989,11.236 -8.989,11.236 6.742,2.248 8.989,11.238 8.989,11.238 0,0 2.247,-8.99 8.99,-11.238 0,0 -8.99,-4.494 -8.99,-11.236"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path54"
+             inkscape:connector-curvature="0" /></g><g
+           id="g56"
+           transform="translate(485.0861,429.7925)"><path
+             d="m 0,0 c 0,0 -2.214,3.359 -9.736,2.059 0,0 5.987,28.738 36.298,38.806 C 26.562,40.865 -2.82,24.275 0,0 M 0.583,-33.208 0.58,-33.211 c -1.297,-2.026 -2.821,-3.579 -4.53,-4.616 -1.515,-0.933 -3.178,-1.425 -4.743,-1.425 -0.528,0 -1.044,0.057 -1.539,0.17 l -2.209,0.507 2.184,0.603 c 1.63,0.451 3.063,1.347 4.259,2.664 1.014,1.108 1.856,2.485 2.577,4.213 1.175,2.82 1.784,6.162 1.81,9.936 -0.049,3.718 -0.683,7.054 -1.886,9.902 -0.703,1.654 -1.585,3.056 -2.621,4.163 -1.227,1.311 -2.671,2.178 -4.294,2.576 l -2.187,0.538 2.179,0.572 c 0.48,0.126 0.998,0.196 1.539,0.209 h 0.031 l 1.688,-0.153 c 1.045,-0.206 2.104,-0.616 3.061,-1.185 1.755,-1.031 3.302,-2.567 4.598,-4.565 2.155,-3.374 3.315,-7.536 3.357,-12.042 -0.028,-4.548 -1.159,-8.717 -3.271,-12.064 m 101.949,51.176 c 0,0 -0.075,0.05 -0.226,0.136 -0.541,0.327 -1.113,0.603 -1.715,0.815 -3.044,1.241 -9.881,3.186 -21.906,2.623 -0.029,0 -0.056,0 -0.085,-0.001 C 52.916,21.123 30.022,10.922 30.022,10.922 c 0,0 1.439,1.761 3.453,3.692 10e-4,0 10e-4,10e-4 0.002,10e-4 1.052,0.974 2.355,2.076 3.912,3.227 0.046,0.031 0.088,0.063 0.124,0.093 8.708,6.384 25.34,14.163 51.625,9.541 -0.989,1.124 -2.002,2.192 -3.036,3.215 -1.112,0.883 -2.231,1.693 -3.354,2.456 0.02,-0.012 0.039,-0.023 0.059,-0.036 0,0 -17.016,19.415 -48.683,15.891 C 30.19,48.622 25.983,47.867 21.66,46.564 21.653,46.563 21.646,46.562 21.64,46.56 L 21.638,46.558 C 11.48,43.492 0.683,37.387 -8.719,25.911 -9.29,25.193 -9.841,24.479 -10.376,23.77 c -0.065,-0.099 -0.141,-0.202 -0.226,-0.307 -1.182,-1.581 -2.271,-3.14 -3.279,-4.674 -3.266,-5.427 -5.631,-11.665 -6.311,-13.545 -10.58,-32.401 2.586,-57.55 5.144,-61.967 8.93,-15.158 24.565,-32.355 50.771,-37.327 0.197,-0.047 0.382,-0.101 0.582,-0.147 1.723,-0.367 4.864,-0.929 8.908,-1.196 1.524,-0.069 3.088,-0.094 4.699,-0.067 1.548,-0.009 2.999,0.017 4.335,0.064 0.396,0.028 0.74,0.041 1.044,0.044 5.102,0.238 8.272,0.775 8.272,0.775 -26.543,1.299 -39.847,13.409 -45.691,21.142 -1.325,1.648 -2.46,3.42 -3.377,5.316 -0.361,0.71 -0.523,1.115 -0.523,1.115 8.459,-7.181 20.294,-13.362 20.294,-13.362 10.611,-4.993 21.737,-7.451 33.524,-5.837 0,0 24.645,2.263 34.464,25.09 -0.423,0.322 -0.366,0.278 -0.79,0.6 -0.652,-0.526 -1.294,-1.023 -1.926,-1.496 -0.991,-0.651 -1.964,-1.357 -2.937,-2.07 -5.265,-3.485 -9.561,-5.128 -12.12,-5.879 -19.359,-4.887 -37.273,-1.252 -52.93,12.455 -7.253,6.349 -11.754,14.352 -11.944,24.291 -0.124,6.465 -0.19,12.935 -0.136,19.4 0.085,10.181 7.246,17.921 17.394,19.284 20.561,2.759 41.234,3.71 61.947,4.193 3.174,0.073 5.981,0.037 7.458,-3.356 0.025,-0.058 0.363,0.02 0.552,0.035 1.4,4.461 -0.617,9.27 -4.29,11.624"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path58"
+             inkscape:connector-curvature="0" /></g></g></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_with_text_on_dark-1024.png b/website/assets/logos/logo_with_text_on_dark-1024.png
new file mode 100644
index 000000000..a02a9014b
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark-1024.png
diff --git a/website/assets/logos/logo_with_text_on_dark-128.png b/website/assets/logos/logo_with_text_on_dark-128.png
new file mode 100644
index 000000000..efae725b8
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark-128.png
diff --git a/website/assets/logos/logo_with_text_on_dark-16.png b/website/assets/logos/logo_with_text_on_dark-16.png
new file mode 100644
index 000000000..a6069f98f
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark-16.png
diff --git a/website/assets/logos/logo_with_text_on_dark.png b/website/assets/logos/logo_with_text_on_dark.png
new file mode 100644
index 000000000..24de18c11
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark.png
diff --git a/website/assets/logos/logo_with_text_on_dark.svg b/website/assets/logos/logo_with_text_on_dark.svg
new file mode 100644
index 000000000..52d8e52da
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark.svg
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="607.97211"
+   height="193.19984"
+   viewBox="0 0 607.97212 193.19985"
+   sodipodi:docname="logo_with_text_on_dark.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath20"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path18"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="640"
+     inkscape:window-height="480"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.21542963"
+     inkscape:cx="296.21626"
+     inkscape:cy="101.97992"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-614.45037,638.96254)"><g
+       id="g14"><g
+         id="g16"
+         clip-path="url(#clipPath20)"><g
+           id="g22"
+           transform="translate(668.8995,400.2874)"><path
+             d="m 0,0 c -0.698,-5.233 -4.362,-8.375 -10.991,-9.421 -9.072,0.349 -13.783,5.408 -14.132,15.178 0.698,8.375 5.408,12.719 14.132,13.033 C -4.362,17.777 -0.698,14.341 0,8.479 Z m 0,26.117 c -2.442,2.826 -6.629,4.414 -12.561,4.763 -8.026,0 -14.219,-2.443 -18.581,-7.327 -4.361,-4.886 -6.542,-11.166 -6.542,-18.842 0,-8.026 2.006,-14.394 6.019,-19.105 4.012,-4.71 10.031,-7.065 18.057,-7.065 5.164,0 9.7,1.569 13.608,4.71 v -2.616 c 0,-3.141 -0.986,-6.02 -2.957,-8.636 -1.972,-2.618 -5.749,-3.926 -11.331,-3.926 -5.479,0.349 -10.137,1.919 -13.975,4.71 l -6.804,-7.85 c 5.582,-5.582 13.433,-8.48 23.553,-8.689 l 3.14,0.314 c 6.629,0.28 12.124,2.879 16.487,7.798 4.361,4.92 6.542,11.044 6.542,18.373 V 29.31 L 0,30.88 Z"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path24"
+             inkscape:connector-curvature="0" /></g><g
+           id="g26"
+           transform="translate(720.3033,399.9339)"><path
+             d="m 0,0 -19.986,51.175 h -17.527 l 29.056,-72.28 H 8.891 l 28.984,72.28 H 20.348 Z"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path28"
+             inkscape:connector-curvature="0" /></g><path
+           d="m 762.522,378.828 h 14.655 v 52.392 h -14.655 z"
+           style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+           id="path30"
+           inkscape:connector-curvature="0" /><g
+           id="g32"
+           transform="translate(769.7443,451.1087)"><path
+             d="m 0,0 c -2.373,0 -4.257,-0.706 -5.652,-2.12 -1.397,-1.413 -2.094,-3.166 -2.094,-5.26 0,-2.094 0.697,-3.839 2.094,-5.233 1.395,-1.397 3.315,-2.094 5.757,-2.094 2.442,0 4.361,0.697 5.757,2.094 1.395,1.394 2.094,3.139 2.094,5.233 0,2.094 -0.699,3.847 -2.094,5.26 C 4.466,-0.706 2.512,0 0,0"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path34"
+             inkscape:connector-curvature="0" /></g><g
+           id="g36"
+           transform="translate(816.9264,406.8294)"><path
+             d="m 0,0 c -3.315,2.478 -7.572,3.96 -12.771,4.449 -3.524,0.697 -5.879,1.578 -7.065,2.644 -1.187,1.063 -1.187,2.25 0,3.558 1.186,1.309 3.541,1.963 7.065,1.963 3.56,0 6.525,-1.413 8.898,-4.239 l 8.165,9.211 c -2.374,2.338 -4.92,4.101 -7.642,5.286 -2.721,1.187 -6.542,1.78 -11.462,1.78 -4.257,0 -8.4,-1.422 -12.43,-4.265 -4.03,-2.845 -6.046,-6.971 -6.046,-12.379 0,-5.897 1.771,-9.91 5.313,-12.038 3.541,-2.128 7.44,-3.437 11.697,-3.925 4.712,-0.454 7.476,-1.335 8.297,-2.644 0.819,-1.308 0.645,-2.616 -0.523,-3.924 -1.171,-1.309 -3.761,-1.963 -7.774,-1.963 -4.99,0.453 -8.67,2.092 -11.043,4.92 l -8.113,-9.16 c 2.373,-2.374 5.208,-4.266 8.505,-5.678 3.298,-1.414 7.424,-2.121 12.379,-2.121 5.199,0 9.752,1.361 13.66,4.083 3.908,2.722 5.862,6.559 5.862,11.515 C 4.972,-6.787 3.315,-2.478 0,0"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path38"
+             inkscape:connector-curvature="0" /></g><g
+           id="g40"
+           transform="translate(862.2008,394.9749)"><path
+             d="m 0,0 c -2.478,-2.669 -5.705,-4.004 -9.683,-4.004 -3.977,0 -7.205,1.335 -9.682,4.004 -2.478,2.67 -3.717,5.992 -3.717,9.97 0,3.978 1.239,7.31 3.717,9.998 2.477,2.685 5.686,4.029 9.63,4.029 4.012,0 7.257,-1.344 9.735,-4.029 C 2.477,17.28 3.716,13.948 3.716,9.97 3.716,5.992 2.477,2.67 0,0 m -9.63,36.716 c -7.608,0 -13.957,-2.547 -19.052,-7.642 -5.095,-5.095 -7.641,-11.445 -7.641,-19.051 0,-7.572 2.546,-13.914 7.641,-19.025 5.095,-5.112 11.444,-7.669 19.052,-7.669 7.606,0 13.956,2.548 19.051,7.642 5.095,5.094 7.641,11.445 7.641,19.052 0,7.606 -2.546,13.956 -7.641,19.051 -5.095,5.095 -11.445,7.642 -19.051,7.642"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path42"
+             inkscape:connector-curvature="0" /></g><g
+           id="g44"
+           transform="translate(911.9489,431.1673)"><path
+             d="m 0,0 c -5.374,0 -9.927,-1.919 -13.66,-5.757 v 5.81 l -14.655,-1.571 v -50.821 h 14.655 v 32.189 c 2.372,4.92 5.338,7.502 8.898,7.746 3.175,-0.105 5.617,-0.925 7.327,-2.46 L 4.868,-0.419 C 3.332,-0.14 1.709,0 0,0"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path46"
+             inkscape:connector-curvature="0" /></g><g
+           id="g48"
+           transform="translate(548.2423,363.2484)"><path
+             d="m 0,0 c 16.655,21.121 22.696,44.434 18.328,70.995 3.068,0 5.743,-0.023 8.417,0.007 2.222,0.025 4.443,0.102 6.664,0.175 4.79,0.154 4.818,0.165 5.88,-4.582 3.145,-14.05 2.18,-28.089 -0.179,-42.118 -0.25,-1.492 -0.7,-2.956 -0.864,-4.454 C 37.05,9.081 30.089,3.645 20.165,1.097 13.787,-0.54 7.323,-0.829 0,0"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path50"
+             inkscape:connector-curvature="0" /></g><g
+           id="g52"
+           transform="translate(544.6891,396.1771)"><path
+             d="M 0,0 C 0,2.593 -3.457,4.321 -3.457,4.321 -0.864,5.186 0,8.644 0,8.644 0,8.644 0.865,5.186 3.458,4.321 3.458,4.321 0,2.593 0,0 m -17.099,6.453 c 0,6.742 -8.989,11.237 -8.989,11.237 6.742,2.247 8.989,11.237 8.989,11.237 0,0 2.247,-8.99 8.99,-11.237 0,0 -8.99,-4.495 -8.99,-11.237"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path54"
+             inkscape:connector-curvature="0" /></g><g
+           id="g56"
+           transform="translate(485.0861,429.7923)"><path
+             d="m 0,0 c 0,0 -2.214,3.359 -9.736,2.059 0,0 5.987,28.738 36.298,38.806 C 26.562,40.865 -2.82,24.275 0,0 M 0.583,-33.208 0.58,-33.211 c -1.297,-2.026 -2.821,-3.578 -4.53,-4.615 -1.515,-0.934 -3.178,-1.425 -4.743,-1.425 -0.528,0 -1.044,0.056 -1.539,0.169 l -2.209,0.507 2.184,0.603 c 1.63,0.451 3.063,1.347 4.259,2.664 1.014,1.108 1.856,2.485 2.577,4.214 1.175,2.819 1.784,6.161 1.81,9.935 -0.049,3.719 -0.683,7.054 -1.886,9.902 -0.703,1.655 -1.585,3.057 -2.621,4.163 -1.227,1.311 -2.671,2.178 -4.294,2.576 l -2.187,0.538 2.179,0.572 c 0.48,0.127 0.998,0.196 1.539,0.209 l 0.031,10e-4 1.688,-0.154 c 1.045,-0.206 2.104,-0.615 3.061,-1.184 1.755,-1.032 3.302,-2.568 4.598,-4.565 2.155,-3.374 3.315,-7.537 3.357,-12.043 -0.028,-4.548 -1.159,-8.717 -3.271,-12.064 m 101.949,51.176 c 0,0 -0.075,0.051 -0.226,0.137 -0.541,0.326 -1.113,0.602 -1.715,0.814 -3.044,1.241 -9.881,3.187 -21.906,2.623 -0.029,0 -0.056,0 -0.085,-0.001 C 52.916,21.123 30.022,10.923 30.022,10.923 c 0,0 1.439,1.76 3.453,3.691 10e-4,10e-4 10e-4,10e-4 0.002,0.002 1.052,0.973 2.355,2.076 3.912,3.226 0.046,0.032 0.088,0.063 0.124,0.094 8.708,6.383 25.34,14.162 51.625,9.54 -0.989,1.124 -2.002,2.193 -3.036,3.215 -1.112,0.884 -2.231,1.694 -3.354,2.456 0.02,-0.012 0.039,-0.023 0.059,-0.036 0,0 -17.016,19.415 -48.683,15.891 C 30.19,48.622 25.983,47.868 21.66,46.564 21.653,46.563 21.646,46.562 21.64,46.56 L 21.638,46.558 C 11.48,43.492 0.683,37.387 -8.719,25.911 c -0.571,-0.718 -1.122,-1.431 -1.657,-2.14 -0.065,-0.1 -0.141,-0.202 -0.226,-0.307 -1.182,-1.582 -2.271,-3.141 -3.279,-4.674 -3.266,-5.427 -5.631,-11.666 -6.311,-13.546 -10.58,-32.401 2.586,-57.549 5.144,-61.967 8.93,-15.157 24.565,-32.355 50.771,-37.327 0.197,-0.046 0.382,-0.101 0.582,-0.146 1.723,-0.367 4.864,-0.929 8.908,-1.197 1.524,-0.069 3.088,-0.094 4.699,-0.066 1.548,-0.01 2.999,0.017 4.335,0.064 0.396,0.027 0.74,0.041 1.044,0.044 5.102,0.237 8.272,0.774 8.272,0.774 -26.543,1.3 -39.847,13.41 -45.691,21.142 -1.325,1.648 -2.46,3.421 -3.377,5.316 -0.361,0.711 -0.523,1.115 -0.523,1.115 8.459,-7.18 20.294,-13.361 20.294,-13.361 10.611,-4.993 21.737,-7.452 33.524,-5.838 0,0 24.645,2.264 34.464,25.09 -0.423,0.322 -0.366,0.278 -0.79,0.6 -0.652,-0.526 -1.294,-1.022 -1.926,-1.496 -0.991,-0.65 -1.964,-1.357 -2.937,-2.07 -5.265,-3.485 -9.561,-5.127 -12.12,-5.879 -19.359,-4.887 -37.273,-1.252 -52.93,12.455 -7.253,6.349 -11.754,14.353 -11.944,24.291 -0.124,6.466 -0.19,12.935 -0.136,19.4 0.085,10.181 7.246,17.921 17.394,19.284 20.561,2.759 41.234,3.71 61.947,4.193 3.174,0.074 5.981,0.037 7.458,-3.356 0.025,-0.058 0.363,0.02 0.552,0.035 1.4,4.462 -0.617,9.27 -4.29,11.624"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path58"
+             inkscape:connector-curvature="0" /></g></g></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_with_text_on_dark_full-1024.png b/website/assets/logos/logo_with_text_on_dark_full-1024.png
new file mode 100644
index 000000000..eb2e63981
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark_full-1024.png
diff --git a/website/assets/logos/logo_with_text_on_dark_full-128.png b/website/assets/logos/logo_with_text_on_dark_full-128.png
new file mode 100644
index 000000000..4ed21e5cb
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark_full-128.png
diff --git a/website/assets/logos/logo_with_text_on_dark_full-16.png b/website/assets/logos/logo_with_text_on_dark_full-16.png
new file mode 100644
index 000000000..d3968da5e
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark_full-16.png
diff --git a/website/assets/logos/logo_with_text_on_dark_full.png b/website/assets/logos/logo_with_text_on_dark_full.png
new file mode 100644
index 000000000..21feea356
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark_full.png
diff --git a/website/assets/logos/logo_with_text_on_dark_full.svg b/website/assets/logos/logo_with_text_on_dark_full.svg
new file mode 100644
index 000000000..017e72414
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_dark_full.svg
@@ -0,0 +1,120 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="786.19244"
+   height="334.21716"
+   viewBox="0 0 786.19246 334.21718"
+   sodipodi:docname="logo_with_text_on_dark_full.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath20"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path18"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="567"
+     inkscape:window-height="462"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.3046635"
+     inkscape:cx="541.19762"
+     inkscape:cy="67.525134"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-524.53324,714.8519)"><path
+       d="M 393.39994,285.47606 H 983.04431 V 536.13894 H 393.39994 Z"
+       style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.36289462"
+       id="path12"
+       inkscape:connector-curvature="0" /><g
+       id="g14"><g
+         id="g16"
+         clip-path="url(#clipPath20)"><g
+           id="g22"
+           transform="translate(668.8995,400.2874)"><path
+             d="m 0,0 c -0.698,-5.233 -4.362,-8.375 -10.991,-9.421 -9.072,0.349 -13.783,5.408 -14.132,15.178 0.698,8.375 5.408,12.719 14.132,13.033 C -4.362,17.777 -0.698,14.341 0,8.479 Z m 0,26.117 c -2.442,2.826 -6.629,4.414 -12.561,4.763 -8.026,0 -14.219,-2.443 -18.581,-7.327 -4.361,-4.886 -6.542,-11.166 -6.542,-18.842 0,-8.026 2.006,-14.394 6.019,-19.105 4.012,-4.71 10.031,-7.065 18.057,-7.065 5.164,0 9.7,1.569 13.608,4.71 v -2.616 c 0,-3.141 -0.986,-6.02 -2.957,-8.636 -1.972,-2.618 -5.749,-3.926 -11.331,-3.926 -5.479,0.349 -10.137,1.919 -13.975,4.71 l -6.804,-7.85 c 5.582,-5.582 13.433,-8.48 23.553,-8.689 l 3.14,0.314 c 6.629,0.28 12.124,2.879 16.487,7.798 4.361,4.92 6.542,11.044 6.542,18.373 V 29.31 L 0,30.88 Z"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path24"
+             inkscape:connector-curvature="0" /></g><g
+           id="g26"
+           transform="translate(720.3033,399.9339)"><path
+             d="m 0,0 -19.986,51.175 h -17.527 l 29.056,-72.28 H 8.891 l 28.984,72.28 H 20.348 Z"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path28"
+             inkscape:connector-curvature="0" /></g><path
+           d="m 762.522,378.828 h 14.655 v 52.392 h -14.655 z"
+           style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+           id="path30"
+           inkscape:connector-curvature="0" /><g
+           id="g32"
+           transform="translate(769.7443,451.1087)"><path
+             d="m 0,0 c -2.373,0 -4.257,-0.706 -5.652,-2.12 -1.397,-1.413 -2.094,-3.166 -2.094,-5.26 0,-2.094 0.697,-3.839 2.094,-5.233 1.395,-1.397 3.315,-2.094 5.757,-2.094 2.442,0 4.361,0.697 5.757,2.094 1.395,1.394 2.094,3.139 2.094,5.233 0,2.094 -0.699,3.847 -2.094,5.26 C 4.466,-0.706 2.512,0 0,0"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path34"
+             inkscape:connector-curvature="0" /></g><g
+           id="g36"
+           transform="translate(816.9264,406.8294)"><path
+             d="m 0,0 c -3.315,2.478 -7.572,3.96 -12.771,4.449 -3.524,0.697 -5.879,1.578 -7.065,2.644 -1.187,1.063 -1.187,2.25 0,3.558 1.186,1.309 3.541,1.963 7.065,1.963 3.56,0 6.525,-1.413 8.898,-4.239 l 8.165,9.211 c -2.374,2.338 -4.92,4.101 -7.642,5.286 -2.721,1.187 -6.542,1.78 -11.462,1.78 -4.257,0 -8.4,-1.422 -12.43,-4.265 -4.03,-2.845 -6.046,-6.971 -6.046,-12.379 0,-5.897 1.771,-9.91 5.313,-12.038 3.541,-2.128 7.44,-3.437 11.697,-3.925 4.712,-0.454 7.476,-1.335 8.297,-2.644 0.819,-1.308 0.645,-2.616 -0.523,-3.924 -1.171,-1.309 -3.761,-1.963 -7.774,-1.963 -4.99,0.453 -8.67,2.092 -11.043,4.92 l -8.113,-9.16 c 2.373,-2.374 5.208,-4.266 8.505,-5.678 3.298,-1.414 7.424,-2.121 12.379,-2.121 5.199,0 9.752,1.361 13.66,4.083 3.908,2.722 5.862,6.559 5.862,11.515 C 4.972,-6.787 3.315,-2.478 0,0"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path38"
+             inkscape:connector-curvature="0" /></g><g
+           id="g40"
+           transform="translate(862.2008,394.9749)"><path
+             d="m 0,0 c -2.478,-2.669 -5.705,-4.004 -9.683,-4.004 -3.977,0 -7.205,1.335 -9.682,4.004 -2.478,2.67 -3.717,5.992 -3.717,9.97 0,3.978 1.239,7.31 3.717,9.998 2.477,2.685 5.686,4.029 9.63,4.029 4.012,0 7.257,-1.344 9.735,-4.029 C 2.477,17.28 3.716,13.948 3.716,9.97 3.716,5.992 2.477,2.67 0,0 m -9.63,36.716 c -7.608,0 -13.957,-2.547 -19.052,-7.642 -5.095,-5.095 -7.641,-11.445 -7.641,-19.051 0,-7.572 2.546,-13.914 7.641,-19.025 5.095,-5.112 11.444,-7.669 19.052,-7.669 7.606,0 13.956,2.548 19.051,7.642 5.095,5.094 7.641,11.445 7.641,19.052 0,7.606 -2.546,13.956 -7.641,19.051 -5.095,5.095 -11.445,7.642 -19.051,7.642"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path42"
+             inkscape:connector-curvature="0" /></g><g
+           id="g44"
+           transform="translate(911.9489,431.1673)"><path
+             d="m 0,0 c -5.374,0 -9.927,-1.919 -13.66,-5.757 v 5.81 l -14.655,-1.571 v -50.821 h 14.655 v 32.189 c 2.372,4.92 5.338,7.502 8.898,7.746 3.175,-0.105 5.617,-0.925 7.327,-2.46 L 4.868,-0.419 C 3.332,-0.14 1.709,0 0,0"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path46"
+             inkscape:connector-curvature="0" /></g><g
+           id="g48"
+           transform="translate(548.2423,363.2484)"><path
+             d="m 0,0 c 16.655,21.121 22.696,44.434 18.328,70.995 3.068,0 5.743,-0.023 8.417,0.007 2.222,0.025 4.443,0.102 6.664,0.175 4.79,0.154 4.818,0.165 5.88,-4.582 3.145,-14.05 2.18,-28.089 -0.179,-42.118 -0.25,-1.492 -0.7,-2.956 -0.864,-4.454 C 37.05,9.081 30.089,3.645 20.165,1.097 13.787,-0.54 7.323,-0.829 0,0"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path50"
+             inkscape:connector-curvature="0" /></g><g
+           id="g52"
+           transform="translate(544.6891,396.1771)"><path
+             d="M 0,0 C 0,2.593 -3.457,4.321 -3.457,4.321 -0.864,5.186 0,8.644 0,8.644 0,8.644 0.865,5.186 3.458,4.321 3.458,4.321 0,2.593 0,0 m -17.099,6.453 c 0,6.742 -8.989,11.237 -8.989,11.237 6.742,2.247 8.989,11.237 8.989,11.237 0,0 2.247,-8.99 8.99,-11.237 0,0 -8.99,-4.495 -8.99,-11.237"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path54"
+             inkscape:connector-curvature="0" /></g><g
+           id="g56"
+           transform="translate(485.0861,429.7923)"><path
+             d="m 0,0 c 0,0 -2.214,3.359 -9.736,2.059 0,0 5.987,28.738 36.298,38.806 C 26.562,40.865 -2.82,24.275 0,0 M 0.583,-33.208 0.58,-33.211 c -1.297,-2.026 -2.821,-3.578 -4.53,-4.615 -1.515,-0.934 -3.178,-1.425 -4.743,-1.425 -0.528,0 -1.044,0.056 -1.539,0.169 l -2.209,0.507 2.184,0.603 c 1.63,0.451 3.063,1.347 4.259,2.664 1.014,1.108 1.856,2.485 2.577,4.214 1.175,2.819 1.784,6.161 1.81,9.935 -0.049,3.719 -0.683,7.054 -1.886,9.902 -0.703,1.655 -1.585,3.057 -2.621,4.163 -1.227,1.311 -2.671,2.178 -4.294,2.576 l -2.187,0.538 2.179,0.572 c 0.48,0.127 0.998,0.196 1.539,0.209 l 0.031,10e-4 1.688,-0.154 c 1.045,-0.206 2.104,-0.615 3.061,-1.184 1.755,-1.032 3.302,-2.568 4.598,-4.565 2.155,-3.374 3.315,-7.537 3.357,-12.043 -0.028,-4.548 -1.159,-8.717 -3.271,-12.064 m 101.949,51.176 c 0,0 -0.075,0.051 -0.226,0.137 -0.541,0.326 -1.113,0.602 -1.715,0.814 -3.044,1.241 -9.881,3.187 -21.906,2.623 -0.029,0 -0.056,0 -0.085,-0.001 C 52.916,21.123 30.022,10.923 30.022,10.923 c 0,0 1.439,1.76 3.453,3.691 10e-4,10e-4 10e-4,10e-4 0.002,0.002 1.052,0.973 2.355,2.076 3.912,3.226 0.046,0.032 0.088,0.063 0.124,0.094 8.708,6.383 25.34,14.162 51.625,9.54 -0.989,1.124 -2.002,2.193 -3.036,3.215 -1.112,0.884 -2.231,1.694 -3.354,2.456 0.02,-0.012 0.039,-0.023 0.059,-0.036 0,0 -17.016,19.415 -48.683,15.891 C 30.19,48.622 25.983,47.868 21.66,46.564 21.653,46.563 21.646,46.562 21.64,46.56 L 21.638,46.558 C 11.48,43.492 0.683,37.387 -8.719,25.911 c -0.571,-0.718 -1.122,-1.431 -1.657,-2.14 -0.065,-0.1 -0.141,-0.202 -0.226,-0.307 -1.182,-1.582 -2.271,-3.141 -3.279,-4.674 -3.266,-5.427 -5.631,-11.666 -6.311,-13.546 -10.58,-32.401 2.586,-57.549 5.144,-61.967 8.93,-15.157 24.565,-32.355 50.771,-37.327 0.197,-0.046 0.382,-0.101 0.582,-0.146 1.723,-0.367 4.864,-0.929 8.908,-1.197 1.524,-0.069 3.088,-0.094 4.699,-0.066 1.548,-0.01 2.999,0.017 4.335,0.064 0.396,0.027 0.74,0.041 1.044,0.044 5.102,0.237 8.272,0.774 8.272,0.774 -26.543,1.3 -39.847,13.41 -45.691,21.142 -1.325,1.648 -2.46,3.421 -3.377,5.316 -0.361,0.711 -0.523,1.115 -0.523,1.115 8.459,-7.18 20.294,-13.361 20.294,-13.361 10.611,-4.993 21.737,-7.452 33.524,-5.838 0,0 24.645,2.264 34.464,25.09 -0.423,0.322 -0.366,0.278 -0.79,0.6 -0.652,-0.526 -1.294,-1.022 -1.926,-1.496 -0.991,-0.65 -1.964,-1.357 -2.937,-2.07 -5.265,-3.485 -9.561,-5.127 -12.12,-5.879 -19.359,-4.887 -37.273,-1.252 -52.93,12.455 -7.253,6.349 -11.754,14.353 -11.944,24.291 -0.124,6.466 -0.19,12.935 -0.136,19.4 0.085,10.181 7.246,17.921 17.394,19.284 20.561,2.759 41.234,3.71 61.947,4.193 3.174,0.074 5.981,0.037 7.458,-3.356 0.025,-0.058 0.363,0.02 0.552,0.035 1.4,4.462 -0.617,9.27 -4.29,11.624"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path58"
+             inkscape:connector-curvature="0" /></g></g></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_with_text_on_white.png b/website/assets/logos/logo_with_text_on_white.png
new file mode 100644
index 000000000..bf420a057
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_white.png
diff --git a/website/assets/logos/logo_with_text_on_white.svg b/website/assets/logos/logo_with_text_on_white.svg
new file mode 100644
index 000000000..4275efe83
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_white.svg
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="607.97211"
+   height="193.20036"
+   viewBox="0 0 607.97212 193.20037"
+   sodipodi:docname="logo_with_text_on_white.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath18"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path16"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="640"
+     inkscape:window-height="480"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.43085925"
+     inkscape:cx="325.455"
+     inkscape:cy="50.910097"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-614.45037,638.9628)"><g
+       id="g12"><g
+         id="g14"
+         clip-path="url(#clipPath18)"><g
+           id="g20"
+           transform="translate(668.8995,400.2876)"><path
+             d="m 0,0 c -0.698,-5.234 -4.362,-8.375 -10.991,-9.421 -9.072,0.349 -13.783,5.407 -14.132,15.178 0.698,8.374 5.408,12.719 14.132,13.033 C -4.362,17.776 -0.698,14.341 0,8.479 Z m 0,26.117 c -2.442,2.826 -6.629,4.413 -12.561,4.763 -8.026,0 -14.219,-2.443 -18.581,-7.327 -4.361,-4.886 -6.542,-11.167 -6.542,-18.842 0,-8.026 2.006,-14.395 6.019,-19.105 4.012,-4.71 10.031,-7.066 18.057,-7.066 5.164,0 9.7,1.57 13.608,4.711 v -2.617 c 0,-3.141 -0.986,-6.019 -2.957,-8.636 -1.972,-2.617 -5.749,-3.925 -11.331,-3.925 -5.479,0.349 -10.137,1.919 -13.975,4.71 l -6.804,-7.85 c 5.582,-5.582 13.433,-8.48 23.553,-8.689 l 3.14,0.314 c 6.629,0.279 12.124,2.879 16.487,7.798 4.361,4.92 6.542,11.044 6.542,18.372 V 29.31 L 0,30.88 Z"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path22"
+             inkscape:connector-curvature="0" /></g><g
+           id="g24"
+           transform="translate(720.3033,399.9331)"><path
+             d="M 0,0 -19.986,51.176 H -37.513 L -8.457,-21.105 H 8.891 L 37.875,51.176 H 20.348 Z"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path26"
+             inkscape:connector-curvature="0" /></g><path
+           d="m 762.522,378.828 h 14.655 v 52.392 h -14.655 z"
+           style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+           id="path28"
+           inkscape:connector-curvature="0" /><g
+           id="g30"
+           transform="translate(769.7443,451.1089)"><path
+             d="m 0,0 c -2.373,0 -4.257,-0.707 -5.652,-2.12 -1.397,-1.413 -2.094,-3.166 -2.094,-5.26 0,-2.094 0.697,-3.839 2.094,-5.234 1.395,-1.396 3.315,-2.094 5.757,-2.094 2.442,0 4.361,0.698 5.757,2.094 1.395,1.395 2.094,3.14 2.094,5.234 0,2.094 -0.699,3.847 -2.094,5.26 C 4.466,-0.707 2.512,0 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path32"
+             inkscape:connector-curvature="0" /></g><g
+           id="g34"
+           transform="translate(816.9264,406.8296)"><path
+             d="m 0,0 c -3.315,2.477 -7.572,3.96 -12.771,4.449 -3.524,0.697 -5.879,1.578 -7.065,2.643 -1.187,1.064 -1.187,2.251 0,3.559 1.186,1.309 3.541,1.962 7.065,1.962 3.56,0 6.525,-1.412 8.898,-4.239 l 8.165,9.212 c -2.374,2.338 -4.92,4.1 -7.642,5.286 -2.721,1.186 -6.542,1.78 -11.462,1.78 -4.257,0 -8.4,-1.423 -12.43,-4.265 -4.03,-2.845 -6.046,-6.971 -6.046,-12.379 0,-5.898 1.771,-9.91 5.313,-12.038 3.541,-2.129 7.44,-3.437 11.697,-3.925 4.712,-0.454 7.476,-1.335 8.297,-2.645 0.819,-1.307 0.645,-2.615 -0.523,-3.924 -1.171,-1.309 -3.761,-1.963 -7.774,-1.963 -4.99,0.453 -8.67,2.093 -11.043,4.921 l -8.113,-9.161 c 2.373,-2.373 5.208,-4.265 8.505,-5.678 3.298,-1.413 7.424,-2.12 12.379,-2.12 5.199,0 9.752,1.36 13.66,4.083 3.908,2.721 5.862,6.558 5.862,11.514 C 4.972,-6.787 3.315,-2.479 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path36"
+             inkscape:connector-curvature="0" /></g><g
+           id="g38"
+           transform="translate(862.2008,394.9751)"><path
+             d="m 0,0 c -2.478,-2.67 -5.705,-4.004 -9.683,-4.004 -3.977,0 -7.205,1.334 -9.682,4.004 -2.478,2.669 -3.717,5.992 -3.717,9.97 0,3.978 1.239,7.31 3.717,9.997 2.477,2.686 5.686,4.03 9.63,4.03 4.012,0 7.257,-1.344 9.735,-4.03 C 2.477,17.28 3.716,13.948 3.716,9.97 3.716,5.992 2.477,2.669 0,0 m -9.63,36.716 c -7.608,0 -13.957,-2.548 -19.052,-7.642 -5.095,-5.095 -7.641,-11.445 -7.641,-19.051 0,-7.573 2.546,-13.915 7.641,-19.025 5.095,-5.113 11.444,-7.669 19.052,-7.669 7.606,0 13.956,2.547 19.051,7.642 5.095,5.093 7.641,11.445 7.641,19.052 0,7.606 -2.546,13.956 -7.641,19.051 -5.095,5.094 -11.445,7.642 -19.051,7.642"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path40"
+             inkscape:connector-curvature="0" /></g><g
+           id="g42"
+           transform="translate(911.9489,431.1675)"><path
+             d="m 0,0 c -5.374,0 -9.927,-1.92 -13.66,-5.758 v 5.81 L -28.315,-1.519 V -52.34 h 14.655 v 32.19 c 2.372,4.92 5.338,7.501 8.898,7.746 3.175,-0.105 5.617,-0.925 7.327,-2.46 L 4.868,-0.419 C 3.332,-0.14 1.709,0 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path44"
+             inkscape:connector-curvature="0" /></g><g
+           id="g46"
+           transform="translate(548.2428,363.2485)"><path
+             d="m 0,0 c 16.655,21.121 22.696,44.433 18.328,70.995 3.068,0 5.742,-0.023 8.417,0.007 2.221,0.025 4.442,0.102 6.663,0.175 4.79,0.154 4.818,0.165 5.881,-4.582 C 42.434,52.544 41.469,38.505 39.11,24.477 38.859,22.985 38.409,21.521 38.246,20.023 37.05,9.081 30.089,3.645 20.164,1.097 13.786,-0.54 7.323,-0.829 0,0"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path48"
+             inkscape:connector-curvature="0" /></g><g
+           id="g50"
+           transform="translate(544.6891,396.1763)"><path
+             d="M 0,0 C 0,2.594 -3.457,4.322 -3.457,4.322 -0.864,5.187 0,8.644 0,8.644 0,8.644 0.865,5.187 3.458,4.322 3.458,4.322 0,2.594 0,0 m -17.099,6.454 c 0,6.742 -8.989,11.236 -8.989,11.236 6.742,2.248 8.989,11.238 8.989,11.238 0,0 2.248,-8.99 8.99,-11.238 0,0 -8.99,-4.494 -8.99,-11.236"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path52"
+             inkscape:connector-curvature="0" /></g><g
+           id="g54"
+           transform="translate(485.0861,429.7925)"><path
+             d="m 0,0 c 0,0 -2.214,3.359 -9.736,2.059 0,0 5.987,28.738 36.298,38.806 C 26.562,40.865 -2.82,24.275 0,0 M 0.583,-33.208 0.58,-33.211 c -1.297,-2.026 -2.821,-3.579 -4.53,-4.616 -1.515,-0.933 -3.178,-1.425 -4.743,-1.425 -0.528,0 -1.044,0.057 -1.539,0.17 l -2.209,0.507 2.184,0.603 c 1.63,0.451 3.063,1.347 4.259,2.664 1.014,1.108 1.856,2.485 2.577,4.213 1.175,2.82 1.784,6.162 1.81,9.936 -0.049,3.718 -0.683,7.054 -1.886,9.902 -0.703,1.654 -1.585,3.056 -2.621,4.163 -1.227,1.311 -2.671,2.178 -4.294,2.576 l -2.187,0.538 2.179,0.572 c 0.48,0.126 0.998,0.196 1.539,0.209 h 0.031 l 1.688,-0.153 c 1.045,-0.206 2.104,-0.616 3.061,-1.185 1.755,-1.031 3.302,-2.567 4.598,-4.565 2.155,-3.374 3.315,-7.536 3.357,-12.042 -0.028,-4.548 -1.159,-8.717 -3.271,-12.064 m 101.949,51.176 c 0,0 -0.075,0.05 -0.226,0.136 -0.541,0.327 -1.113,0.603 -1.715,0.815 -3.044,1.241 -9.881,3.186 -21.906,2.623 -0.029,0 -0.056,0 -0.085,-0.001 C 52.916,21.123 30.022,10.922 30.022,10.922 c 0,0 1.439,1.761 3.453,3.692 10e-4,0 10e-4,10e-4 0.002,10e-4 1.052,0.974 2.355,2.076 3.912,3.227 0.046,0.031 0.088,0.063 0.124,0.093 8.708,6.384 25.341,14.163 51.625,9.541 -0.989,1.124 -2.002,2.192 -3.036,3.215 -1.112,0.883 -2.231,1.693 -3.354,2.456 0.02,-0.012 0.039,-0.023 0.059,-0.036 0,0 -17.016,19.415 -48.683,15.891 C 30.19,48.622 25.983,47.867 21.66,46.564 21.653,46.563 21.646,46.562 21.64,46.56 L 21.638,46.558 C 11.48,43.492 0.683,37.387 -8.719,25.911 -9.29,25.193 -9.841,24.479 -10.376,23.77 c -0.065,-0.099 -0.141,-0.202 -0.226,-0.307 -1.182,-1.581 -2.271,-3.14 -3.279,-4.674 -3.266,-5.427 -5.631,-11.665 -6.311,-13.545 -10.58,-32.401 2.586,-57.55 5.144,-61.967 8.93,-15.158 24.565,-32.355 50.771,-37.327 0.197,-0.047 0.382,-0.101 0.582,-0.147 1.723,-0.367 4.864,-0.929 8.908,-1.196 1.524,-0.069 3.088,-0.094 4.699,-0.067 1.548,-0.009 2.999,0.017 4.335,0.064 0.396,0.028 0.74,0.041 1.044,0.044 5.102,0.238 8.272,0.775 8.272,0.775 -26.542,1.299 -39.847,13.409 -45.691,21.142 -1.325,1.648 -2.46,3.42 -3.377,5.316 -0.361,0.71 -0.523,1.115 -0.523,1.115 8.459,-7.181 20.294,-13.362 20.294,-13.362 10.611,-4.993 21.737,-7.451 33.525,-5.837 0,0 24.644,2.263 34.463,25.09 -0.423,0.322 -0.366,0.278 -0.79,0.6 -0.651,-0.526 -1.294,-1.023 -1.926,-1.496 -0.991,-0.651 -1.964,-1.357 -2.937,-2.07 -5.265,-3.485 -9.561,-5.128 -12.12,-5.879 -19.359,-4.887 -37.273,-1.252 -52.93,12.455 -7.253,6.349 -11.754,14.352 -11.944,24.291 -0.124,6.465 -0.19,12.935 -0.136,19.4 0.085,10.181 7.246,17.921 17.394,19.284 20.561,2.759 41.234,3.71 61.948,4.193 3.173,0.073 5.98,0.037 7.457,-3.356 0.025,-0.058 0.363,0.02 0.552,0.035 1.4,4.461 -0.617,9.27 -4.29,11.624"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path56"
+             inkscape:connector-curvature="0" /></g></g></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/logo_with_text_on_white_bordered.png b/website/assets/logos/logo_with_text_on_white_bordered.png
new file mode 100644
index 000000000..bd1a1e4b7
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_white_bordered.png
diff --git a/website/assets/logos/logo_with_text_on_white_bordered.svg b/website/assets/logos/logo_with_text_on_white_bordered.svg
new file mode 100644
index 000000000..08125629d
--- /dev/null
+++ b/website/assets/logos/logo_with_text_on_white_bordered.svg
@@ -0,0 +1,122 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   version="1.1"
+   id="svg2"
+   xml:space="preserve"
+   width="607.64587"
+   height="207.92123"
+   viewBox="0 0 607.64589 207.92124"
+   sodipodi:docname="logo_with_text_on_white_bordered.svg"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"><metadata
+     id="metadata8"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
+     id="defs6"><clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath18"><path
+         d="M 0,821.614 H 1366 V 0 H 0 Z"
+         id="path16"
+         inkscape:connector-curvature="0" /></clipPath></defs><sodipodi:namedview
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1"
+     objecttolerance="10"
+     gridtolerance="10"
+     guidetolerance="10"
+     inkscape:pageopacity="0"
+     inkscape:pageshadow="2"
+     inkscape:window-width="640"
+     inkscape:window-height="480"
+     id="namedview4"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:zoom="0.21542963"
+     inkscape:cx="298.55736"
+     inkscape:cy="108.65533"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="g10" /><g
+     id="g10"
+     inkscape:groupmode="layer"
+     inkscape:label="gvisor_final-logo_20190313"
+     transform="matrix(1.3333333,0,0,-1.3333333,-612.10927,647.00852)"><g
+       id="g12"><g
+         id="g14"
+         clip-path="url(#clipPath18)"><g
+           id="g20"
+           transform="translate(670.7226,400.7367)"><path
+             d="m 0,0 c -0.687,-5.153 -4.294,-8.246 -10.821,-9.275 -8.933,0.342 -13.571,5.324 -13.914,14.943 0.687,8.245 5.325,12.522 13.914,12.832 C -4.294,17.503 -0.687,14.12 0,8.348 Z m 0,25.715 c -2.404,2.782 -6.527,4.345 -12.367,4.688 -7.902,0 -14,-2.404 -18.294,-7.214 -4.295,-4.81 -6.442,-10.994 -6.442,-18.55 0,-7.903 1.976,-14.172 5.927,-18.811 3.949,-4.638 9.876,-6.956 17.778,-6.956 5.085,0 9.55,1.545 13.398,4.638 v -2.577 c 0,-3.092 -0.971,-5.926 -2.911,-8.502 -1.942,-2.577 -5.66,-3.866 -11.156,-3.866 -5.395,0.344 -9.981,1.89 -13.76,4.638 l -6.699,-7.729 c 5.496,-5.496 13.226,-8.349 23.19,-8.555 l 3.091,0.309 c 6.527,0.275 11.938,2.834 16.233,7.678 4.294,4.844 6.441,10.873 6.441,18.088 V 28.857 L 0,30.403 Z"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path22"
+             inkscape:connector-curvature="0" /></g><g
+           id="g24"
+           transform="translate(721.3339,400.388)"><path
+             d="M 0,0 -19.678,50.387 H -36.935 L -8.326,-20.779 H 8.754 L 37.291,50.387 H 20.034 Z"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path26"
+             inkscape:connector-curvature="0" /></g><path
+           d="m 762.901,379.609 h 14.429 v 51.583 h -14.429 z"
+           style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+           id="path28"
+           inkscape:connector-curvature="0" /><g
+           id="g30"
+           transform="translate(770.0126,450.7747)"><path
+             d="m 0,0 c -2.337,0 -4.191,-0.696 -5.565,-2.088 -1.375,-1.391 -2.062,-3.117 -2.062,-5.179 0,-2.061 0.687,-3.779 2.062,-5.153 1.374,-1.374 3.263,-2.061 5.669,-2.061 2.404,0 4.293,0.687 5.667,2.061 1.375,1.374 2.062,3.092 2.062,5.153 0,2.062 -0.687,3.788 -2.062,5.179 C 4.397,-0.696 2.474,0 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path32"
+             inkscape:connector-curvature="0" /></g><g
+           id="g34"
+           transform="translate(816.4667,407.1781)"><path
+             d="m 0,0 c -3.264,2.438 -7.455,3.898 -12.573,4.381 -3.471,0.686 -5.789,1.554 -6.957,2.601 -1.168,1.048 -1.168,2.216 0,3.504 1.168,1.289 3.486,1.933 6.957,1.933 3.504,0 6.424,-1.391 8.76,-4.174 l 8.039,9.069 c -2.336,2.302 -4.844,4.038 -7.524,5.206 -2.68,1.167 -6.441,1.751 -11.285,1.751 -4.192,0 -8.271,-1.4 -12.239,-4.199 -3.968,-2.801 -5.951,-6.863 -5.951,-12.187 0,-5.807 1.742,-9.758 5.229,-11.854 3.486,-2.094 7.326,-3.383 11.518,-3.863 4.638,-0.447 7.36,-1.315 8.168,-2.604 0.807,-1.288 0.635,-2.576 -0.515,-3.864 -1.152,-1.289 -3.703,-1.932 -7.653,-1.932 -4.913,0.446 -8.537,2.06 -10.873,4.843 l -7.988,-9.017 c 2.336,-2.337 5.127,-4.201 8.374,-5.592 3.247,-1.392 7.309,-2.087 12.187,-2.087 5.119,0 9.602,1.339 13.45,4.02 3.848,2.679 5.772,6.458 5.772,11.336 C 4.896,-6.683 3.264,-2.44 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path36"
+             inkscape:connector-curvature="0" /></g><g
+           id="g38"
+           transform="translate(861.0429,395.5062)"><path
+             d="m 0,0 c -2.439,-2.628 -5.616,-3.942 -9.533,-3.942 -3.916,0 -7.095,1.314 -9.533,3.942 -2.44,2.628 -3.66,5.9 -3.66,9.816 0,3.917 1.22,7.198 3.66,9.843 2.438,2.646 5.598,3.968 9.482,3.968 3.949,0 7.145,-1.322 9.584,-3.968 C 2.439,17.014 3.659,13.733 3.659,9.816 3.659,5.9 2.439,2.628 0,0 m -9.481,36.149 c -7.491,0 -13.743,-2.507 -18.758,-7.523 -5.017,-5.017 -7.524,-11.269 -7.524,-18.757 0,-7.456 2.507,-13.7 7.524,-18.732 5.015,-5.034 11.267,-7.55 18.758,-7.55 7.489,0 13.741,2.508 18.757,7.523 5.016,5.016 7.524,11.269 7.524,18.759 0,7.488 -2.508,13.74 -7.524,18.757 -5.016,5.016 -11.268,7.523 -18.757,7.523"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path40"
+             inkscape:connector-curvature="0" /></g><g
+           id="g42"
+           transform="translate(910.0244,431.14)"><path
+             d="m 0,0 c -5.291,0 -9.773,-1.89 -13.449,-5.668 v 5.72 l -14.43,-1.546 v -50.037 h 14.43 v 31.691 c 2.335,4.845 5.255,7.387 8.76,7.627 3.126,-0.102 5.531,-0.91 7.214,-2.422 L 4.792,-0.412 C 3.28,-0.138 1.683,0 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path44"
+             inkscape:connector-curvature="0" /></g><g
+           id="g46"
+           transform="translate(594.4321,453.1439)"><path
+             d="m 0,0 c 0,0 -0.204,0.139 -0.45,0.277 -0.906,0.547 -1.856,1 -2.744,1.321 -1.685,0.679 -3.62,1.262 -5.761,1.738 l -2.063,2.533 c -1.793,2.283 -6.09,7.358 -13.132,13.334 -8.142,6.209 -24.212,15.045 -47.595,12.442 -4.578,-0.444 -9.077,-1.318 -13.368,-2.597 l -0.232,-0.068 c -12.978,-3.918 -24.155,-11.512 -33.24,-22.601 -0.6,-0.754 -1.179,-1.504 -1.783,-2.307 l -0.134,-0.191 -0.062,-0.074 c -1.194,-1.596 -2.36,-3.258 -3.485,-4.969 l -0.125,-0.198 c -3.559,-5.915 -6.126,-12.72 -6.85,-14.73 -11.284,-34.556 2.735,-61.502 5.669,-66.567 8.482,-14.4 23.945,-32.461 50.005,-38.975 1.42,-0.354 2.872,-0.676 4.356,-0.96 0.016,-0.004 0.036,-0.009 0.053,-0.013 6.537,-1.118 16.647,-1.928 29.969,-0.317 3.21,0.621 8.236,2.535 8.646,8.445 2.209,0.842 10.261,3.812 10.261,3.812 8.572,3.874 18.586,11.106 24.334,24.546 1.21,2.83 0.277,6.128 -2.171,7.994 l -0.202,0.152 c 0.639,1.557 1.125,3.209 1.488,4.93 l 0.019,0.009 c 0,0 0.063,0.325 0.164,0.854 0.013,0.073 0.028,0.144 0.041,0.215 0.402,2.114 1.294,6.91 1.719,10.035 0.02,0.149 0.029,0.268 0.033,0.371 2.136,14.686 2.099,26.608 -0.156,37.847 1.443,0.114 2.672,1.095 3.106,2.477 l 0.655,2.086 C 9.188,-12.066 6.243,-4.003 0,0"
+             style="fill:#262262;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path48"
+             inkscape:connector-curvature="0" /></g><g
+           id="g50"
+           transform="translate(551.9267,364.2689)"><path
+             d="m 0,0 c 16.398,20.796 22.346,43.748 18.045,69.9 3.02,0 5.654,-0.022 8.288,0.007 2.187,0.025 4.374,0.101 6.56,0.172 4.716,0.152 4.743,0.163 5.79,-4.512 C 41.779,51.734 40.83,37.911 38.507,24.1 38.26,22.631 37.817,21.189 37.656,19.715 36.479,8.941 29.625,3.588 19.853,1.08 13.574,-0.531 7.209,-0.816 0,0"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path52"
+             inkscape:connector-curvature="0" /></g><g
+           id="g54"
+           transform="translate(548.4282,396.6898)"><path
+             d="M 0,0 C 0,2.553 -3.404,4.255 -3.404,4.255 -0.851,5.105 0,8.51 0,8.51 0,8.51 0.851,5.105 3.404,4.255 3.404,4.255 0,2.553 0,0 m -16.835,6.354 c 0,6.638 -8.851,11.063 -8.851,11.063 6.638,2.213 8.851,11.063 8.851,11.063 0,0 2.212,-8.85 8.85,-11.063 0,0 -8.85,-4.425 -8.85,-11.063"
+             style="fill:#fbb03b;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path56"
+             inkscape:connector-curvature="0" /></g><g
+           id="g58"
+           transform="translate(489.744,429.7865)"><path
+             d="m 0,0 c 0,0 -2.18,3.308 -9.585,2.026 0,0 5.894,28.296 35.737,38.209 C 26.152,40.235 -2.777,23.901 0,0 m 0.574,-32.696 -0.003,-0.003 c -1.277,-1.994 -2.778,-3.523 -4.46,-4.544 -1.492,-0.919 -3.13,-1.403 -4.67,-1.403 -0.519,0 -1.028,0.055 -1.516,0.167 l -2.174,0.5 2.15,0.593 c 1.605,0.443 3.016,1.325 4.194,2.623 0.997,1.09 1.826,2.447 2.536,4.149 1.158,2.775 1.757,6.066 1.783,9.781 -0.048,3.661 -0.673,6.945 -1.857,9.75 -0.693,1.629 -1.56,3.009 -2.58,4.099 -1.208,1.29 -2.63,2.143 -4.228,2.536 l -2.154,0.529 2.145,0.564 c 0.473,0.125 0.983,0.193 1.516,0.205 l 0.031,0.002 1.662,-0.152 c 1.029,-0.203 2.071,-0.605 3.014,-1.167 1.727,-1.015 3.25,-2.527 4.526,-4.493 2.122,-3.323 3.264,-7.421 3.305,-11.857 C 3.767,-25.296 2.653,-29.4 0.574,-32.696 M 100.951,17.69 c 0,0 -0.074,0.05 -0.223,0.136 -0.532,0.32 -1.095,0.593 -1.688,0.801 -2.997,1.223 -9.729,3.139 -21.568,2.583 -0.029,0 -0.055,0 -0.084,-0.001 C 52.1,20.798 29.559,10.754 29.559,10.754 c 0,0 1.417,1.733 3.4,3.636 0,0 10e-4,0 10e-4,10e-4 1.036,0.958 2.319,2.044 3.853,3.176 0.044,0.031 0.086,0.062 0.122,0.091 8.573,6.286 24.949,13.945 50.829,9.394 -0.615,0.698 -1.254,1.341 -1.887,1.998 l 0.057,-0.009 c 0,0 -5.858,6.886 -16.555,12.616 -0.613,0.331 -1.252,0.659 -1.91,0.985 -0.038,0.018 -0.073,0.038 -0.112,0.057 -0.067,0.033 -0.128,0.057 -0.195,0.089 -8.007,3.888 -19.263,7.05 -33.564,5.458 -3.873,-0.374 -8.015,-1.116 -12.272,-2.399 -0.007,-0.002 -0.013,-0.003 -0.02,-0.005 L 21.304,45.84 c -10,-3.018 -20.632,-9.029 -29.888,-20.328 -0.563,-0.707 -1.105,-1.409 -1.632,-2.109 -0.064,-0.096 -0.138,-0.198 -0.222,-0.301 -1.164,-1.557 -2.236,-3.091 -3.229,-4.602 -3.216,-5.343 -5.544,-11.486 -6.214,-13.337 -10.417,-31.901 2.546,-56.661 5.065,-61.011 8.793,-14.923 24.186,-31.856 49.988,-36.751 0.195,-0.046 0.377,-0.101 0.573,-0.145 1.697,-0.361 4.79,-0.914 8.771,-1.178 1.5,-0.067 3.04,-0.093 4.626,-0.065 1.525,-0.01 2.953,0.016 4.268,0.063 0.39,0.027 0.729,0.04 1.029,0.044 5.023,0.233 8.144,0.762 8.144,0.762 -26.133,1.279 -39.232,13.203 -44.987,20.816 -1.304,1.622 -2.422,3.367 -3.324,5.234 -0.356,0.699 -0.515,1.097 -0.515,1.097 8.328,-7.069 19.98,-13.155 19.98,-13.155 10.449,-4.917 21.402,-7.337 33.008,-5.747 0,0 24.264,2.227 33.932,24.702 -0.417,0.317 -0.361,0.275 -0.777,0.592 -0.642,-0.518 -1.274,-1.007 -1.898,-1.474 -0.975,-0.64 -1.933,-1.336 -2.89,-2.038 -5.184,-3.431 -9.414,-5.048 -11.934,-5.788 -19.06,-4.811 -36.698,-1.232 -52.114,12.263 -7.14,6.251 -11.572,14.131 -11.759,23.917 -0.122,6.365 -0.188,12.734 -0.135,19.101 0.084,10.023 7.135,17.645 17.126,18.986 20.245,2.716 40.598,3.652 60.993,4.129 3.125,0.072 5.888,0.036 7.342,-3.306 0.025,-0.057 0.358,0.021 0.543,0.035 1.379,4.393 -0.607,9.127 -4.223,11.444"
+             style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:none"
+             id="path60"
+             inkscape:connector-curvature="0" /></g></g></g></g></svg>
+\ No newline at end of file
diff --git a/website/assets/logos/powered-gvisor.png b/website/assets/logos/powered-gvisor.png
new file mode 100644
index 000000000..e00c74a33
--- /dev/null
+++ b/website/assets/logos/powered-gvisor.png
diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md
new file mode 100644
index 000000000..fbdd511dd
--- /dev/null
+++ b/website/blog/2019-11-18-security-basics.md
@@ -0,0 +1,299 @@
+# gVisor Security Basics - Part 1
+
+This blog is a space for engineers and community members to share perspectives
+and deep dives on technology and design within the gVisor project. Though our
+logo suggests we're in the business of space exploration (or perhaps fighting
+sea monsters), we're actually in the business of sandboxing Linux containers.
+When we created gVisor, we had three specific goals in mind; _container-native
+security_, _resource efficiency_, and _platform portability_. To put it simply,
+gVisor provides _efficient defense-in-depth for containers anywhere_.
+
+This post addresses gVisor's _container-native security_, specifically how
+gVisor provides strong isolation between an application and the host OS. Future
+posts will address _resource efficiency_ (how gVisor preserves container
+benefits like fast starts, smaller snapshots, and less memory overhead than VMs)
+and _platform portability_ (run gVisor wherever Linux OCI containers run).
+Delivering on each of these goals requires careful security considerations and a
+robust design.
+
+## What does "sandbox" mean?
+
+gVisor allows the execution of untrusted containers, preventing them from
+adversely affecting the host. This means that the untrusted container is
+prevented from attacking or spying on either the host kernel or any other peer
+userspace processes on the host.
+
+For example, if you are a cloud container hosting service, running containers
+from different customers on the same virtual machine means that compromises
+expose customer data. Properly configured, gVisor can provide sufficient
+isolation to allow different customers to run containers on the same host. There
+are many aspects to the proper configuration, including limiting file and
+network access, which we will discuss in future posts.
+
+## The cost of compromise
+
+gVisor was designed around the premise that any security boundary could
+potentially be compromised with enough time and resources. We tried to optimize
+for a solution that was as costly and time-consuming for an attacker as
+possible, at every layer.
+
+Consequently, gVisor was built through a combination of intentional design
+principles and specific technology choices that work together to provide the
+security isolation needed for running hostile containers on a host. We'll dig
+into it in the next section!
+
+# Design Principles
+
+gVisor was designed with some
+[common secure design principles](https://www.owasp.org/index.php/Security_by_Design_Principles)
+in mind: Defense-in-Depth, Principle of Least-Privilege, Attack Surface
+Reduction and Secure-by-Default[^1].
+
+In general, Design Principles outline good engineering practices, but in the
+case of security, they also can be thought of as a set of tactics. In a
+real-life castle, there is no single defensive feature. Rather, there are many
+in combination: redundant walls, scattered draw bridges, small bottle-neck
+entrances, moats, etc.
+
+A simplified version of the design is below
+([more detailed version](/docs/))[^2]:
+
+![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png "Simplified design of gVisor.")
+
+In order to discuss design principles, the following components are important to
+know:
+
+*   runsc - binary that packages the Sentry, platform, and Gofer(s) that run
+    containers. runsc is the drop-in binary for running gVisor in Docker and
+    Kubernetes.
+*   Untrusted Application - container running in the sandbox. Untrusted
+    application/container are used interchangeably in this article.
+*   Platform Syscall Switcher - intercepts syscalls from the application and
+    passes them to the Sentry with no further handling.
+*   Sentry - The "application kernel" in userspace that serves the untrusted
+    application. Each application instance has its own Sentry. The Sentry
+    handles syscalls, routes I/O to gofers, and manages memory and CPU, all in
+    userspace. The Sentry is allowed to make limited, filtered syscalls to the
+    host OS.
+*   Gofer - a process that specifically handles different types of I/O for the
+    Sentry (usually disk I/O). Gofers are also allowed to make filtered syscalls
+    to the Host OS.
+*   Host OS - the actual OS on which gVisor containers are running, always some
+    flavor of Linux (sorry, Windows/MacOS users).
+
+It is important to emphasize what is being protected from the untrusted
+application in this diagram: the host OS and other userspace applications.
+
+In this post, we are only discussing security-related features of gVisor, and
+you might ask, "What about performance, compatibility and stability?" We will
+cover these considerations in future posts.
+
+## Defense-in-Depth
+
+For gVisor, Defense-in-Depth means each component of the software stack trusts
+the other components as little as possible.
+
+It may seem strange that we would want our own software components to distrust
+each other. But by limiting the trust between small, discrete components, each
+component is forced to defend itself against potentially malicious input. And
+when you stack these components on top of each other, you can ensure that
+multiple security barriers must be overcome by an attacker.
+
+And this leads us to how Defense-in-Depth is applied to gVisor: no single
+vulnerability should compromise the host.
+
+In the "Attacker's Advantage / Defender's Dilemma," the defender must succeed
+all the time while the attacker only needs to succeed once. Defense in Depth
+inverts this principle: once the attacker successfully compromises any given
+software component, they are immediately faced with needing to compromise a
+subsequent, distinct layer in order to move laterally or acquire more privilege.
+
+For example, the untrusted container is isolated from the Sentry. The Sentry is
+isolated from host I/O operations by serving those requests in separate
+processes called Gofers. And both the untrusted container and its associated
+Gofers are isolated from the host process that is running the sandbox.
+
+An additional benefit is that this generally leads to more robust and stable
+software, forcing interfaces to be strictly defined and tested to ensure all
+inputs are properly parsed and bounds checked.
+
+## Least-Privilege
+
+The principle of Least-Privilege implies that each software component has only
+the permissions it needs to function, and no more.
+
+Least-Privilege is applied throughout gVisor. Each component and more
+importantly, each interface between the components, is designed so that only the
+minimum level of permission is required for it to perform its function.
+Specifically, the closer you are to the untrusted application, the less
+privilege you have.
+
+![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png "runsc components and their privileges.")
+
+This is evident in how runsc (the drop in gVisor binary for Docker/Kubernetes)
+constructs the sandbox. The Sentry has the least privilege possible (it can't
+even open a file!). Gofers are only allowed file access, so even if it were
+compromised, the host network would be unavailable. Only the runsc binary itself
+has full access to the host OS, and even runsc's access to the host OS is often
+limited through capabilities / chroot / namespacing.
+
+Designing a system with Defense-in-Depth and Least-Privilege in mind encourages
+small, separate, single-purpose components, each with very restricted
+privileges.
+
+## Attack Surface Reduction
+
+There are no bugs in unwritten code. In other words, gVisor supports a feature
+if and only if it is needed to run host Linux containers.
+
+### Host Application/Sentry Interface:
+
+There are a lot of things gVisor does not need to do. For example, it does not
+need to support arbitrary device drivers, nor does it need to support video
+playback. By not implementing what will not be used, we avoid introducing
+potential bugs in our code.
+
+That is not to say gVisor has limited functionality! Quite the opposite, we
+analyzed what is actually needed to run Linux containers and today the Sentry
+supports 237 syscalls[^3]<sup>,</sup>[^4], along with the range of critical
+/proc and /dev files. However, gVisor does not support every syscall in the
+Linux kernel. There are about 350 syscalls[^5] within the 5.3.11 version of the
+Linux kernel, many of which do not apply to Linux containers that typically host
+cloud-like workloads. For example, we don't support old versions of epoll
+(epoll_ctl_old, epoll_wait_old), because they are deprecated in Linux and no
+supported workloads use them.
+
+Furthermore, any exploited vulnerabilities in the implemented syscalls (or
+Sentry code in general) only apply to gaining control of the Sentry. More on
+this in a later post.
+
+### Sentry/Host OS Interface:
+
+The Sentry's interactions with the Host OS are restricted in many ways. For
+instance, no syscall is "passed-through" from the untrusted application to the
+host OS. All syscalls are intercepted and interpreted. In the case where the
+Sentry needs to call the Host OS, we severely limit the syscalls that the Sentry
+itself is allowed to make to the host kernel[^6].
+
+For example, there are many file-system based attacks, where manipulation of
+files or their paths, can lead to compromise of the host[^7]. As a result, the
+Sentry does not allow any syscall that creates or opens a file descriptor. All
+file descriptors must be donated to the sandbox. By disallowing open or creation
+of file descriptors, we eliminate entire categories of these file-based attacks.
+
+This does not affect functionality though. For example, during startup, runsc
+will donate FDs the Sentry that allow for mapping STDIN/STDOUT/STDERR to the
+sandboxed application. Also the Gofer may donate an FD to the Sentry, allowing
+for direct access to some files. And most files will be remotely accessed
+through the Gofers, in which case no FDs are donated to the Sentry.
+
+The Sentry itself is only allowed access to specific
+[whitelisted syscalls](https://github.com/google/gvisor/blob/master/runsc/boot/config.go).
+Without networking, the Sentry needs 53 host syscalls in order to function, and
+with networking, it uses an additional 15[^8]. By limiting the whitelist to only
+these needed syscalls, we radically reduce the amount of host OS attack surface.
+If any attempts are made to call something outside the whitelist, it is
+immediately blocked and the sandbox is killed by the Host OS.
+
+### Sentry/Gofer Interface:
+
+The Sentry communicates with the Gofer through a local unix domain socket (UDS)
+via a version of the 9P protocol[^9]. The UDS file descriptor is passed to the
+sandbox during initialization and all communication between the Sentry and Gofer
+happens via 9P. We will go more into how Gofers work in future posts.
+
+### End Result
+
+So, of the 350 syscalls in the Linux kernel, the Sentry needs to implement only
+237 of them to support containers. At most, the Sentry only needs to call 68 of
+the host Linux syscalls. In other words, with gVisor, applications get the vast
+majority (and growing) functionality of Linux containers for only 68 possible
+syscalls to the Host OS. 350 syscalls to 68 is attack surface reduction.
+
+![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png "Reduction of Attack Surface of the Syscall Table. Note that the Senty's Syscall Emulation Layer keeps the Containerized Process from ever calling the Host OS.")
+
+## Secure-by-default
+
+The default choice for a user should be safe. If users need to run a less secure
+configuration of the sandbox for the sake of performance or application
+compatibility, they must make the choice explicitly.
+
+An example of this might be a networking application that is performance
+sensitive. Instead of using the safer, Go-based Netstack in the Sentry, the
+untrusted container can instead use the host Linux networking stack directly.
+However, this means the untrusted container will be directly interacting with
+the host, without the safety benefits of the sandbox. It also means that an
+attack could directly compromise the host through his path.
+
+These less secure configurations are **not** the default. In fact, the user must
+take action to change the configuration and run in a less secure mode.
+Additionally, these actions make it very obvious that a less secure
+configuration is being used.
+
+This can be as simple as forcing a default runtime flag option to the secure
+option. gVisor does this by always using its internal netstack by default.
+However, for certain performance sensitive applications, we allow the usage of
+the host OS networking stack, but it requires the user to actively set a
+flag[^10].
+
+# Technology Choices
+
+Technology choices for gVisor mainly involve things that will give us a security
+boundary.
+
+At a higher level, boundaries in software might be describing a great many
+things. It may be discussing the boundaries between threads, boundaries between
+processes, boundaries between CPU privilege levels, and more.
+
+Security boundaries are interfaces that are designed and built so that entire
+classes of bugs/vulnerabilities are eliminated.
+
+For example, the Sentry and Gofers are implemented using Go. Go was chosen for a
+number of the features it provided. Go is a fast, statically-typed, compiled
+language that has efficient multi-threading support, garbage collection and a
+constrained set of "unsafe" operations.
+
+Using these features enabled safe array and pointer handling. This means entire
+classes of vulnerabilities were eliminated, such as buffer overflows and
+use-after-free.
+
+Another example is our use of very strict syscall switching to ensure that the
+Sentry is always the first software component that parses and interprets the
+calls being made by the untrusted container. Here is an instance where different
+platforms use different solutions, but all of them share this common trait,
+whether it is through the use of ptrace "a la PTRACE_ATTACH"[^11] or kvm's
+ring0[^12].
+
+Finally, one of the most restrictive choices was to use seccomp, to restrict the
+Sentry from being able to open or create a file descriptor on the host. All file
+I/O is required to go through Gofers. Preventing the opening or creation of file
+descriptions eliminates whole categories of bugs around file permissions
+[like this one](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2016-4557)[^13].
+
+# To be continued - Part 2
+
+In part 2 of this blog post, we will explore gVisor from an attacker's point of
+view. We will use it as an opportunity to examine the specific strengths and
+weaknesses of each gVisor component.
+
+We will also use it to introduce Google's Vulnerability Reward Program[^14], and
+other ways the community can contribute to help make gVisor safe, fast and
+stable.
+
+## Notes
+
+[^1]: [https://www.owasp.org/index.php/Security_by_Design_Principles](https://www.owasp.org/index.php/Security_by_Design_Principles)
+[^2]: [https://gvisor.dev/docs/architecture_guide](https://gvisor.dev/docs/architecture_guide/)
+[^3]: [https://github.com/google/gvisor/blob/master/pkg/sentry/syscalls/linux/linux64_amd64.go](https://github.com/google/gvisor/blob/master/pkg/sentry/syscalls/syscalls.go)
+[^4]: Internally that is, it doesn't call to the Host OS to implement them, in
+    fact that is explicitly disallowed, more on that in the future.
+[^5]: [https://elixir.bootlin.com/linux/latest/source/arch/x86/entry/syscalls/syscall_64.tbl#L345](https://elixir.bootlin.com/linux/latest/source/arch/x86/entry/syscalls/syscall_64.tbl#L345)
+[^6]: [https://github.com/google/gvisor/tree/master/runsc/boot/filter](https://github.com/google/gvisor/tree/master/runsc/boot/filter)
+[^7]: [https://en.wikipedia.org/wiki/Dirty_COW](https://en.wikipedia.org/wiki/Dirty_COW)
+[^8]: [https://github.com/google/gvisor/blob/master/runsc/boot/config.go](https://github.com/google/gvisor/blob/master/runsc/boot/config.go)
+[^9]: [https://en.wikipedia.org/wiki/9P_(protocol)](https://en.wikipedia.org/wiki/9P_\(protocol\))
+[^10]: [https://gvisor.dev/docs/user_guide/networking/#network-passthrough](https://gvisor.dev/docs/user_guide/networking/#network-passthrough)
+[^11]: [https://github.com/google/gvisor/blob/c7e901f47a09eaac56bd4813227edff016fa6bff/pkg/sentry/platform/ptrace/subprocess.go#L390](https://github.com/google/gvisor/blob/c7e901f47a09eaac56bd4813227edff016fa6bff/pkg/sentry/platform/ptrace/subprocess.go#L390)
+[^12]: [https://github.com/google/gvisor/blob/c7e901f47a09eaac56bd4813227edff016fa6bff/pkg/sentry/platform/ring0/kernel_amd64.go#L182](https://github.com/google/gvisor/blob/c7e901f47a09eaac56bd4813227edff016fa6bff/pkg/sentry/platform/ring0/kernel_amd64.go#L182)
+[^13]: [https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2016-4557](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2016-4557)
+[^14]: [https://www.google.com/about/appsecurity/reward-program/index.html](https://www.google.com/about/appsecurity/reward-program/index.html)
diff --git a/website/blog/2020-04-02-networking-security.md b/website/blog/2020-04-02-networking-security.md
new file mode 100644
index 000000000..5a5e38fd7
--- /dev/null
+++ b/website/blog/2020-04-02-networking-security.md
@@ -0,0 +1,183 @@
+# gVisor Networking Security
+
+In our
+[first blog post](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/),
+we covered some secure design principles and how they guided the architecture of
+gVisor as a whole. In this post, we will cover how these principles guided the
+networking architecture of gVisor, and the tradeoffs involved. In particular, we
+will cover how these principles culminated in two networking modes, how they
+work, and the properties of each.
+
+## gVisor's security architecture in the context of networking
+
+Linux networking is complicated. The TCP protocol is over 40 years old, and has
+been repeatedly extended over the years to keep up with the rapid pace of
+network infrastructure improvements, all while maintaining compatibility. On top
+of that, Linux networking has a fairly large API surface. Linux supports
+[over 150 options](https://github.com/google/gvisor/blob/960f6a975b7e44c0efe8fd38c66b02017c4fe137/pkg/sentry/strace/socket.go#L476-L644)
+for the most common socket types alone. In fact, the net subsystem is one of the
+largest and fastest growing in Linux at approximately 1.1 million lines of code.
+For comparison, that is several times the size of the entire gVisor codebase.
+
+At the same time, networking is increasingly important. The cloud era is
+arguably about making everything a network service, and in order to make that
+work, the interconnect performance is critical. Adding networking support to
+gVisor was difficult, not just due to the inherent complexity, but also because
+it has the potential to significantly weaken gVisor's security model.
+
+As outlined in the previous blog post, gVisor's
+[secure design principles](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#design-principles)
+are:
+
+1.  Defense in Depth: each component of the software stack trusts each other
+    component as little as possible.
+1.  Least Privilege: each software component has only the permissions it needs
+    to function, and no more.
+1.  Attack Surface Reduction: limit the surface area of the host exposed to the
+    sandbox.
+1.  Secure by Default: the default choice for a user should be safe.
+
+gVisor manifests these principles as a multi-layered system. An application
+running in the sandbox interacts with the Sentry, a userspace kernel, which
+mediates all interactions with the Host OS and beyond. The Sentry is written in
+pure Go with minimal unsafe code, making it less vulnerable to buffer overflows
+and related memory bugs that can lead to a variety of compromises including code
+injection. It emulates Linux using only a minimal and audited set of Host OS
+syscalls that limit the Host OS's attack surface exposed to the Sentry itself.
+The syscall restrictions are enforced by running the Sentry with seccomp
+filters, which enforce that the Sentry can only use the expected set of
+syscalls. The Sentry runs as an unprivileged user and in namespaces, which,
+along with the seccomp filters, ensure that the Sentry is run with the Least
+Privilege required.
+
+gVisor's multi-layered design provides Defense in Depth. The Sentry, which does
+not trust the application because it may attack the Sentry and try to bypass it,
+is the first layer. The sandbox that the Sentry runs in is the second layer. If
+the Sentry were compromised, the attacker would still be in a highly restrictive
+sandbox which they must also break out of in order to compromise the Host OS.
+
+To enable networking functionality while preserving gVisor's security
+properties, we implemented a
+[userspace network stack](https://github.com/google/gvisor/tree/master/pkg/tcpip)
+in the Sentry, which we creatively named Netstack. Netstack is also written in
+Go, not only to avoid unsafe code in the network stack itself, but also to avoid
+a complicated and unsafe Foreign Function Interface. Having its own integrated
+network stack allows the Sentry to implement networking operations using up to
+three Host OS syscalls to read and write packets. These syscalls allow a very
+minimal set of operations which are already allowed (either through the same or
+a similar syscall). Moreover, because packets typically come from off-host (e.g.
+the internet), the Host OS's packet processing code has received a lot of
+scrutiny, hopefully resulting in a high degree of hardening.
+
+![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png "Network and gVisor.")
+
+## Writing a network stack
+
+Netstack was written from scratch specifically for gVisor. Because Netstack was
+designed and implemented to be modular, flexible and self-contained, there are
+now several more projects using Netstack in creative and exciting ways. As we
+discussed, a custom network stack has enabled a variety of security-related
+goals which would not have been possible any other way. This came at a cost
+though. Network stacks are complex and writing a new one comes with many
+challenges, mostly related to application compatibility and performance.
+
+Compatibility issues typically come in two forms: missing features, and features
+with behavior that differs from Linux (usually due to bugs). Both of these are
+inevitable in an implementation of a complex system spanning many quickly
+evolving and ambiguous standards. However, we have invested heavily in this
+area, and the vast majority of applications have no issues using Netstack. For
+example,
+[we now support setting 34 different socket options](https://github.com/google/gvisor/blob/815df2959a76e4a19f5882e40402b9bbca9e70be/pkg/sentry/socket/netstack/netstack.go#L830-L1764)
+versus
+[only 7 in our initial git commit](https://github.com/google/gvisor/blob/d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296/pkg/sentry/socket/epsocket/epsocket.go#L445-L702).
+We are continuing to make good progress in this area.
+
+Performance issues typically come from TCP behavior and packet processing speed.
+To improve our TCP behavior, we are working on implementing the full set of TCP
+RFCs. There are many RFCs which are significant to performance (e.g.
+[RACK](https://tools.ietf.org/id/draft-ietf-tcpm-rack-03.html) and
+[BBR](https://tools.ietf.org/html/draft-cardwell-iccrg-bbr-congestion-control-00))
+that we have yet to implement. This mostly affects TCP performance with
+non-ideal network conditions (e.g. cross continent connections). Faster packet
+processing mostly improves TCP performance when network conditions are very good
+(e.g. within a datacenter). Our primary strategy here is to reduce interactions
+with the Go runtime, specifically the garbage collector (GC) and scheduler. We
+are currently optimizing buffer management to reduce the amount of garbage,
+which will lower the GC cost. To reduce scheduler interactions, we are
+re-architecting the TCP implementation to use fewer goroutines. Performance
+today is good enough for most applications and we are making steady
+improvements. For example, since May of 2019, we have improved the Netstack
+runsc
+[iperf3 download benchmark](https://github.com/google/gvisor/blob/master/benchmarks/suites/network.py)
+score by roughly 15% and upload score by around 10,000X. Current numbers are
+about 17 Gbps download and about 8 Gbps upload versus about 42 Gbps and 43 Gbps
+for native (Linux) respectively.
+
+## An alternative
+
+We also offer an alternative network mode: passthrough. This name can be
+misleading as syscalls are never passed through from the app to the Host OS.
+Instead, the passthrough mode implements networking in gVisor using the Host
+OS's network stack. (This mode is called
+[hostinet](https://github.com/google/gvisor/tree/master/pkg/sentry/socket/hostinet)
+in the codebase.) Passthrough mode can improve performance for some use cases as
+the Host OS's network stack has had an enormous number of person-years poured
+into making it highly performant. However, there is a rather large downside to
+using passthrough mode: it weakens gVisor's security model by increasing the
+Host OS's Attack Surface. This is because using the Host OS's network stack
+requires the Sentry to use the Host OS's
+[Berkeley socket interface](https://en.wikipedia.org/wiki/Berkeley_sockets). The
+Berkeley socket interface is a much larger API surface than the packet interface
+that our network stack uses. When passthrough mode is in use, the Sentry is
+allowed to use
+[15 additional syscalls](https://github.com/google/gvisor/blob/b1576e533223e98ebe4bd1b82b04e3dcda8c4bf1/runsc/boot/filter/config.go#L312-L517).
+Further, this set of syscalls includes some that allow the Sentry to create file
+descriptors, something that
+[we don't normally allow](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#sentry-host-os-interface)
+as it opens up classes of file-based attacks.
+
+There are some networking features that we can't implement on top of syscalls
+that we feel are safe (most notably those behind
+[ioctl](http://man7.org/linux/man-pages/man2/ioctl.2.html)) and therefore are
+not supported. Because of this, we actually support fewer networking features in
+passthrough mode than we do in Netstack, reducing application compatibility.
+That's right: using our networking stack provides better overall application
+compatibility than using our passthrough mode.
+
+That said, gVisor with passthrough networking still provides a high level of
+isolation. Applications cannot specify host syscall arguments directly, and the
+sentry's seccomp policy restricts its syscall use significantly more than a
+general purpose seccomp policy.
+
+## Secure by Default
+
+The goal of the Secure by Default principle is to make it easy to securely
+sandbox containers. Of course, disabling network access entirely is the most
+secure option, but that is not practical for most applications. To make gVisor
+Secure by Default, we have made Netstack the default networking mode in gVisor
+as we believe that it provides significantly better isolation. For this reason
+we strongly caution users from changing the default unless Netstack flat out
+won't work for them. The passthrough mode option is still provided, but we want
+users to make an informed decision when selecting it.
+
+Another way in which gVisor makes it easy to securely sandbox containers is by
+allowing applications to run unmodified, with no special configuration needed.
+In order to do this, gVisor needs to support all of the features and syscalls
+that applications use. Neither seccomp nor gVisor's passthrough mode can do this
+as applications commonly use syscalls which are too dangerous to be included in
+a secure policy. Even if this dream isn't fully realized today, gVisor's
+architecture with Netstack makes this possible.
+
+## Give Netstack a Try
+
+If you haven't already, try running a workload in gVisor with Netstack. You can
+find instructions on how to get started in our
+[Quick Start](/docs/user_guide/quick_start/docker/). We want to hear about both
+your successes and any issues you encounter. We welcome your contributions,
+whether that be verbal feedback or code contributions, via our
+[Gitter channel](https://gitter.im/gvisor/community),
+[email list](https://groups.google.com/forum/#!forum/gvisor-users),
+[issue tracker](https://gvisor.dev/issue/new), and
+[Github repository](https://github.com/google/gvisor). Feel free to express
+interest in an [open issue](https://gvisor.dev/issue/), or reach out if you
+aren't sure where to start.
diff --git a/website/blog/BUILD b/website/blog/BUILD
new file mode 100644
index 000000000..01c1f5a6e
--- /dev/null
+++ b/website/blog/BUILD
@@ -0,0 +1,37 @@
+load("//website:defs.bzl", "doc", "docs")
+
+package(
+    default_visibility = ["//website:__pkg__"],
+    licenses = ["notice"],
+)
+
+exports_files(["index.html"])
+
+doc(
+    name = "security_basics",
+    src = "2019-11-18-security-basics.md",
+    authors = [
+        "jsprad",
+        "zkoopmans",
+    ],
+    layout = "post",
+    permalink = "/blog/2019/11/18/gvisor-security-basics-part-1/",
+)
+
+doc(
+    name = "networking_security",
+    src = "2020-04-02-networking-security.md",
+    authors = [
+        "igudger",
+    ],
+    layout = "post",
+    permalink = "/blog/2020/04/02/gvisor-networking-security/",
+)
+
+docs(
+    name = "posts",
+    deps = [
+        ":" + rule
+        for rule in existing_rules()
+    ],
+)
diff --git a/website/blog/index.html b/website/blog/index.html
new file mode 100644
index 000000000..5c67c95fc
--- /dev/null
+++ b/website/blog/index.html
@@ -0,0 +1,22 @@
+---
+title: Blog
+layout: blog
+feed: true
+pagination:
+  enabled: true
+---
+
+{% for post in paginator.posts %}
+<div>
+  <h2><a href="{{ post.url }}">{{ post.title }}</a></h2>
+  <div class="blog-meta">
+    {% include byline.html authors=post.authors date=post.date %}
+  </div>
+  <p>{{ post.excerpt | strip_html }}</p>
+  <p><a href="{{ post.url }}">Full Post &raquo;</a></p>
+</div>
+{% endfor %}
+
+{% if paginator.total_pages > 1 %}
+{% include paginator.html %}
+{% endif %}
diff --git a/website/cmd/server/BUILD b/website/cmd/server/BUILD
new file mode 100644
index 000000000..6b5a08f0d
--- /dev/null
+++ b/website/cmd/server/BUILD
@@ -0,0 +1,10 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "server",
+    srcs = ["main.go"],
+    pure = True,
+    visibility = ["//website:__pkg__"],
+)
diff --git a/website/cmd/server/main.go b/website/cmd/server/main.go
new file mode 100644
index 000000000..7c8bc9bfa
--- /dev/null
+++ b/website/cmd/server/main.go
@@ -0,0 +1,211 @@
+// Copyright 2019 The gVisor Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Server is the main gvisor.dev binary.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+	"regexp"
+	"strings"
+)
+
+var redirects = map[string]string{
+	// GitHub redirects.
+	"/change":    "https://github.com/google/gvisor",
+	"/issue":     "https://github.com/google/gvisor/issues",
+	"/issue/new": "https://github.com/google/gvisor/issues/new",
+	"/pr":        "https://github.com/google/gvisor/pulls",
+
+	// For links.
+	"/faq": "/docs/user_guide/faq/",
+
+	// Redirects to compatibility docs.
+	"/c":             "/docs/user_guide/compatibility/",
+	"/c/linux/amd64": "/docs/user_guide/compatibility/linux/amd64/",
+
+	// Redirect for old URLs.
+	"/docs/user_guide/compatibility/amd64/": "/docs/user_guide/compatibility/linux/amd64/",
+	"/docs/user_guide/compatibility/amd64":  "/docs/user_guide/compatibility/linux/amd64/",
+	"/docs/user_guide/kubernetes/":          "/docs/user_guide/quick_start/kubernetes/",
+	"/docs/user_guide/kubernetes":           "/docs/user_guide/quick_start/kubernetes/",
+	"/docs/user_guide/oci/":                 "/docs/user_guide/quick_start/oci/",
+	"/docs/user_guide/oci":                  "/docs/user_guide/quick_start/oci/",
+	"/docs/user_guide/docker/":              "/docs/user_guide/quick_start/docker/",
+	"/docs/user_guide/docker":               "/docs/user_guide/quick_start/docker/",
+
+	// Deprecated, but links continue to work.
+	"/cl": "https://gvisor-review.googlesource.com",
+}
+
+var prefixHelpers = map[string]string{
+	"change": "https://github.com/google/gvisor/commit/%s",
+	"issue":  "https://github.com/google/gvisor/issues/%s",
+	"pr":     "https://github.com/google/gvisor/pull/%s",
+
+	// Redirects to compatibility docs.
+	"c/linux/amd64": "/docs/user_guide/compatibility/linux/amd64/#%s",
+
+	// Deprecated, but links continue to work.
+	"cl": "https://gvisor-review.googlesource.com/c/gvisor/+/%s",
+}
+
+var (
+	validID    = regexp.MustCompile(`^[A-Za-z0-9-]*/?$`)
+	goGetHTML5 = `<!doctype html><html><head><meta charset=utf-8>
+<meta name="go-import" content="gvisor.dev/gvisor git https://github.com/google/gvisor">
+<meta name="go-import" content="gvisor.dev/website git https://github.com/google/gvisor-website">
+<title>Go-get</title></head><body></html>`
+)
+
+// cronHandler wraps an http.Handler to check that the request is from the App
+// Engine Cron service.
+// See: https://cloud.google.com/appengine/docs/standard/go112/scheduling-jobs-with-cron-yaml#validating_cron_requests
+func cronHandler(h http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.Header.Get("X-Appengine-Cron") != "true" {
+			http.NotFound(w, r)
+			return
+		}
+		// Fallthrough.
+		h.ServeHTTP(w, r)
+	})
+}
+
+// wrappedHandler wraps an http.Handler.
+//
+// If the query parameters include go-get=1, then we redirect to a single
+// static page that allows us to serve arbitrary Go packages.
+func wrappedHandler(h http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		gg, ok := r.URL.Query()["go-get"]
+		if ok && len(gg) == 1 && gg[0] == "1" {
+			// Serve a trivial html page.
+			w.Write([]byte(goGetHTML5))
+			return
+		}
+		// Fallthrough.
+		h.ServeHTTP(w, r)
+	})
+}
+
+// redirectWithQuery redirects to the given target url preserving query parameters.
+func redirectWithQuery(w http.ResponseWriter, r *http.Request, target string) {
+	url := target
+	if qs := r.URL.RawQuery; qs != "" {
+		url += "?" + qs
+	}
+	http.Redirect(w, r, url, http.StatusFound)
+}
+
+// hostRedirectHandler redirects the www. domain to the naked domain.
+func hostRedirectHandler(h http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if strings.HasPrefix(r.Host, "www.") {
+			// Redirect to the naked domain.
+			r.URL.Scheme = "https"  // Assume https.
+			r.URL.Host = r.Host[4:] // Remove the 'www.'
+			http.Redirect(w, r, r.URL.String(), http.StatusMovedPermanently)
+			return
+		}
+
+		if *projectID != "" && r.Host == *projectID+".appspot.com" && *customHost != "" {
+			// Redirect to the custom domain.
+			r.URL.Scheme = "https" // Assume https.
+			r.URL.Host = *customHost
+			http.Redirect(w, r, r.URL.String(), http.StatusMovedPermanently)
+			return
+		}
+		h.ServeHTTP(w, r)
+	})
+}
+
+// prefixRedirectHandler returns a handler that redirects to the given formated url.
+func prefixRedirectHandler(prefix, baseURL string) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if p := r.URL.Path; p == prefix {
+			// Redirect /prefix/ to /prefix.
+			http.Redirect(w, r, p[:len(p)-1], http.StatusFound)
+			return
+		}
+		id := r.URL.Path[len(prefix):]
+		if !validID.MatchString(id) {
+			http.Error(w, "Not found", http.StatusNotFound)
+			return
+		}
+		target := fmt.Sprintf(baseURL, id)
+		redirectWithQuery(w, r, target)
+	})
+}
+
+// redirectHandler returns a handler that redirects to the given url.
+func redirectHandler(target string) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		redirectWithQuery(w, r, target)
+	})
+}
+
+// redirectRedirects registers redirect http handlers.
+func registerRedirects(mux *http.ServeMux) {
+	if mux == nil {
+		mux = http.DefaultServeMux
+	}
+
+	for prefix, baseURL := range prefixHelpers {
+		p := "/" + prefix + "/"
+		mux.Handle(p, hostRedirectHandler(wrappedHandler(prefixRedirectHandler(p, baseURL))))
+	}
+
+	for path, redirect := range redirects {
+		mux.Handle(path, hostRedirectHandler(wrappedHandler(redirectHandler(redirect))))
+	}
+}
+
+// registerStatic registers static file handlers
+func registerStatic(mux *http.ServeMux, staticDir string) {
+	if mux == nil {
+		mux = http.DefaultServeMux
+	}
+	mux.Handle("/", hostRedirectHandler(wrappedHandler(http.FileServer(http.Dir(staticDir)))))
+}
+
+func envFlagString(name, def string) string {
+	if val := os.Getenv(name); val != "" {
+		return val
+	}
+	return def
+}
+
+var (
+	addr      = flag.String("http", envFlagString("HTTP", ":"+envFlagString("PORT", "8080")), "HTTP service address")
+	staticDir = flag.String("static-dir", envFlagString("STATIC_DIR", "_site"), "static files directory")
+
+	// Uses the standard GOOGLE_CLOUD_PROJECT environment variable set by App Engine.
+	projectID  = flag.String("project-id", envFlagString("GOOGLE_CLOUD_PROJECT", ""), "The App Engine project ID.")
+	customHost = flag.String("custom-domain", envFlagString("CUSTOM_DOMAIN", "gvisor.dev"), "The application's custom domain.")
+)
+
+func main() {
+	flag.Parse()
+
+	registerRedirects(nil)
+	registerStatic(nil, *staticDir)
+
+	log.Printf("Listening on %s...", *addr)
+	log.Fatal(http.ListenAndServe(*addr, nil))
+}
diff --git a/website/cmd/syscalldocs/BUILD b/website/cmd/syscalldocs/BUILD
new file mode 100644
index 000000000..c5a0ed7fe
--- /dev/null
+++ b/website/cmd/syscalldocs/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "syscalldocs",
+    srcs = ["main.go"],
+    visibility = ["//website:__pkg__"],
+)
diff --git a/website/cmd/syscalldocs/main.go b/website/cmd/syscalldocs/main.go
new file mode 100644
index 000000000..327537214
--- /dev/null
+++ b/website/cmd/syscalldocs/main.go
@@ -0,0 +1,211 @@
+// Copyright 2019 The gVisor Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary syscalldocs generates system call markdown.
+package main
+
+import (
+	"bufio"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"text/template"
+)
+
+// CompatibilityInfo is the collection of all information.
+type CompatibilityInfo map[string]map[string]ArchInfo
+
+// ArchInfo is compatbility doc for an architecture.
+type ArchInfo struct {
+	// Syscalls maps syscall number for the architecture to the doc.
+	Syscalls map[uintptr]SyscallDoc `json:"syscalls"`
+}
+
+// SyscallDoc represents a single item of syscall documentation.
+type SyscallDoc struct {
+	Name    string   `json:"name"`
+	Support string   `json:"support"`
+	Note    string   `json:"note,omitempty"`
+	URLs    []string `json:"urls,omitempty"`
+}
+
+var mdTemplate = template.Must(template.New("out").Parse(`---
+title: {{.Title}}
+description: Syscall Compatibility Reference Documentation for {{.OS}}/{{.Arch}}
+layout: docs
+category: Compatibility
+weight: 50
+permalink: /docs/user_guide/compatibility/{{.OS}}/{{.Arch}}/
+---
+
+This table is a reference of {{.OS}} syscalls for the {{.Arch}} architecture and
+their compatibility status in gVisor. gVisor does not support all syscalls and
+some syscalls may have a partial implementation.
+
+This page is automatically generated from the source code.
+
+Of {{.Total}} syscalls, {{.Supported}} syscalls have a full or partial
+implementation. There are currently {{.Unsupported}} unsupported
+syscalls. {{if .Undocumented}}{{.Undocumented}} syscalls are not yet documented.{{end}}
+
+<table>
+  <thead>
+    <tr>
+      <th>#</th>
+      <th>Name</th>
+      <th>Support</th>
+      <th>Notes</th>
+    </tr>
+  </thead>
+  <tbody>
+ 	{{range $i, $syscall := .Syscalls}}
+    <tr>
+      <td><a class="doc-table-anchor" id="{{.Name}}"></a>{{.Number}}</td>
+      <td><a href="http://man7.org/linux/man-pages/man2/{{.Name}}.2.html" target="_blank" rel="noopener">{{.Name}}</a></td>
+      <td>{{.Support}}</td>
+	  <td>{{.Note}} {{range $i, $url := .URLs}}<br/>See: <a href="{{.}}">{{.}}</a>{{end}}</td>
+    </tr>
+	{{end}}
+  </tbody>
+</table>
+`))
+
+// Fatalf writes a message to stderr and exits with error code 1
+func Fatalf(format string, a ...interface{}) {
+	fmt.Fprintf(os.Stderr, format, a...)
+	os.Exit(1)
+}
+
+func main() {
+	inputFlag := flag.String("in", "-", "File to input ('-' for stdin)")
+	outputDir := flag.String("out", ".", "Directory to output files.")
+
+	flag.Parse()
+
+	var input io.Reader
+	if *inputFlag == "-" {
+		input = os.Stdin
+	} else {
+		i, err := os.Open(*inputFlag)
+		if err != nil {
+			Fatalf("Error opening %q: %v", *inputFlag, err)
+		}
+		input = i
+	}
+	input = bufio.NewReader(input)
+
+	var info CompatibilityInfo
+	d := json.NewDecoder(input)
+	if err := d.Decode(&info); err != nil {
+		Fatalf("Error reading json: %v", err)
+	}
+
+	weight := 0
+	for osName, osInfo := range info {
+		for archName, archInfo := range osInfo {
+			outDir := filepath.Join(*outputDir, osName)
+			outFile := filepath.Join(outDir, archName+".md")
+
+			if err := os.MkdirAll(outDir, 0755); err != nil {
+				Fatalf("Error creating directory %q: %v", *outputDir, err)
+			}
+
+			f, err := os.OpenFile(outFile, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644)
+			if err != nil {
+				Fatalf("Error opening file %q: %v", outFile, err)
+			}
+			defer f.Close()
+
+			weight += 10
+			data := struct {
+				Title        string
+				OS           string
+				Arch         string
+				Weight       int
+				Total        int
+				Supported    int
+				Unsupported  int
+				Undocumented int
+				Syscalls     []struct {
+					Name    string
+					Number  uintptr
+					Support string
+					Note    string
+					URLs    []string
+				}
+			}{
+				Title:        strings.Title(osName) + "/" + archName,
+				OS:           osName,
+				Arch:         archName,
+				Weight:       weight,
+				Total:        0,
+				Supported:    0,
+				Unsupported:  0,
+				Undocumented: 0,
+				Syscalls: []struct {
+					Name    string
+					Number  uintptr
+					Support string
+					Note    string
+					URLs    []string
+				}{},
+			}
+
+			for num, s := range archInfo.Syscalls {
+				switch s.Support {
+				case "Full Support", "Partial Support":
+					data.Supported++
+				case "Unimplemented":
+					data.Unsupported++
+				default:
+					data.Undocumented++
+				}
+				data.Total++
+
+				for i := range s.URLs {
+					if !strings.HasPrefix(s.URLs[i], "http://") && !strings.HasPrefix(s.URLs[i], "https://") {
+						s.URLs[i] = "https://" + s.URLs[i]
+					}
+				}
+
+				data.Syscalls = append(data.Syscalls, struct {
+					Name    string
+					Number  uintptr
+					Support string
+					Note    string
+					URLs    []string
+				}{
+					Name:    s.Name,
+					Number:  num,
+					Support: s.Support,
+					Note:    s.Note, // TODO urls
+					URLs:    s.URLs,
+				})
+			}
+
+			sort.Slice(data.Syscalls, func(i, j int) bool {
+				return data.Syscalls[i].Number < data.Syscalls[j].Number
+			})
+
+			if err := mdTemplate.Execute(f, data); err != nil {
+				Fatalf("Error writing file %q: %v", outFile, err)
+			}
+		}
+	}
+}
diff --git a/website/css/main.scss b/website/css/main.scss
new file mode 100644
index 000000000..06106833f
--- /dev/null
+++ b/website/css/main.scss
@@ -0,0 +1,5 @@
+@import 'style.scss';
+@import 'front.scss';
+@import 'navbar.scss';
+@import 'sidebar.scss';
+@import 'footer.scss';
diff --git a/website/defs.bzl b/website/defs.bzl
new file mode 100644
index 000000000..ead6a3067
--- /dev/null
+++ b/website/defs.bzl
@@ -0,0 +1,176 @@
+"""Wrappers for website documentation."""
+
+# DocInfo is a provider which simple adds sufficient metadata to the source
+# files (and additional data files) so that a jeyll header can be constructed
+# dynamically. This is done the via BUILD system so that the plain
+# documentation files can be viewable without non-compliant markdown headers.
+DocInfo = provider(
+    fields = [
+        "layout",
+        "description",
+        "permalink",
+        "category",
+        "subcategory",
+        "weight",
+        "editpath",
+        "authors",
+    ],
+)
+
+def _doc_impl(ctx):
+    return [
+        DefaultInfo(
+            files = depset(ctx.files.src + ctx.files.data),
+        ),
+        DocInfo(
+            layout = ctx.attr.layout,
+            description = ctx.attr.description,
+            permalink = ctx.attr.permalink,
+            category = ctx.attr.category,
+            subcategory = ctx.attr.subcategory,
+            weight = ctx.attr.weight,
+            editpath = ctx.files.src[0].short_path,
+            authors = ctx.attr.authors,
+        ),
+    ]
+
+doc = rule(
+    implementation = _doc_impl,
+    doc = "Annotate a document for jekyll headers.",
+    attrs = {
+        "src": attr.label(
+            doc = "The markdown source file.",
+            mandatory = True,
+            allow_single_file = True,
+        ),
+        "data": attr.label_list(
+            doc = "Additional data files (e.g. images).",
+            allow_files = True,
+        ),
+        "layout": attr.string(
+            doc = "The document layout.",
+            default = "docs",
+        ),
+        "description": attr.string(
+            doc = "The document description.",
+            default = "",
+        ),
+        "permalink": attr.string(
+            doc = "The document permalink.",
+            mandatory = True,
+        ),
+        "category": attr.string(
+            doc = "The document category.",
+            default = "",
+        ),
+        "subcategory": attr.string(
+            doc = "The document subcategory.",
+            default = "",
+        ),
+        "weight": attr.string(
+            doc = "The document weight.",
+            default = "50",
+        ),
+        "authors": attr.string_list(),
+    },
+)
+
+def _docs_impl(ctx):
+    # Tarball is the actual output.
+    tarball = ctx.actions.declare_file(ctx.label.name + ".tgz")
+
+    # But we need an intermediate builder to translate the files.
+    builder = ctx.actions.declare_file("%s-builder" % ctx.label.name)
+    builder_content = [
+        "#!/bin/bash",
+        "set -euo pipefail",
+        "declare -r T=$(mktemp -d)",
+        "function cleanup {",
+        "    rm -rf $T",
+        "}",
+        "trap cleanup EXIT",
+    ]
+    for dep in ctx.attr.deps:
+        doc = dep[DocInfo]
+
+        # Sanity check the permalink.
+        if not doc.permalink.endswith("/"):
+            fail("permalink %s for target %s should end with /" % (
+                doc.permalink,
+                ctx.label.name,
+            ))
+
+        # Construct the header.
+        header = """\
+description: {description}
+permalink: {permalink}
+category: {category}
+subcategory: {subcategory}
+weight: {weight}
+editpath: {editpath}
+authors: {authors}
+layout: {layout}"""
+
+        for f in dep.files.to_list():
+            # Is this a markdown file? If not, then we ensure that it ends up
+            # in the same path as the permalink for relative addressing.
+            if not f.basename.endswith(".md"):
+                builder_content.append("mkdir -p $T/%s" % doc.permalink)
+                builder_content.append("cp %s $T/%s" % (f.path, doc.permalink))
+                continue
+
+            # Is this a post? If yes, then we must put this in the _posts
+            # directory. This directory is treated specially with respect to
+            # pagination and page generation.
+            dest = f.short_path
+            if doc.layout == "post":
+                dest = "_posts/" + f.basename
+            builder_content.append("echo Processing %s... >&2" % f.short_path)
+            builder_content.append("mkdir -p $T/$(dirname %s)" % dest)
+
+            # Construct the header dynamically. We include the title field from
+            # the markdown itself, as this is the g3doc format required. The
+            # title will be injected by the web layout however, so we don't
+            # want this to appear in the document.
+            args = dict([(k, getattr(doc, k)) for k in dir(doc)])
+            builder_content.append("title=\"$(grep -E '^# ' %s | head -n 1 | cut -d'#' -f2- || true)\"" % f.path)
+            builder_content.append("cat >$T/%s <<EOF" % dest)
+            builder_content.append("---")
+            builder_content.append("title: $title")
+            builder_content.append(header.format(**args))
+            builder_content.append("---")
+            builder_content.append("EOF")
+
+            # To generate the final page, we need to strip out the title (which
+            # was pulled above to generate the annotation in the frontmatter,
+            # and substitute the [TOC] tag with the {% toc %} plugin tag. Note
+            # that the pipeline here is almost important, as the grep will
+            # return non-zero if the file is empty, but we ignore that within
+            # the pipeline.
+            builder_content.append("grep -v -E '^# ' %s | sed -e 's|^\\[TOC\\]$|- TOC\\n{:toc}|' >>$T/%s" %
+                                   (f.path, dest))
+
+    builder_content.append("declare -r filename=$(readlink -m %s)" % tarball.path)
+    builder_content.append("(cd $T && tar -zcf \"${filename}\" .)\n")
+    ctx.actions.write(builder, "\n".join(builder_content), is_executable = True)
+
+    # Generate the tarball.
+    ctx.actions.run(
+        inputs = depset(ctx.files.deps),
+        outputs = [tarball],
+        progress_message = "Generating %s" % ctx.label,
+        executable = builder,
+    )
+    return [DefaultInfo(
+        files = depset([tarball]),
+    )]
+
+docs = rule(
+    implementation = _docs_impl,
+    doc = "Construct a site tarball from doc dependencies.",
+    attrs = {
+        "deps": attr.label_list(
+            doc = "All document dependencies.",
+        ),
+    },
+)
diff --git a/test/runtimes/runner.sh b/website/import.sh
index a8d9a3460..e1350e83d 100755
--- a/test/runtimes/runner.sh
+++ b/website/import.sh
@@ -14,22 +14,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -euf -x -o pipefail
+set -xeuo pipefail
 
-echo -- "$@"
-
-# Create outputs dir if it does not exist.
-if [[ -n "${TEST_UNDECLARED_OUTPUTS_DIR}" ]]; then
-  mkdir -p "${TEST_UNDECLARED_OUTPUTS_DIR}"
-  chmod a+rwx "${TEST_UNDECLARED_OUTPUTS_DIR}"
+if [[ -d $0.runfiles ]]; then
+  cd $0.runfiles
 fi
 
-# Update the timestamp on the shard status file. Bazel looks for this.
-touch "${TEST_SHARD_STATUS_FILE}"
-
-# Get location of runner binary.
-readonly runner=$(find "${TEST_SRCDIR}" -name runner)
-
-# Pass the arguments of this script directly to the runner.
-exec "${runner}" "$@"
-
+exec docker import \
+  -c "EXPOSE 8080/tcp" \
+  -c "ENTRYPOINT [\"/server\"]" \
+  $(find . -name files.tgz) \
+  gvisor.dev/images/website
diff --git a/website/index.md b/website/index.md
new file mode 100644
index 000000000..84f877d49
--- /dev/null
+++ b/website/index.md
@@ -0,0 +1,50 @@
+<div class="jumbotron jumbotron-fluid">
+  <div class="container">
+    <div class="row">
+      <div class="col-md-3"></div>
+      <div class="col-md-6">
+        <p>gVisor is an <b>application kernel</b> for <b>containers</b> that provides efficient defense-in-depth anywhere.</p>
+        <p style="margin-top: 20px;">
+          <a class="btn" href="/docs/user_guide/quick_start/docker/">Quick start&nbsp;<i class="fas fa-arrow-alt-circle-right ml-2"></i></a>
+          <a class="btn" href="/docs/">Learn More&nbsp;<i class="fas fa-arrow-alt-circle-right ml-2"></i></a>
+        </p>
+      </div>
+      <div class="col-md-3"></div>
+    </div>
+  </div>
+</div>
+
+<div class="container"> <!-- Full page container. -->
+
+<div class="row">
+  <div class="col-md-4">
+    <h4 id="seamless-security">Container-native Security <i class="fas fa-lock"></i></h4>
+    <p>By providing each container with its own application kernel, gVisor
+    limits the attack surface of the host. This protection does not limit
+    functionality: gVisor runs unmodified binaries and integrates with container
+    orchestration systems, such as Docker and Kubernetes, and supports features
+    such as volumes and sidecars.</p>
+    <a class="button" href="/docs/architecture_guide/security/">Read More &raquo;</a>
+  </div>
+
+  <div class="col-md-4">
+    <h4 id="resource-efficiency">Resource Efficiency <i class="fas fa-feather-alt"></i></h4>
+    <p>Containers are efficient because workloads of different shapes and sizes
+    can be packed together by sharing host resources. gVisor uses host-native
+    abstractions, such as threads and memory mappings, to co-operate with the
+    host and enable the same resource model as native containers.</p>
+    <a class="button" href="/docs/architecture_guide/resources/">Read More &raquo;</a>
+  </div>
+
+  <div class="col-md-4">
+    <h4 id="platform-portability">Platform Portability <sup>&#9729;</sup>&#9729;</h4>
+    <p>Modern infrastructure spans multiple cloud services and data centers,
+    often with a mix of managed services and virtualized or traditional servers.
+    The pluggable platform architecture of gVisor allows it to run anywhere,
+    enabling consistent security policies across multiple environments without
+    having to rearchitect your infrastructure.</p>
+    <a class="button" href="/docs/architecture_guide/platforms/">Read More &raquo;</a>
+  </div>
+</div>
+
+</div> <!-- container -->
diff --git a/website/performance/README.md b/website/performance/README.md
new file mode 100644
index 000000000..0dbfd2f02
--- /dev/null
+++ b/website/performance/README.md
@@ -0,0 +1,9 @@
+# Performance data
+
+This directory holds the CSVs generated by the
+[benchmark-tools][benchmark-tools] repository.
+
+In the future, these will be automatically posted to a cloud storage bucket and
+loaded dynamically. At that point, this directory will be removed.
+
+[benchmark-tools]: https://github.com/google/gvisor/tree/master/benchmarks
diff --git a/website/performance/applications.csv b/website/performance/applications.csv
new file mode 100644
index 000000000..7b4661c60
--- /dev/null
+++ b/website/performance/applications.csv
@@ -0,0 +1,13 @@
+runtime,method,metric,result
+runc,http.node,transfer_rate,3814.85
+runc,http.node,latency,11.0
+runc,http.node,requests_per_second,885.81
+runc,http.ruby,transfer_rate,2874.38
+runc,http.ruby,latency,18.0
+runc,http.ruby,requests_per_second,539.97
+runsc,http.node,transfer_rate,1615.54
+runsc,http.node,latency,27.0
+runsc,http.node,requests_per_second,375.13
+runsc,http.ruby,transfer_rate,1382.71
+runsc,http.ruby,latency,38.0
+runsc,http.ruby,requests_per_second,259.75
diff --git a/website/performance/density.csv b/website/performance/density.csv
new file mode 100644
index 000000000..729b44941
--- /dev/null
+++ b/website/performance/density.csv
@@ -0,0 +1,9 @@
+runtime,method,metric,result
+runc,density.empty,memory_usage,4092149.76
+runc,density.node,memory_usage,76709888.0
+runc,density.ruby,memory_usage,45737000.96
+runsc,density.empty,memory_usage,23695032.32
+runsc,density.node,memory_usage,124076605.44
+runsc,density.ruby,memory_usage,106141777.92
+runc,density.redis,memory_usage,1055323750.4
+runsc,density.redis,memory_usage,1076686028.8
diff --git a/website/performance/ffmpeg.csv b/website/performance/ffmpeg.csv
new file mode 100644
index 000000000..08661c749
--- /dev/null
+++ b/website/performance/ffmpeg.csv
@@ -0,0 +1,3 @@
+runtime,metric,result
+runc,run_time,82.000625
+runsc,run_time,88.24018
diff --git a/website/performance/fio-tmpfs.csv b/website/performance/fio-tmpfs.csv
new file mode 100644
index 000000000..99777d2e4
--- /dev/null
+++ b/website/performance/fio-tmpfs.csv
@@ -0,0 +1,9 @@
+runtime,method,metric,result
+runc,fio.read,bandwidth,4240686080
+runc,fio.write,bandwidth,3029744640
+runsc,fio.read,bandwidth,2533604352
+runsc,fio.write,bandwidth,1207536640
+runc,fio.randread,bandwidth,1221472256
+runc,fio.randwrite,bandwidth,1046094848
+runsc,fio.randread,bandwidth,68940800
+runsc,fio.randwrite,bandwidth,67286016
diff --git a/website/performance/fio.csv b/website/performance/fio.csv
new file mode 100644
index 000000000..80d6ae289
--- /dev/null
+++ b/website/performance/fio.csv
@@ -0,0 +1,9 @@
+runtime,method,metric,result
+runc,fio.read,bandwidth,252253184
+runc,fio.write,bandwidth,457767936
+runsc,fio.read,bandwidth,252323840
+runsc,fio.write,bandwidth,431845376
+runc,fio.randread,bandwidth,5284864
+runc,fio.randwrite,bandwidth,107758592
+runsc,fio.randread,bandwidth,4403200
+runsc,fio.randwrite,bandwidth,69161984
diff --git a/website/performance/httpd100k.csv b/website/performance/httpd100k.csv
new file mode 100644
index 000000000..e92c7e9e0
--- /dev/null
+++ b/website/performance/httpd100k.csv
@@ -0,0 +1,17 @@
+connections,runtime,metric,result
+1,runc,transfer_rate,565.35
+1,runc,latency,1.0
+1,runsc,transfer_rate,282.84
+1,runsc,latency,2.0
+5,runc,transfer_rate,3260.57
+5,runc,latency,1.0
+5,runsc,transfer_rate,832.69
+5,runsc,latency,3.0
+10,runc,transfer_rate,4672.01
+10,runc,latency,1.0
+10,runsc,transfer_rate,1095.47
+10,runsc,latency,4.0
+25,runc,transfer_rate,4964.14
+25,runc,latency,2.0
+25,runsc,transfer_rate,961.03
+25,runsc,latency,12.0
diff --git a/website/performance/httpd10240k.csv b/website/performance/httpd10240k.csv
new file mode 100644
index 000000000..60dbe7b40
--- /dev/null
+++ b/website/performance/httpd10240k.csv
@@ -0,0 +1,17 @@
+connections,runtime,metric,result
+1,runc,transfer_rate,674.05
+1,runc,latency,1.0
+1,runsc,transfer_rate,243.35
+1,runsc,latency,2.0
+5,runc,transfer_rate,3089.83
+5,runc,latency,1.0
+5,runsc,transfer_rate,981.91
+5,runsc,latency,2.0
+10,runc,transfer_rate,4701.2
+10,runc,latency,1.0
+10,runsc,transfer_rate,1135.08
+10,runsc,latency,4.0
+25,runc,transfer_rate,5021.36
+25,runc,latency,2.0
+25,runsc,transfer_rate,963.26
+25,runsc,latency,12.0
diff --git a/website/performance/iperf.csv b/website/performance/iperf.csv
new file mode 100644
index 000000000..1f3b41aec
--- /dev/null
+++ b/website/performance/iperf.csv
@@ -0,0 +1,5 @@
+runtime,method,metric,result
+runc,network.download,bandwidth,746386000.0
+runc,network.upload,bandwidth,709808000.0
+runsc,network.download,bandwidth,640303500.0
+runsc,network.upload,bandwidth,482254000.0
diff --git a/website/performance/redis.csv b/website/performance/redis.csv
new file mode 100644
index 000000000..369b16712
--- /dev/null
+++ b/website/performance/redis.csv
@@ -0,0 +1,35 @@
+runtime,method,metric,result
+runc,PING_INLINE,requests_per_second,30525.03
+runc,PING_BULK,requests_per_second,30293.85
+runc,SET,requests_per_second,30257.19
+runc,GET,requests_per_second,30312.21
+runc,INCR,requests_per_second,30525.03
+runc,LPUSH,requests_per_second,30712.53
+runc,RPUSH,requests_per_second,30459.95
+runc,LPOP,requests_per_second,30367.45
+runc,RPOP,requests_per_second,30665.44
+runc,SADD,requests_per_second,30030.03
+runc,HSET,requests_per_second,30656.04
+runc,SPOP,requests_per_second,29940.12
+runc,LRANGE_100,requests_per_second,24224.81
+runc,LRANGE_300,requests_per_second,14302.06
+runc,LRANGE_500,requests_per_second,11728.83
+runc,LRANGE_600,requests_per_second,9900.99
+runc,MSET,requests_per_second,30120.48
+runsc,PING_INLINE,requests_per_second,14528.55
+runsc,PING_BULK,requests_per_second,15627.44
+runsc,SET,requests_per_second,15403.57
+runsc,GET,requests_per_second,15325.67
+runsc,INCR,requests_per_second,15269.51
+runsc,LPUSH,requests_per_second,15172.2
+runsc,RPUSH,requests_per_second,15117.16
+runsc,LPOP,requests_per_second,15257.86
+runsc,RPOP,requests_per_second,15188.33
+runsc,SADD,requests_per_second,15432.1
+runsc,HSET,requests_per_second,15163.0
+runsc,SPOP,requests_per_second,15561.78
+runsc,LRANGE_100,requests_per_second,13365.41
+runsc,LRANGE_300,requests_per_second,9520.18
+runsc,LRANGE_500,requests_per_second,8248.78
+runsc,LRANGE_600,requests_per_second,6544.07
+runsc,MSET,requests_per_second,14367.82
diff --git a/website/performance/startup.csv b/website/performance/startup.csv
new file mode 100644
index 000000000..6bad00df6
--- /dev/null
+++ b/website/performance/startup.csv
@@ -0,0 +1,7 @@
+runtime,method,metric,result
+runc,startup.empty,startup_time_ms,1193.10768
+runc,startup.node,startup_time_ms,2557.95336
+runc,startup.ruby,startup_time_ms,2530.12624
+runsc,startup.empty,startup_time_ms,1144.1775
+runsc,startup.node,startup_time_ms,2441.90284
+runsc,startup.ruby,startup_time_ms,2455.69882
diff --git a/website/performance/sysbench-cpu.csv b/website/performance/sysbench-cpu.csv
new file mode 100644
index 000000000..f4e6b69a6
--- /dev/null
+++ b/website/performance/sysbench-cpu.csv
@@ -0,0 +1,3 @@
+runtime,metric,result
+runc,cpu_events_per_second,103.62
+runsc,cpu_events_per_second,103.21
diff --git a/website/performance/sysbench-memory.csv b/website/performance/sysbench-memory.csv
new file mode 100644
index 000000000..626ff4994
--- /dev/null
+++ b/website/performance/sysbench-memory.csv
@@ -0,0 +1,3 @@
+runtime,metric,result
+runc,memory_ops_per_second,13098.73
+runsc,memory_ops_per_second,13107.44
diff --git a/website/performance/syscall.csv b/website/performance/syscall.csv
new file mode 100644
index 000000000..40bdce49e
--- /dev/null
+++ b/website/performance/syscall.csv
@@ -0,0 +1,4 @@
+runtime,metric,result
+runc,syscall_time_ns,1939.0
+runsc,syscall_time_ns,38219.0
+runsc-kvm,syscall_time_ns,763.0
diff --git a/website/performance/tensorflow.csv b/website/performance/tensorflow.csv
new file mode 100644
index 000000000..03498bef0
--- /dev/null
+++ b/website/performance/tensorflow.csv
@@ -0,0 +1,3 @@
+runtime,metric,result
+runc,run_time,207.1118165
+runsc,run_time,244.473401