214 files changed, 12077 insertions, 1525 deletions
diff --git a/.bazelrc b/.bazelrc
index 379fc8328..7f87e94b1 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Build with C++17.
+build --cxxopt=-std=c++17
+
 # Display the current git revision in the info block.
 build --stamp --workspace_status_command tools/workspace_status.sh
 
diff --git a/BUILD b/BUILD
index de410b008..76286174f 100644
--- a/BUILD
+++ b/BUILD
@@ -23,7 +23,18 @@ go_path(
         "//runsc",
 
         # Packages that are not dependencies of //runsc.
+        "//pkg/sentry/kernel/memevent",
+        "//pkg/tcpip/adapters/gonet",
         "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/muxed",
+        "//pkg/tcpip/link/sharedmem",
+        "//pkg/tcpip/link/sharedmem/pipe",
+        "//pkg/tcpip/link/sharedmem/queue",
+        "//pkg/tcpip/link/tun",
+        "//pkg/tcpip/link/waitable",
+        "//pkg/tcpip/sample/tun_tcp_connect",
+        "//pkg/tcpip/sample/tun_tcp_echo",
+        "//pkg/tcpip/transport/tcpconntrack",
     ],
 )
 
diff --git a/README.md b/README.md
index 5ac6f9046..de3e06f4e 100644
--- a/README.md
+++ b/README.md
@@ -48,9 +48,10 @@ Make sure the following dependencies are installed:
 
 *   Linux 4.14.77+ ([older linux][old-linux])
 *   [git][git]
-*   [Bazel][bazel] 0.28.0+
+*   [Bazel][bazel] 1.2+
 *   [Python][python]
 *   [Docker version 17.09.0 or greater][docker]
+*   C++ toolchain supporting C++17 (GCC 7+, Clang 5+)
 *   Gold linker (e.g. `binutils-gold` package on Ubuntu)
 
 ### Building
diff --git a/WORKSPACE b/WORKSPACE
index 4561ed8fc..4b5a3bfe2 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -106,6 +106,45 @@ load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
 
 rules_pkg_dependencies()
 
+# Container rules.
+http_archive(
+    name = "io_bazel_rules_docker",
+    sha256 = "14ac30773fdb393ddec90e158c9ec7ebb3f8a4fd533ec2abbfd8789ad81a284b",
+    strip_prefix = "rules_docker-0.12.1",
+    urls = ["https://github.com/bazelbuild/rules_docker/releases/download/v0.12.1/rules_docker-v0.12.1.tar.gz"],
+)
+
+load(
+    "@io_bazel_rules_docker//repositories:repositories.bzl",
+    container_repositories = "repositories",
+)
+
+container_repositories()
+
+load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
+
+container_deps()
+
+load(
+    "@io_bazel_rules_docker//container:container.bzl",
+    "container_pull",
+)
+
+# This container is built from the Dockerfile in test/iptables/runner.
+container_pull(
+    name = "iptables-test",
+    registry = "gcr.io",
+    repository = "gvisor-presubmit/iptables-test",
+    digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65",
+)
+
+load(
+    "@io_bazel_rules_docker//go:image.bzl",
+    _go_image_repos = "repositories",
+)
+
+_go_image_repos()
+
 # External repositories, in sorted order.
 go_repository(
     name = "com_github_cenkalti_backoff",
diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
index 2166d040a..66b719b63 100644
--- a/benchmarks/harness/machine.py
+++ b/benchmarks/harness/machine.py
@@ -11,7 +11,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Machine abstraction. This is the primary API for benchmarks."""
+"""Machine abstraction passed to benchmarks to run docker containers.
+
+Abstraction for interacting with test machines. Machines are produced
+by Machine producers and represent a local or remote machine. Benchmark
+methods in /benchmarks/suite are passed the required number of machines in order
+to run the benchmark. Machines contain methods to run commands via bash,
+possibly over ssh. Machines also hold a connection to the docker UNIX socket
+to run contianers.
+
+  Typical usage example:
+
+  machine = Machine()
+  machine.run(cmd)
+  machine.pull(path)
+  container = machine.container()
+"""
 
 import logging
 import re
@@ -28,12 +43,16 @@ from benchmarks.harness import ssh_connection
 from benchmarks.harness import tunnel_dispatcher
 
 
-class Machine:
+class Machine(object):
   """The machine object is the primary object for benchmarks.
 
   Machine objects are passed to each metric function call and benchmarks use
   machines to access real connections to those machines.
+
+  Attributes:
+    _name: Name as a string
   """
+  _name = ""
 
   def run(self, cmd: str) -> Tuple[str, str]:
     """Convenience method for running a bash command on a machine object.
@@ -90,11 +109,15 @@ class Machine:
 
   def sleep(self, amount: float):
     """Sleeps the given amount of time."""
-    raise NotImplementedError
+    time.sleep(amount)
+
+  def __str__(self):
+    return self._name
 
 
 class MockMachine(Machine):
   """A mocked machine."""
+  _name = "mock"
 
   def run(self, cmd: str) -> Tuple[str, str]:
     return "", ""
@@ -119,15 +142,18 @@ def get_address(machine: Machine) -> str:
 
 
 class LocalMachine(Machine):
-  """The local machine."""
+  """The local machine.
+
+  Attributes:
+    _name: Name as a string
+    _docker_client: a pythonic connection to to the local dockerd unix socket.
+      See: https://github.com/docker/docker-py
+  """
 
   def __init__(self, name):
     self._name = name
     self._docker_client = docker.from_env()
 
-  def __str__(self):
-    return self._name
-
   def run(self, cmd: str) -> Tuple[str, str]:
     process = subprocess.Popen(
         cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -155,7 +181,17 @@ class LocalMachine(Machine):
 
 
 class RemoteMachine(Machine):
-  """Remote machine accessible via an SSH connection."""
+  """Remote machine accessible via an SSH connection.
+
+  Attributes:
+    _name: Name as a string
+    _ssh_connection: a paramiko backed ssh connection which can be used to run
+      commands on this machine
+    _tunnel: a python wrapper around a port forwarded ssh connection between a
+      local unix socket and the remote machine's dockerd unix socket.
+    _docker_client: a pythonic wrapper backed by the _tunnel. Allows sending
+      docker commands: see https://github.com/docker/docker-py
+  """
 
   def __init__(self, name, **kwargs):
     self._name = name
@@ -164,9 +200,6 @@ class RemoteMachine(Machine):
     self._tunnel.connect()
     self._docker_client = self._tunnel.get_docker_client()
 
-  def __str__(self):
-    return self._name
-
   def run(self, cmd: str) -> Tuple[str, str]:
     return self._ssh_connection.run(cmd)
 
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
index 5b2228e01..a48da02a1 100644
--- a/benchmarks/harness/machine_producers/BUILD
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -33,3 +33,8 @@ py_library(
         requirement("PyYAML", False),
     ],
 )
+
+py_library(
+    name = "gcloud_mock_recorder",
+    srcs = ["gcloud_mock_recorder.py"],
+)
diff --git a/benchmarks/harness/machine_producers/gcloud_mock_recorder.py b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py
new file mode 100644
index 000000000..fd9837a37
--- /dev/null
+++ b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py
@@ -0,0 +1,97 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A recorder and replay for testing the GCloudProducer.
+
+MockPrinter and MockReader handle printing and reading mock data for the
+purposes of testing. MockPrinter is passed to GCloudProducer objects. The user
+can then run scenarios and record them for playback in tests later.
+
+MockReader is passed to MockGcloudProducer objects and handles reading the
+previously recorded mock data.
+
+It is left to the user to check if data printed is properly redacted for their
+own use. The intended usecase for this class is data coming from gcloud
+commands, which will contain public IPs and other instance data.
+
+The data format is json and printed/read from the ./test_data directory. The
+data is the output of subprocess.CompletedProcess objects in json format.
+
+  Typical usage example:
+
+  recorder = MockPrinter()
+  producer = GCloudProducer(args, recorder)
+  machines = producer.get_machines(1)
+  with open("my_file.json") as fd:
+    recorder.write_out(fd)
+
+  reader = MockReader(filename)
+  producer = MockGcloudProducer(args, mock)
+  machines = producer.get_machines(1)
+  assert len(machines) == 1
+"""
+
+import io
+import json
+import subprocess
+
+
+class MockPrinter(object):
+  """Handles printing Mock data for MockGcloudProducer.
+
+  Attributes:
+    _records: list of json object records for printing
+  """
+
+  def __init__(self):
+    self._records = []
+
+  def record(self, entry: subprocess.CompletedProcess):
+    """Records data and strips out ip addresses."""
+
+    record = {
+        "args": entry.args,
+        "stdout": entry.stdout.decode("utf-8"),
+        "returncode": str(entry.returncode)
+    }
+    self._records.append(record)
+
+  def write_out(self, fd: io.FileIO):
+    """Prints out the data into the given filepath."""
+    fd.write(json.dumps(self._records, indent=4))
+
+
+class MockReader(object):
+  """Handles reading Mock data for MockGcloudProducer.
+
+  Attributes:
+    _records: List[json] records read from the passed in file.
+  """
+
+  def __init__(self, filepath: str):
+    with open(filepath, "rb") as file:
+      self._records = json.loads(file.read())
+      self._i = 0
+
+  def __iter__(self):
+    return self
+
+  def __next__(self, args) -> subprocess.CompletedProcess:
+    """Returns the next record as a CompletedProcess."""
+    if self._i < len(self._records):
+      record = self._records[self._i]
+      stdout = record["stdout"].encode("ascii")
+      returncode = int(record["returncode"])
+      return subprocess.CompletedProcess(
+          args=args, returncode=returncode, stdout=stdout, stderr=b"")
+    raise StopIteration()
diff --git a/benchmarks/harness/tunnel_dispatcher.py b/benchmarks/harness/tunnel_dispatcher.py
index 8dfe2862a..c56fd022a 100644
--- a/benchmarks/harness/tunnel_dispatcher.py
+++ b/benchmarks/harness/tunnel_dispatcher.py
@@ -11,7 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tunnel handles setting up connections to remote machines."""
+"""Tunnel handles setting up connections to remote machines.
+
+Tunnel dispatcher is a wrapper around the connection from a local UNIX socket
+and a remote UNIX socket via SSH with port forwarding. This is done to
+initialize the pythonic dockerpy client to run containers on the remote host by
+connecting to /var/run/docker.sock (where Docker is listening). Tunnel
+dispatcher sets up the local UNIX socket and calls the `ssh` command as a
+subprocess, and holds a reference to that subprocess. It manages clean-up on
+exit as best it can by killing the ssh subprocess and deleting the local UNIX
+socket,stored in /tmp for easy cleanup in most systems if this fails.
+
+  Typical usage example:
+
+  t = Tunnel(name, **kwargs)
+  t.connect()
+  client = t.get_docker_client() #
+  client.containers.run("ubuntu", "echo hello world")
+
+"""
 
 import os
 import tempfile
@@ -21,31 +39,53 @@ import docker
 import pexpect
 
 SSH_TUNNEL_COMMAND = """ssh
- -o GlobalKnownHostsFile=/dev/null
- -o UserKnownHostsFile=/dev/null
- -o StrictHostKeyChecking=no
- -nNT -L {filename}:/var/run/docker.sock
- -i {key_path}
- {username}@{hostname}"""
+    -o GlobalKnownHostsFile=/dev/null
+    -o UserKnownHostsFile=/dev/null
+    -o StrictHostKeyChecking=no
+    -o IdentitiesOnly=yes
+    -nNT -L {filename}:/var/run/docker.sock
+    -i {key_path}
+    {username}@{hostname}"""
 
 
-class Tunnel:
+class Tunnel(object):
   """The tunnel object represents the tunnel via ssh.
 
   This connects a local unix domain socket with a remote socket.
+
+  Attributes:
+      _filename: a temporary name of the UNIX socket prefixed by the name
+        argument.
+      _hostname: the IP or resolvable hostname of the remote host.
+      _username: the username of the ssh_key used to run ssh.
+      _key_path: path to a valid key.
+      _key_password: optional password to the ssh key in _key_path
+      _process: holds reference to the ssh subprocess created.
+
+    Returns:
+      The new minimum port.
+
+    Raises:
+      ConnectionError: If no available port is found.
   """
 
-  def __init__(self, name, hostname: str, username: str, key_path: str,
+  def __init__(self,
+               name: str,
+               hostname: str,
+               username: str,
+               key_path: str,
+               key_password: str = "",
                **kwargs):
     self._filename = tempfile.NamedTemporaryFile(prefix=name).name
     self._hostname = hostname
     self._username = username
     self._key_path = key_path
+    self._key_password = key_password
     self._kwargs = kwargs
     self._process = None
 
   def connect(self):
-    """Connects the SSH tunnel."""
+    """Connects the SSH tunnel and stores the subprocess reference in _process."""
     cmd = SSH_TUNNEL_COMMAND.format(
         filename=self._filename,
         key_path=self._key_path,
@@ -54,9 +94,9 @@ class Tunnel:
     self._process = pexpect.spawn(cmd, timeout=10)
 
     # If given a password, assume we'll be asked for it.
-    if "key_password" in self._kwargs:
+    if self._key_password:
       self._process.expect(["Enter passphrase for key .*: "])
-      self._process.sendline(self._kwargs["key_password"])
+      self._process.sendline(self._key_password)
 
     while True:
       # Wait for the tunnel to appear.
@@ -71,7 +111,7 @@ class Tunnel:
     return self._filename
 
   def get_docker_client(self):
-    """Returns a docker client for this Tunne0l."""
+    """Returns a docker client for this Tunnel."""
     return docker.DockerClient(base_url="unix:/" + self._filename)
 
   def __del__(self):
diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD
index a3941da42..de24824cc 100644
--- a/benchmarks/runner/BUILD
+++ b/benchmarks/runner/BUILD
@@ -34,7 +34,10 @@ py_test(
     name = "runner_test",
     srcs = ["runner_test.py"],
     python_version = "PY3",
-    tags = ["local"],
+    tags = [
+        "local",
+        "manual",
+    ],
     deps = [
         ":runner",
         requirement("click", True),
diff --git a/benchmarks/workloads/node_template/package-lock.json b/benchmarks/workloads/node_template/package-lock.json
index 1653597a1..580e68aa5 100644
--- a/benchmarks/workloads/node_template/package-lock.json
+++ b/benchmarks/workloads/node_template/package-lock.json
@@ -233,9 +233,9 @@
       "integrity": "sha1-6qM9bd16zo9/b+DJygRA5wZzix4="
     },
     "lodash": {
-      "version": "4.17.11",
-      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
-      "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
+      "version": "4.17.15",
+      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz",
+      "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A=="
     },
     "media-typer": {
       "version": "0.3.0",
@@ -364,6 +364,16 @@
         }
       }
     },
+    "redis-commands": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.5.0.tgz",
+      "integrity": "sha512-6KxamqpZ468MeQC3bkWmCB1fp56XL64D4Kf0zJSwDZbVLLm7KFkoIcHrgRvQ+sk8dnhySs7+yBg94yIkAK7aJg=="
+    },
+    "redis-parser": {
+      "version": "2.6.0",
+      "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz",
+      "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs="
+    },
     "safe-buffer": {
       "version": "5.1.2",
       "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
diff --git a/go.mod b/go.mod
index 821273d22..304b8bf13 100644
--- a/go.mod
+++ b/go.mod
@@ -1,21 +1,21 @@
 module gvisor.dev/gvisor
 
-go 1.12
+go 1.13
 
 require (
-	github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
-	github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
-	github.com/golang/mock v1.3.1
-	github.com/golang/protobuf v1.3.1
-	github.com/google/btree v1.0.0
-	github.com/google/go-cmp v0.2.0
-	github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
-	github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
-	github.com/kr/pty v1.1.1
-	github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
-	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
-	github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
-	github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
-	golang.org/x/net v0.0.0-20190311183353-d8887717615a
-	golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+  github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+  github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+  github.com/golang/mock v1.3.1
+  github.com/golang/protobuf v1.3.1
+  github.com/google/btree v1.0.0
+  github.com/google/go-cmp v0.2.0
+  github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+  github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
+  github.com/kr/pty v1.1.1
+  github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+  github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+  github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+  github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
+  golang.org/x/net v0.0.0-20190311183353-d8887717615a
+  golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
 )
diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg
new file mode 100644
index 000000000..7af20629a
--- /dev/null
+++ b/kokoro/iptables_tests.cfg
@@ -0,0 +1,10 @@
+build_file: "repo/scripts/iptables_test.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+    regex: "**/runsc_logs_*.tar.gz"
+  }
+}
diff --git a/kokoro/kythe/generate_xrefs.cfg b/kokoro/kythe/generate_xrefs.cfg
new file mode 100644
index 000000000..03e65c54e
--- /dev/null
+++ b/kokoro/kythe/generate_xrefs.cfg
@@ -0,0 +1,28 @@
+build_file: "gvisor/kokoro/kythe/generate_xrefs.sh"
+
+before_action {
+  fetch_keystore {
+    keystore_resource {
+      keystore_config_id: 73898
+      keyname: "kokoro-rbe-service-account"
+    }
+  }
+}
+
+bazel_setting {
+  project_id: "gvisor-rbe"
+  local_execution: false
+  auth_credential: {
+    keystore_config_id: 73898
+    keyname: "kokoro-rbe-service-account"
+  }
+  bes_backend_address: "buildeventservice.googleapis.com"
+  foundry_backend_address: "remotebuildexecution.googleapis.com"
+  upsalite_frontend_address: "https://source.cloud.google.com"
+}
+
+action {
+  define_artifacts {
+    regex: "*.kzip"
+  }
+}
diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
new file mode 100644
index 000000000..799467a34
--- /dev/null
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+# Install the latest version of Bazel. The default on Kokoro images is out of
+# date.
+if command -v use_bazel.sh >/dev/null; then
+  use_bazel.sh latest
+fi
+bazel version
+
+python3 -V
+
+readonly KYTHE_VERSION='v0.0.37'
+readonly WORKDIR="$(mktemp -d)"
+readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
+if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
+  readonly KZIP_FILENAME="${KOKORO_ARTIFACTS_DIR}/${KOKORO_GIT_COMMIT}.kzip"
+else
+  readonly KZIP_FILENAME="$(git rev-parse HEAD).kzip"
+fi
+
+wget -q -O "${WORKDIR}/kythe.tar.gz" \
+  "https://github.com/kythe/kythe/releases/download/${KYTHE_VERSION}/kythe-${KYTHE_VERSION}.tar.gz"
+tar --no-same-owner -xzf "${WORKDIR}/kythe.tar.gz" --directory "$WORKDIR"
+
+if [[ -n "$KOKORO_ARTIFACTS_DIR" ]]; then
+  cd "${KOKORO_ARTIFACTS_DIR}/github/gvisor"
+fi
+bazel \
+  --bazelrc="${KYTHE_DIR}/extractors.bazelrc" \
+  build \
+  --override_repository kythe_release="${KYTHE_DIR}" \
+  --define=kythe_corpus=gvisor.dev \
+  --cxxopt=-std=c++17 \
+  //...
+
+"${KYTHE_DIR}/tools/kzip" merge \
+  --output "$KZIP_FILENAME" \
+  $(find -L bazel-out/*/extra_actions/ -name '*.kzip')
diff --git a/kokoro/ubuntu1604/10_core.sh b/kokoro/ubuntu1604/10_core.sh
index e87a6eee8..46dda6bb1 100755
--- a/kokoro/ubuntu1604/10_core.sh
+++ b/kokoro/ubuntu1604/10_core.sh
@@ -21,8 +21,8 @@ apt-get update && apt-get -y install make git-core build-essential linux-headers
 
 # Install a recent go toolchain.
 if ! [[ -d /usr/local/go ]]; then
-    wget https://dl.google.com/go/go1.12.linux-amd64.tar.gz
-    tar -xvf go1.12.linux-amd64.tar.gz
+    wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz
+    tar -xvf go1.13.5.linux-amd64.tar.gz
     mv go /usr/local
 fi
 
diff --git a/kokoro/ubuntu1604/README.md b/kokoro/ubuntu1604/README.md
new file mode 100644
index 000000000..64f913b9a
--- /dev/null
+++ b/kokoro/ubuntu1604/README.md
@@ -0,0 +1,34 @@
+## Image Update
+
+After making changes to files in the directory, you must run the following
+commands to update the image Kokoro uses:
+
+```shell
+gcloud config set project gvisor-kokoro-testing
+third_party/gvisor/kokoro/ubuntu1604/build.sh
+third_party/gvisor/kokoro/ubuntu1804/build.sh
+```
+
+Note: the command above will change your default project for `gcloud`. Run
+`gcloud config set project` again to revert back to your default project.
+
+Note: Files in `third_party/gvisor/kokoro/ubuntu1804/` as symlinks to
+`ubuntu1604`, therefore both images must be updated.
+
+After the script finishes, the last few lines of the output will container the
+image name. If the output was lost, you can run `build.sh` again to print the
+image name.
+
+```
+NAME                    PROJECT                FAMILY  DEPRECATED  STATUS
+image-6777fa4666a968c8  gvisor-kokoro-testing                      READY
++ cleanup
++ gcloud compute instances delete --quiet build-tlfrdv
+Deleted [https://www.googleapis.com/compute/v1/projects/gvisor-kokoro-testing/zones/us-central1-f/instances/build-tlfrdv].
+```
+
+To setup Kokoro to use the new image, copy the image names to their
+corresponding file below:
+
+*   //devtools/kokoro/config/gcp/gvisor/ubuntu1604.gcl
+*   //devtools/kokoro/config/gcp/gvisor/ubuntu1804.gcl
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 51774c6b6..9553f164d 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -57,6 +57,7 @@ go_library(
         "uio.go",
         "utsname.go",
         "wait.go",
+        "xattr.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/abi/linux",
     visibility = ["//visibility:public"],
diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go
index f78315ebf..6663a199c 100644
--- a/pkg/abi/linux/fcntl.go
+++ b/pkg/abi/linux/fcntl.go
@@ -16,15 +16,17 @@ package linux
 
 // Commands from linux/fcntl.h.
 const (
-	F_DUPFD         = 0x0
-	F_GETFD         = 0x1
-	F_SETFD         = 0x2
-	F_GETFL         = 0x3
-	F_SETFL         = 0x4
-	F_SETLK         = 0x6
-	F_SETLKW        = 0x7
-	F_SETOWN        = 0x8
-	F_GETOWN        = 0x9
+	F_DUPFD         = 0
+	F_GETFD         = 1
+	F_SETFD         = 2
+	F_GETFL         = 3
+	F_SETFL         = 4
+	F_SETLK         = 6
+	F_SETLKW        = 7
+	F_SETOWN        = 8
+	F_GETOWN        = 9
+	F_SETOWN_EX     = 15
+	F_GETOWN_EX     = 16
 	F_DUPFD_CLOEXEC = 1024 + 6
 	F_SETPIPE_SZ    = 1024 + 7
 	F_GETPIPE_SZ    = 1024 + 8
@@ -32,9 +34,9 @@ const (
 
 // Commands for F_SETLK.
 const (
-	F_RDLCK = 0x0
-	F_WRLCK = 0x1
-	F_UNLCK = 0x2
+	F_RDLCK = 0
+	F_WRLCK = 1
+	F_UNLCK = 2
 )
 
 // Flags for fcntl.
@@ -42,7 +44,7 @@ const (
 	FD_CLOEXEC = 00000001
 )
 
-// Lock structure for F_SETLK.
+// Flock is the lock structure for F_SETLK.
 type Flock struct {
 	Type   int16
 	Whence int16
@@ -52,3 +54,16 @@ type Flock struct {
 	Pid    int32
 	_      [4]byte
 }
+
+// Flags for F_SETOWN_EX and F_GETOWN_EX.
+const (
+	F_OWNER_TID  = 0
+	F_OWNER_PID  = 1
+	F_OWNER_PGRP = 2
+)
+
+// FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX.
+type FOwnerEx struct {
+	Type int32
+	PID  int32
+}
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index c9ee098f4..16791d03e 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -144,9 +144,13 @@ const (
 	ModeCharacterDevice = S_IFCHR
 	ModeNamedPipe       = S_IFIFO
 
-	ModeSetUID = 04000
-	ModeSetGID = 02000
-	ModeSticky = 01000
+	S_ISUID = 04000
+	S_ISGID = 02000
+	S_ISVTX = 01000
+
+	ModeSetUID = S_ISUID
+	ModeSetGID = S_ISGID
+	ModeSticky = S_ISVTX
 
 	ModeUserAll     = 0700
 	ModeUserRead    = 0400
@@ -282,6 +286,29 @@ func (m FileMode) String() string {
 	return strings.Join(s, "|")
 }
 
+// DirentType maps file types to dirent types appropriate for (struct
+// dirent)::d_type.
+func (m FileMode) DirentType() uint8 {
+	switch m.FileType() {
+	case ModeSocket:
+		return DT_SOCK
+	case ModeSymlink:
+		return DT_LNK
+	case ModeRegular:
+		return DT_REG
+	case ModeBlockDevice:
+		return DT_BLK
+	case ModeDirectory:
+		return DT_DIR
+	case ModeCharacterDevice:
+		return DT_CHR
+	case ModeNamedPipe:
+		return DT_FIFO
+	default:
+		return DT_UNKNOWN
+	}
+}
+
 var modeExtraBits = abi.FlagSet{
 	{
 		Flag: ModeSetUID,
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index b416e3472..2c652baa2 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -92,3 +92,10 @@ const (
 	SYNC_FILE_RANGE_WRITE       = 2
 	SYNC_FILE_RANGE_WAIT_AFTER  = 4
 )
+
+// Flag argument to renameat2(2), from include/uapi/linux/fs.h.
+const (
+	RENAME_NOREPLACE = (1 << 0) // Don't overwrite target.
+	RENAME_EXCHANGE  = (1 << 1) // Exchange src and dst.
+	RENAME_WHITEOUT  = (1 << 2) // Whiteout src.
+)
diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go
new file mode 100644
index 000000000..a3b6406fa
--- /dev/null
+++ b/pkg/abi/linux/xattr.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Constants for extended attributes.
+const (
+	XATTR_NAME_MAX = 255
+	XATTR_SIZE_MAX = 65536
+
+	XATTR_CREATE  = 1
+	XATTR_REPLACE = 2
+
+	XATTR_USER_PREFIX     = "user."
+	XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX)
+)
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 51869c7d6..b9582c07f 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -257,7 +257,6 @@ func CanOpen(mode FileMode) bool {
 
 // handle implements handler.handle.
 func (t *Tlopen) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -294,7 +293,6 @@ func (t *Tlopen) handle(cs *connState) message {
 			return syscall.EINVAL
 		}
 
-		// Do the open.
 		osFile, qid, ioUnit, err = ref.file.Open(t.Flags)
 		return err
 	}); err != nil {
@@ -311,12 +309,10 @@ func (t *Tlopen) handle(cs *connState) message {
 }
 
 func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return nil, err
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return nil, syscall.EBADF
@@ -390,12 +386,10 @@ func (t *Tsymlink) handle(cs *connState) message {
 }
 
 func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return nil, err
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return nil, syscall.EBADF
@@ -426,19 +420,16 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) {
 
 // handle implements handler.handle.
 func (t *Tlink) handle(cs *connState) message {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return newErr(err)
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return newErr(syscall.EBADF)
 	}
 	defer ref.DecRef()
 
-	// Lookup the other FID.
 	refTarget, ok := cs.LookupFID(t.Target)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -467,7 +458,6 @@ func (t *Tlink) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Trenameat) handle(cs *connState) message {
-	// Don't allow complex names.
 	if err := checkSafeName(t.OldName); err != nil {
 		return newErr(err)
 	}
@@ -475,14 +465,12 @@ func (t *Trenameat) handle(cs *connState) message {
 		return newErr(err)
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.OldDirectory)
 	if !ok {
 		return newErr(syscall.EBADF)
 	}
 	defer ref.DecRef()
 
-	// Lookup the other FID.
 	refTarget, ok := cs.LookupFID(t.NewDirectory)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -523,12 +511,10 @@ func (t *Trenameat) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tunlinkat) handle(cs *connState) message {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return newErr(err)
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -577,19 +563,16 @@ func (t *Tunlinkat) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Trename) handle(cs *connState) message {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return newErr(err)
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
 	}
 	defer ref.DecRef()
 
-	// Lookup the target.
 	refTarget, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -641,7 +624,6 @@ func (t *Trename) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Treadlink) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -669,7 +651,6 @@ func (t *Treadlink) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tread) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -708,7 +689,6 @@ func (t *Tread) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Twrite) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -747,12 +727,10 @@ func (t *Tmknod) handle(cs *connState) message {
 }
 
 func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return nil, err
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return nil, syscall.EBADF
@@ -791,12 +769,10 @@ func (t *Tmkdir) handle(cs *connState) message {
 }
 
 func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) {
-	// Don't allow complex names.
 	if err := checkSafeName(t.Name); err != nil {
 		return nil, err
 	}
 
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return nil, syscall.EBADF
@@ -827,7 +803,6 @@ func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) {
 
 // handle implements handler.handle.
 func (t *Tgetattr) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -856,7 +831,6 @@ func (t *Tgetattr) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tsetattr) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -883,7 +857,6 @@ func (t *Tsetattr) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tallocate) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -917,7 +890,6 @@ func (t *Tallocate) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Txattrwalk) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -930,7 +902,6 @@ func (t *Txattrwalk) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Txattrcreate) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -943,7 +914,6 @@ func (t *Txattrcreate) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Treaddir) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.Directory)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -977,7 +947,6 @@ func (t *Treaddir) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tfsync) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1001,7 +970,6 @@ func (t *Tfsync) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tstatfs) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1192,7 +1160,6 @@ func doWalk(cs *connState, ref *fidRef, names []string, getattr bool) (qids []QI
 
 // handle implements handler.handle.
 func (t *Twalk) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1213,7 +1180,6 @@ func (t *Twalk) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Twalkgetattr) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1270,7 +1236,6 @@ func (t *Tumknod) handle(cs *connState) message {
 
 // handle implements handler.handle.
 func (t *Tlconnect) handle(cs *connState) message {
-	// Lookup the FID.
 	ref, ok := cs.LookupFID(t.FID)
 	if !ok {
 		return newErr(syscall.EBADF)
@@ -1303,7 +1268,6 @@ func (t *Tchannel) handle(cs *connState) message {
 		return newErr(err)
 	}
 
-	// Lookup the given channel.
 	ch := cs.lookupChannel(t.ID)
 	if ch == nil {
 		return newErr(syscall.ENOSYS)
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 9e7db8b30..67daa6c24 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -305,7 +305,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
 		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
 		return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil
 	}
-	// TODO(b/34088053): debug registers
+	// Note: x86 debug registers are missing.
 	return c.Native(0), nil
 }
 
@@ -320,6 +320,6 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error {
 		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
 		return err
 	}
-	// TODO(b/34088053): debug registers
+	// Note: x86 debug registers are missing.
 	return nil
 }
diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 1f78d54a2..e1f2fea60 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -22,6 +22,7 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
@@ -56,6 +57,9 @@ type Profile struct {
 
 	// traceFile is the current execution trace output file.
 	traceFile *fd.FD
+
+	// Kernel is the kernel under profile.
+	Kernel *kernel.Kernel
 }
 
 // StartCPUProfile is an RPC stub which starts recording the CPU profile in a
@@ -147,6 +151,9 @@ func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error {
 		return err
 	}
 
+	// Ensure all trace contexts are registered.
+	p.Kernel.RebuildTraceContexts()
+
 	p.traceFile = output
 	return nil
 }
@@ -158,9 +165,15 @@ func (p *Profile) StopTrace(_, _ *struct{}) error {
 	defer p.mu.Unlock()
 
 	if p.traceFile == nil {
-		return errors.New("Execution tracing not start")
+		return errors.New("Execution tracing not started")
 	}
 
+	// Similarly to the case above, if tasks have not ended traces, we will
+	// lose information. Thus we need to rebuild the tasks in order to have
+	// complete information. This will not lose information if multiple
+	// traces are overlapping.
+	p.Kernel.RebuildTraceContexts()
+
 	trace.Stop()
 	p.traceFile.Close()
 	p.traceFile = nil
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index c35faeb4c..ced51c66c 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -268,14 +268,17 @@ func (proc *Proc) Ps(args *PsArgs, out *string) error {
 }
 
 // Process contains information about a single process in a Sandbox.
-// TODO(b/117881927): Implement TTY field.
 type Process struct {
 	UID auth.KUID       `json:"uid"`
 	PID kernel.ThreadID `json:"pid"`
 	// Parent PID
-	PPID kernel.ThreadID `json:"ppid"`
+	PPID    kernel.ThreadID   `json:"ppid"`
+	Threads []kernel.ThreadID `json:"threads"`
 	// Processor utilization
 	C int32 `json:"c"`
+	// TTY name of the process. Will be of the form "pts/N" if there is a
+	// TTY, or "?" if there is not.
+	TTY string `json:"tty"`
 	// Start time
 	STime string `json:"stime"`
 	// CPU time
@@ -285,18 +288,19 @@ type Process struct {
 }
 
 // ProcessListToTable prints a table with the following format:
-// UID       PID       PPID      C         STIME     TIME       CMD
-// 0         1         0         0         14:04     505262ns   tail
+// UID       PID       PPID      C         TTY		STIME     TIME       CMD
+// 0         1         0         0         pty/4	14:04     505262ns   tail
 func ProcessListToTable(pl []*Process) string {
 	var buf bytes.Buffer
 	tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0)
-	fmt.Fprint(tw, "UID\tPID\tPPID\tC\tSTIME\tTIME\tCMD")
+	fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD")
 	for _, d := range pl {
-		fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s",
+		fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s",
 			d.UID,
 			d.PID,
 			d.PPID,
 			d.C,
+			d.TTY,
 			d.STime,
 			d.Time,
 			d.Cmd)
@@ -307,7 +311,7 @@ func ProcessListToTable(pl []*Process) string {
 
 // ProcessListToJSON will return the JSON representation of ps.
 func ProcessListToJSON(pl []*Process) (string, error) {
-	b, err := json.Marshal(pl)
+	b, err := json.MarshalIndent(pl, "", "  ")
 	if err != nil {
 		return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err)
 	}
@@ -334,7 +338,9 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
 	ts := k.TaskSet()
 	now := k.RealtimeClock().Now()
 	for _, tg := range ts.Root.ThreadGroups() {
-		pid := tg.PIDNamespace().IDOfThreadGroup(tg)
+		pidns := tg.PIDNamespace()
+		pid := pidns.IDOfThreadGroup(tg)
+
 		// If tg has already been reaped ignore it.
 		if pid == 0 {
 			continue
@@ -345,16 +351,19 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
 
 		ppid := kernel.ThreadID(0)
 		if p := tg.Leader().Parent(); p != nil {
-			ppid = p.PIDNamespace().IDOfThreadGroup(p.ThreadGroup())
+			ppid = pidns.IDOfThreadGroup(p.ThreadGroup())
 		}
+		threads := tg.MemberIDs(pidns)
 		*out = append(*out, &Process{
-			UID:   tg.Leader().Credentials().EffectiveKUID,
-			PID:   pid,
-			PPID:  ppid,
-			STime: formatStartTime(now, tg.Leader().StartTime()),
-			C:     percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
-			Time:  tg.CPUStats().SysTime.String(),
-			Cmd:   tg.Leader().Name(),
+			UID:     tg.Leader().Credentials().EffectiveKUID,
+			PID:     pid,
+			PPID:    ppid,
+			Threads: threads,
+			STime:   formatStartTime(now, tg.Leader().StartTime()),
+			C:       percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
+			Time:    tg.CPUStats().SysTime.String(),
+			Cmd:     tg.Leader().Name(),
+			TTY:     ttyName(tg.TTY()),
 		})
 	}
 	sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID })
@@ -395,3 +404,10 @@ func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 {
 	}
 	return int32(percentCPU)
 }
+
+func ttyName(tty *kernel.TTY) string {
+	if tty == nil {
+		return "?"
+	}
+	return fmt.Sprintf("pts/%d", tty.Index)
+}
diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go
index d8ada2694..0a88459b2 100644
--- a/pkg/sentry/control/proc_test.go
+++ b/pkg/sentry/control/proc_test.go
@@ -34,7 +34,7 @@ func TestProcessListTable(t *testing.T) {
 	}{
 		{
 			pl:       []*Process{},
-			expected: "UID       PID       PPID      C         STIME     TIME      CMD",
+			expected: "UID       PID       PPID      C         TTY       STIME     TIME      CMD",
 		},
 		{
 			pl: []*Process{
@@ -43,6 +43,7 @@ func TestProcessListTable(t *testing.T) {
 					PID:   0,
 					PPID:  0,
 					C:     0,
+					TTY:   "?",
 					STime: "0",
 					Time:  "0",
 					Cmd:   "zero",
@@ -52,14 +53,15 @@ func TestProcessListTable(t *testing.T) {
 					PID:   1,
 					PPID:  1,
 					C:     1,
+					TTY:   "pts/4",
 					STime: "1",
 					Time:  "1",
 					Cmd:   "one",
 				},
 			},
-			expected: `UID       PID       PPID      C         STIME     TIME      CMD
-0         0         0         0         0         0         zero
-1         1         1         1         1         1         one`,
+			expected: `UID       PID       PPID      C         TTY       STIME     TIME      CMD
+0         0         0         0         ?         0         0         zero
+1         1         1         1         pts/4     1         1         one`,
 		},
 	}
 
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index 8c17603f8..c09f3b71c 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -234,6 +234,8 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 	if err != nil {
 		return nil, err
 	}
+	// We're not going to use newFile after return.
+	defer newFile.close(ctx)
 
 	// Stabilize the endpoint map while creation is in progress.
 	unlock := i.session().endpoints.lock()
@@ -254,7 +256,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 	// Get the attributes of the file to create inode key.
 	qid, mask, attr, err := getattr(ctx, newFile)
 	if err != nil {
-		newFile.close(ctx)
 		return nil, err
 	}
 
@@ -270,7 +271,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 	// cloned and re-opened multiple times after creation.
 	_, unopened, err := i.fileState.file.walk(ctx, []string{name})
 	if err != nil {
-		newFile.close(ctx)
 		return nil, err
 	}
 
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 0da608548..4e358a46a 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -143,9 +143,9 @@ type session struct {
 	// socket files. This allows unix domain sockets to be used with paths that
 	// belong to a gofer.
 	//
-	// TODO(b/77154739): there are few possible races with someone stat'ing the
-	// file and another deleting it concurrently, where the file will not be
-	// reported as socket file.
+	// TODO(gvisor.dev/issue/1200): there are few possible races with someone
+	// stat'ing the file and another deleting it concurrently, where the file
+	// will not be reported as socket file.
 	endpoints *endpointMaps `state:"wait"`
 }
 
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 2d43dff1d..91e2fde2f 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -270,6 +270,14 @@ func (i *Inode) Getxattr(name string) (string, error) {
 	return i.InodeOperations.Getxattr(i, name)
 }
 
+// Setxattr calls i.InodeOperations.Setxattr with i as the Inode.
+func (i *Inode) Setxattr(name, value string) error {
+	if i.overlay != nil {
+		return overlaySetxattr(i.overlay, name, value)
+	}
+	return i.InodeOperations.Setxattr(i, name, value)
+}
+
 // Listxattr calls i.InodeOperations.Listxattr with i as the Inode.
 func (i *Inode) Listxattr() (map[string]struct{}, error) {
 	if i.overlay != nil {
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index a09147080..63a991beb 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -552,6 +552,11 @@ func overlayGetxattr(o *overlayEntry, name string) (string, error) {
 	return s, err
 }
 
+// TODO(b/146028302): Support setxattr for overlayfs.
+func overlaySetxattr(o *overlayEntry, name, value string) error {
+	return syserror.EOPNOTSUPP
+}
+
 func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) {
 	o.copyMu.RLock()
 	defer o.copyMu.RUnlock()
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 2a598149d..0e46c5fb7 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -67,29 +67,28 @@ type taskDir struct {
 var _ fs.InodeOperations = (*taskDir)(nil)
 
 // newTaskDir creates a new proc task entry.
-func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool) *fs.Inode {
+func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
 	contents := map[string]*fs.Inode{
-		"auxv":    newAuxvec(t, msrc),
-		"cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
-		"comm":    newComm(t, msrc),
-		"environ": newExecArgInode(t, msrc, environExecArg),
-		"exe":     newExe(t, msrc),
-		"fd":      newFdDir(t, msrc),
-		"fdinfo":  newFdInfoDir(t, msrc),
-		"gid_map": newGIDMap(t, msrc),
-		// FIXME(b/123511468): create the correct io file for threads.
-		"io":        newIO(t, msrc),
+		"auxv":      newAuxvec(t, msrc),
+		"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
+		"comm":      newComm(t, msrc),
+		"environ":   newExecArgInode(t, msrc, environExecArg),
+		"exe":       newExe(t, msrc),
+		"fd":        newFdDir(t, msrc),
+		"fdinfo":    newFdInfoDir(t, msrc),
+		"gid_map":   newGIDMap(t, msrc),
+		"io":        newIO(t, msrc, isThreadGroup),
 		"maps":      newMaps(t, msrc),
 		"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
 		"ns":        newNamespaceDir(t, msrc),
 		"smaps":     newSmaps(t, msrc),
-		"stat":      newTaskStat(t, msrc, showSubtasks, p.pidns),
+		"stat":      newTaskStat(t, msrc, isThreadGroup, p.pidns),
 		"statm":     newStatm(t, msrc),
 		"status":    newStatus(t, msrc, p.pidns),
 		"uid_map":   newUIDMap(t, msrc),
 	}
-	if showSubtasks {
+	if isThreadGroup {
 		contents["task"] = p.newSubtasks(t, msrc)
 	}
 	if len(p.cgroupControllers) > 0 {
@@ -619,8 +618,11 @@ type ioData struct {
 	ioUsage
 }
 
-func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
-	return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
+func newIO(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
+	if isThreadGroup {
+		return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
+	}
+	return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t}), msrc, fs.SpecialFile, t)
 }
 
 // NeedsUpdate returns whether the generation is old or not.
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ff8138820..917f90cc0 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -53,8 +53,8 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal
 		d:          d,
 		n:          n,
 		ld:         newLineDiscipline(termios),
-		masterKTTY: &kernel.TTY{},
-		slaveKTTY:  &kernel.TTY{},
+		masterKTTY: &kernel.TTY{Index: n},
+		slaveKTTY:  &kernel.TTY{Index: n},
 	}
 	t.EnableLeakCheck("tty.Terminal")
 	return &t
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 7ccff8b0d..880b7bcd3 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -38,6 +38,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/binary",
         "//pkg/fd",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 94cd74095..177ce2cb9 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -81,7 +81,11 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+	if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	}); err != nil {
 		b.Fatalf("failed to mount tmpfs submount: %v", err)
 	}
 	return func() {
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 307e4d68c..e9f756732 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -147,55 +147,54 @@ func TestSeek(t *testing.T) {
 				t.Fatalf("vfsfs.OpenAt failed: %v", err)
 			}
 
-			if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+			if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
 				t.Errorf("expected seek position 0, got %d and error %v", n, err)
 			}
 
-			stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+			stat, err := fd.Stat(ctx, vfs.StatOptions{})
 			if err != nil {
 				t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
 			}
 
 			// We should be able to seek beyond the end of file.
 			size := int64(stat.Size)
-			if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+			if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
 			}
 
 			// EINVAL should be returned if the resulting offset is negative.
-			if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+			if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
 				t.Errorf("expected error EINVAL but got %v", err)
 			}
 
-			if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+			if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
 			}
 
 			// Make sure negative offsets work with SEEK_CUR.
-			if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+			if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
 			}
 
 			// EINVAL should be returned if the resulting offset is negative.
-			if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+			if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
 				t.Errorf("expected error EINVAL but got %v", err)
 			}
 
 			// Make sure SEEK_END works with regular files.
-			switch fd.Impl().(type) {
-			case *regularFileFD:
+			if _, ok := fd.Impl().(*regularFileFD); ok {
 				// Seek back to 0.
-				if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+				if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
 					t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
 				}
 
 				// Seek forward beyond EOF.
-				if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+				if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
 					t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
 				}
 
 				// EINVAL should be returned if the resulting offset is negative.
-				if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+				if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
 					t.Errorf("expected error EINVAL but got %v", err)
 				}
 			}
@@ -456,7 +455,7 @@ func TestRead(t *testing.T) {
 			want := make([]byte, 1)
 			for {
 				n, err := f.Read(want)
-				fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
+				fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
 
 				if diff := cmp.Diff(got, want); diff != "" {
 					t.Errorf("file data mismatch (-want +got):\n%s", diff)
@@ -464,7 +463,7 @@ func TestRead(t *testing.T) {
 
 				// Make sure there is no more file data left after getting EOF.
 				if n == 0 || err == io.EOF {
-					if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+					if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
 						t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
 					}
 
@@ -574,7 +573,7 @@ func TestIterDirents(t *testing.T) {
 			}
 
 			cb := &iterDirentsCb{}
-			if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+			if err = fd.IterDirents(ctx, cb); err != nil {
 				t.Fatalf("dir fd.IterDirents() failed: %v", err)
 			}
 
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 2d15e8aaf..e7aa3b41b 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -20,6 +20,7 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -441,3 +442,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 
 	return syserror.EROFS
 }
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
new file mode 100644
index 000000000..52596c090
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -0,0 +1,60 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "slot_list",
+    out = "slot_list.go",
+    package = "kernfs",
+    prefix = "slot",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*slot",
+        "Linker": "*slot",
+    },
+)
+
+go_library(
+    name = "kernfs",
+    srcs = [
+        "dynamic_bytes_file.go",
+        "fd_impl_util.go",
+        "filesystem.go",
+        "inode_impl_util.go",
+        "kernfs.go",
+        "slot_list.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sentry/context",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "kernfs_test",
+    size = "small",
+    srcs = ["kernfs_test.go"],
+    deps = [
+        ":kernfs",
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
new file mode 100644
index 000000000..30c06baf0
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -0,0 +1,131 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DynamicBytesFile implements kernfs.Inode and represents a read-only
+// file whose contents are backed by a vfs.DynamicBytesSource.
+//
+// Must be initialized with Init before first use.
+type DynamicBytesFile struct {
+	InodeAttrs
+	InodeNoopRefCount
+	InodeNotDirectory
+	InodeNotSymlink
+
+	data vfs.DynamicBytesSource
+}
+
+// Init intializes a dynamic bytes file.
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource) {
+	f.InodeAttrs.Init(creds, ino, linux.ModeRegular|0444)
+	f.data = data
+}
+
+// Open implements Inode.Open.
+func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &DynamicBytesFD{}
+	fd.Init(rp.Mount(), vfsd, f.data, flags)
+	return &fd.vfsfd, nil
+}
+
+// SetStat implements Inode.SetStat.
+func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+	// DynamicBytesFiles are immutable.
+	return syserror.EPERM
+}
+
+// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a
+// DynamicBytesFile.
+//
+// Must be initialized with Init before first use.
+type DynamicBytesFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DynamicBytesFileDescriptionImpl
+
+	vfsfd vfs.FileDescription
+	inode Inode
+	flags uint32
+}
+
+// Init initializes a DynamicBytesFD.
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
+	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
+	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
+	fd.flags = flags
+	fd.inode = d.Impl().(*Dentry).inode
+	fd.SetDataSource(data)
+	fd.vfsfd.Init(fd, m, d)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
+}
+
+// Read implmenets vfs.FileDescriptionImpl.Read.
+func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
+}
+
+// PRead implmenets vfs.FileDescriptionImpl.PRead.
+func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.FileDescriptionDefaultImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.FileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *DynamicBytesFD) Release() {}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.Stat(fs), nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
+	// DynamicBytesFiles are immutable.
+	return syserror.EPERM
+}
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) {
+	return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error {
+	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+	// no-op.
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
new file mode 100644
index 000000000..d6c18937a
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -0,0 +1,207 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
+// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
+// compatible with dynamic directories.
+//
+// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
+// IterDirents callback. The IterDirents callback therefore cannot hash or
+// unhash children, or recursively call IterDirents on the same underlying
+// inode.
+//
+// Must be initialize with Init before first use.
+type GenericDirectoryFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	vfsfd    vfs.FileDescription
+	children *OrderedChildren
+	flags    uint32
+	off      int64
+}
+
+// Init initializes a GenericDirectoryFD.
+func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) {
+	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
+	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
+	fd.children = children
+	fd.flags = flags
+	fd.vfsfd.Init(fd, m, d)
+}
+
+// VFSFileDescription returns a pointer to the vfs.FileDescription representing
+// this object.
+func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription {
+	return &fd.vfsfd
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts)
+}
+
+// Read implmenets vfs.FileDescriptionImpl.Read.
+func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts)
+}
+
+// PRead implmenets vfs.FileDescriptionImpl.PRead.
+func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDecriptionImpl.Release.
+func (fd *GenericDirectoryFD) Release() {}
+
+func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *GenericDirectoryFD) inode() Inode {
+	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+}
+
+// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds
+// o.mu when calling cb.
+func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	vfsFS := fd.filesystem()
+	fs := vfsFS.Impl().(*Filesystem)
+	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	// Handle ".".
+	if fd.off == 0 {
+		stat := fd.inode().Stat(vfsFS)
+		dirent := vfs.Dirent{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     stat.Ino,
+			NextOff: 1,
+		}
+		if !cb.Handle(dirent) {
+			return nil
+		}
+		fd.off++
+	}
+
+	// Handle "..".
+	if fd.off == 1 {
+		parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode
+		stat := parentInode.Stat(vfsFS)
+		dirent := vfs.Dirent{
+			Name:    "..",
+			Type:    linux.FileMode(stat.Mode).DirentType(),
+			Ino:     stat.Ino,
+			NextOff: 2,
+		}
+		if !cb.Handle(dirent) {
+			return nil
+		}
+		fd.off++
+	}
+
+	// Handle static children.
+	fd.children.mu.RLock()
+	defer fd.children.mu.RUnlock()
+	// fd.off accounts for "." and "..", but fd.children do not track
+	// these.
+	childIdx := fd.off - 2
+	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
+		inode := it.Dentry.Impl().(*Dentry).inode
+		stat := inode.Stat(vfsFS)
+		dirent := vfs.Dirent{
+			Name:    it.Name,
+			Type:    linux.FileMode(stat.Mode).DirentType(),
+			Ino:     stat.Ino,
+			NextOff: fd.off + 1,
+		}
+		if !cb.Handle(dirent) {
+			return nil
+		}
+		fd.off++
+	}
+
+	return nil
+}
+
+// Seek implements vfs.FileDecriptionImpl.Seek.
+func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fs := fd.filesystem().Impl().(*Filesystem)
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) {
+	return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error {
+	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+	// no-op.
+	return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := fd.filesystem()
+	inode := fd.inode()
+	return inode.Stat(fs), nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	fs := fd.filesystem()
+	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+	return inode.SetStat(fs, opts)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
new file mode 100644
index 000000000..db486b6c1
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -0,0 +1,691 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file implements vfs.FilesystemImpl for kernfs.
+
+package kernfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// stepExistingLocked resolves rp.Component() in parent directory vfsd.
+//
+// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postcondition: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) {
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	// Directory searchable?
+	if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	d.dirMu.Lock()
+	nextVFSD, err := rp.ResolveComponent(vfsd)
+	d.dirMu.Unlock()
+	if err != nil {
+		return nil, err
+	}
+	if nextVFSD != nil {
+		// Cached dentry exists, revalidate.
+		next := nextVFSD.Impl().(*Dentry)
+		if !next.inode.Valid(ctx) {
+			d.dirMu.Lock()
+			rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD)
+			d.dirMu.Unlock()
+			fs.deferDecRef(nextVFSD) // Reference from Lookup.
+			nextVFSD = nil
+		}
+	}
+	if nextVFSD == nil {
+		// Dentry isn't cached; it either doesn't exist or failed
+		// revalidation. Attempt to resolve it via Lookup.
+		name := rp.Component()
+		var err error
+		nextVFSD, err = d.inode.Lookup(ctx, name)
+		// Reference on nextVFSD dropped by a corresponding Valid.
+		if err != nil {
+			return nil, err
+		}
+		d.InsertChild(name, nextVFSD)
+	}
+	next := nextVFSD.Impl().(*Dentry)
+
+	// Resolve any symlink at current path component.
+	if rp.ShouldFollowSymlink() && d.isSymlink() {
+		// TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
+		target, err := next.inode.Readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink
+
+	}
+	rp.Advance()
+	return nextVFSD, nil
+}
+
+// walkExistingLocked resolves rp to an existing file.
+//
+// walkExistingLocked is loosely analogous to Linux's
+// fs/namei.c:path_lookupat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+	vfsd := rp.Start()
+	for !rp.Done() {
+		var err error
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	d := vfsd.Impl().(*Dentry)
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, d.inode, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+	vfsd := rp.Start()
+	for !rp.Final() {
+		var err error
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, d.inode, nil
+}
+
+// checkCreateLocked checks that a file named rp.Component() may be created in
+// directory parentVFSD, then returns rp.Component().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
+// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
+	if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return "", err
+	}
+	pc := rp.Component()
+	if pc == "." || pc == ".." {
+		return "", syserror.EEXIST
+	}
+	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+	if err != nil {
+		return "", err
+	}
+	if childVFSD != nil {
+		return "", syserror.EEXIST
+	}
+	if parentVFSD.IsDisowned() {
+		return "", syserror.ENOENT
+	}
+	return pc, nil
+}
+
+// checkDeleteLocked checks that the file represented by vfsd may be deleted.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
+	parentVFSD := vfsd.Parent()
+	if parentVFSD == nil {
+		return syserror.EBUSY
+	}
+	if parentVFSD.IsDisowned() {
+		return syserror.ENOENT
+	}
+	if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	return nil
+}
+
+// checkRenameLocked checks that a rename operation may be performed on the
+// target dentry across the given set of parent directories. The target dentry
+// may be nil.
+//
+// Precondition: isDir(dstInode) == true.
+func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error {
+	srcDir := src.Parent()
+	if srcDir == nil {
+		return syserror.EBUSY
+	}
+	if srcDir.IsDisowned() {
+		return syserror.ENOENT
+	}
+	if dstDir.IsDisowned() {
+		return syserror.ENOENT
+	}
+	// Check for creation permissions on dst dir.
+	if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *Filesystem) Release() {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs()
+	defer fs.mu.RUnlock()
+	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts.CheckSearchable {
+		d := vfsd.Impl().(*Dentry)
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	vfsd.IncRef() // Ownership transferred to caller.
+	return vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if rp.Mount() != vd.Mount() {
+		return syserror.EXDEV
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	d := vd.Dentry().Impl().(*Dentry)
+	if d.isDir() {
+		return syserror.EPERM
+	}
+
+	child, err := parentInode.NewLink(ctx, pc, d.inode)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	return nil
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	child, err := parentInode.NewDir(ctx, pc, opts)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	return nil
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	new, err := parentInode.NewNode(ctx, pc, opts)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, new)
+	return nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Filter out flags that are not supported by kernfs. O_DIRECTORY and
+	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
+	// appropriate bits in rp), but are returned by
+	// FileDescriptionImpl.StatusFlags().
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+	ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+
+	// Do not create new file.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.processDeferredDecRefs()
+		defer fs.mu.RUnlock()
+		vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+		if err != nil {
+			return nil, err
+		}
+		if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+		return inode.Open(rp, vfsd, opts.Flags)
+	}
+
+	// May create new file.
+	mustCreate := opts.Flags&linux.O_EXCL != 0
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*Dentry).inode
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	if rp.Done() {
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+		return inode.Open(rp, vfsd, opts.Flags)
+	}
+afterTrailingSymlink:
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	pc := rp.Component()
+	if pc == "." || pc == ".." {
+		return nil, syserror.EISDIR
+	}
+	// Determine whether or not we need to create a file.
+	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+	if err != nil {
+		return nil, err
+	}
+	if childVFSD == nil {
+		// Already checked for searchability above; now check for writability.
+		if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		defer rp.Mount().EndWrite()
+		// Create and open the child.
+		child, err := parentInode.NewFile(ctx, pc, opts)
+		if err != nil {
+			return nil, err
+		}
+		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+		return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags)
+	}
+	// Open existing file or follow symlink.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	childDentry := childVFSD.Impl().(*Dentry)
+	childInode := childDentry.inode
+	if rp.ShouldFollowSymlink() {
+		if childDentry.isSymlink() {
+			target, err := childInode.Readlink(ctx)
+			if err != nil {
+				return nil, err
+			}
+			if err := rp.HandleSymlink(target); err != nil {
+				return nil, err
+			}
+			// rp.Final() may no longer be true since we now need to resolve the
+			// symlink target.
+			goto afterTrailingSymlink
+		}
+	}
+	if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+	return childInode.Open(rp, childVFSD, opts.Flags)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	d, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return "", err
+	}
+	if !d.Impl().(*Dentry).isSymlink() {
+		return "", syserror.EINVAL
+	}
+	return inode.Readlink(ctx)
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
+	exchange := opts.Flags&linux.RENAME_EXCHANGE != 0
+	whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0
+	if exchange && (noReplace || whiteout) {
+		// Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE.
+		return syserror.EINVAL
+	}
+	if exchange || whiteout {
+		// Exchange and Whiteout flags are not supported on kernfs.
+		return syserror.EINVAL
+	}
+
+	fs.mu.Lock()
+	defer fs.mu.Lock()
+
+	mnt := rp.Mount()
+	if mnt != vd.Mount() {
+		return syserror.EXDEV
+	}
+
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+
+	srcVFSD := vd.Dentry()
+	srcDirVFSD := srcVFSD.Parent()
+
+	// Can we remove the src dentry?
+	if err := checkDeleteLocked(rp, srcVFSD); err != nil {
+		return err
+	}
+
+	// Can we create the dst dentry?
+	var dstVFSD *vfs.Dentry
+	pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode)
+	switch err {
+	case nil:
+		// Ok, continue with rename as replacement.
+	case syserror.EEXIST:
+		if noReplace {
+			// Won't overwrite existing node since RENAME_NOREPLACE was requested.
+			return syserror.EEXIST
+		}
+		dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc)
+		if err != nil {
+			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+		}
+	default:
+		return err
+	}
+
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	virtfs := rp.VirtualFilesystem()
+
+	srcDirDentry := srcDirVFSD.Impl().(*Dentry)
+	dstDirDentry := dstDirVFSD.Impl().(*Dentry)
+
+	// We can't deadlock here due to lock ordering because we're protected from
+	// concurrent renames by fs.mu held for writing.
+	srcDirDentry.dirMu.Lock()
+	defer srcDirDentry.dirMu.Unlock()
+	dstDirDentry.dirMu.Lock()
+	defer dstDirDentry.dirMu.Unlock()
+
+	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
+		return err
+	}
+	srcDirInode := srcDirDentry.inode
+	replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD)
+	if err != nil {
+		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
+		return err
+	}
+	virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	if err := checkDeleteLocked(rp, vfsd); err != nil {
+		return err
+	}
+	if !vfsd.Impl().(*Dentry).isDir() {
+		return syserror.ENOTDIR
+	}
+	if inode.HasChildren() {
+		return syserror.ENOTEMPTY
+	}
+	virtfs := rp.VirtualFilesystem()
+	parentDentry := vfsd.Parent().Impl().(*Dentry)
+	parentDentry.dirMu.Lock()
+	defer parentDentry.dirMu.Unlock()
+	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+		return err
+	}
+	if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+		virtfs.AbortDeleteDentry(vfsd)
+		return err
+	}
+	virtfs.CommitDeleteDentry(vfsd)
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return err
+	}
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return inode.SetStat(fs.VFSFilesystem(), opts)
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	return inode.Stat(fs.VFSFilesystem()), nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	// TODO: actually implement statfs
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	child, err := parentInode.NewSymlink(ctx, pc, target)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+	return nil
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	vfsd, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	if err := checkDeleteLocked(rp, vfsd); err != nil {
+		return err
+	}
+	if vfsd.Impl().(*Dentry).isDir() {
+		return syserror.EISDIR
+	}
+	virtfs := rp.VirtualFilesystem()
+	parentDentry := vfsd.Parent().Impl().(*Dentry)
+	parentDentry.dirMu.Lock()
+	defer parentDentry.dirMu.Unlock()
+	if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+		return err
+	}
+	if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+		virtfs.AbortDeleteDentry(vfsd)
+		return err
+	}
+	virtfs.CommitDeleteDentry(vfsd)
+	return nil
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
new file mode 100644
index 000000000..7b45b702a
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -0,0 +1,492 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// InodeNoopRefCount partially implements the Inode interface, specifically the
+// inodeRefs sub interface. InodeNoopRefCount implements a simple reference
+// count for inodes, performing no extra actions when references are obtained or
+// released. This is suitable for simple file inodes that don't reference any
+// resources.
+type InodeNoopRefCount struct {
+}
+
+// IncRef implements Inode.IncRef.
+func (n *InodeNoopRefCount) IncRef() {
+}
+
+// DecRef implements Inode.DecRef.
+func (n *InodeNoopRefCount) DecRef() {
+}
+
+// TryIncRef implements Inode.TryIncRef.
+func (n *InodeNoopRefCount) TryIncRef() bool {
+	return true
+}
+
+// Destroy implements Inode.Destroy.
+func (n *InodeNoopRefCount) Destroy() {
+}
+
+// InodeDirectoryNoNewChildren partially implements the Inode interface.
+// InodeDirectoryNoNewChildren represents a directory inode which does not
+// support creation of new children.
+type InodeDirectoryNoNewChildren struct{}
+
+// NewFile implements Inode.NewFile.
+func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewLink implements Inode.NewLink.
+func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// InodeNotDirectory partially implements the Inode interface, specifically the
+// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
+// represent directories can embed this to provide no-op implementations for
+// directory-related functions.
+type InodeNotDirectory struct {
+}
+
+// HasChildren implements Inode.HasChildren.
+func (*InodeNotDirectory) HasChildren() bool {
+	return false
+}
+
+// NewFile implements Inode.NewFile.
+func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+	panic("NewFile called on non-directory inode")
+}
+
+// NewDir implements Inode.NewDir.
+func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+	panic("NewDir called on non-directory inode")
+}
+
+// NewLink implements Inode.NewLinkink.
+func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+	panic("NewLink called on non-directory inode")
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	panic("NewSymlink called on non-directory inode")
+}
+
+// NewNode implements Inode.NewNode.
+func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	panic("NewNode called on non-directory inode")
+}
+
+// Unlink implements Inode.Unlink.
+func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+	panic("Unlink called on non-directory inode")
+}
+
+// RmDir implements Inode.RmDir.
+func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+	panic("RmDir called on non-directory inode")
+}
+
+// Rename implements Inode.Rename.
+func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+	panic("Rename called on non-directory inode")
+}
+
+// Lookup implements Inode.Lookup.
+func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	panic("Lookup called on non-directory inode")
+}
+
+// Valid implements Inode.Valid.
+func (*InodeNotDirectory) Valid(context.Context) bool {
+	return true
+}
+
+// InodeNoDynamicLookup partially implements the Inode interface, specifically
+// the inodeDynamicLookup sub interface. Directory inodes that do not support
+// dymanic entries (i.e. entries that are not "hashed" into the
+// vfs.Dentry.children) can embed this to provide no-op implementations for
+// functions related to dynamic entries.
+type InodeNoDynamicLookup struct{}
+
+// Lookup implements Inode.Lookup.
+func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	return nil, syserror.ENOENT
+}
+
+// Valid implements Inode.Valid.
+func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
+	return true
+}
+
+// InodeNotSymlink partially implements the Inode interface, specifically the
+// inodeSymlink sub interface. All inodes that are not symlinks may embed this
+// to return the appropriate errors from symlink-related functions.
+type InodeNotSymlink struct{}
+
+// Readlink implements Inode.Readlink.
+func (*InodeNotSymlink) Readlink(context.Context) (string, error) {
+	return "", syserror.EINVAL
+}
+
+// InodeAttrs partially implements the Inode interface, specifically the
+// inodeMetadata sub interface. InodeAttrs provides functionality related to
+// inode attributes.
+//
+// Must be initialized by Init prior to first use.
+type InodeAttrs struct {
+	ino   uint64
+	mode  uint32
+	uid   uint32
+	gid   uint32
+	nlink uint32
+}
+
+// Init initializes this InodeAttrs.
+func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) {
+	if mode.FileType() == 0 {
+		panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
+	}
+
+	nlink := uint32(1)
+	if mode.FileType() == linux.ModeDirectory {
+		nlink = 2
+	}
+	atomic.StoreUint64(&a.ino, ino)
+	atomic.StoreUint32(&a.mode, uint32(mode))
+	atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
+	atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID))
+	atomic.StoreUint32(&a.nlink, nlink)
+}
+
+// Mode implements Inode.Mode.
+func (a *InodeAttrs) Mode() linux.FileMode {
+	return linux.FileMode(atomic.LoadUint32(&a.mode))
+}
+
+// Stat partially implements Inode.Stat. Note that this function doesn't provide
+// all the stat fields, and the embedder should consider extending the result
+// with filesystem-specific fields.
+func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx {
+	var stat linux.Statx
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+	stat.Ino = atomic.LoadUint64(&a.ino)
+	stat.Mode = uint16(a.Mode())
+	stat.UID = atomic.LoadUint32(&a.uid)
+	stat.GID = atomic.LoadUint32(&a.gid)
+	stat.Nlink = atomic.LoadUint32(&a.nlink)
+
+	// TODO: Implement other stat fields like timestamps.
+
+	return stat
+}
+
+// SetStat implements Inode.SetStat.
+func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+	stat := opts.Stat
+	if stat.Mask&linux.STATX_MODE != 0 {
+		for {
+			old := atomic.LoadUint32(&a.mode)
+			new := old | uint32(stat.Mode & ^uint16(linux.S_IFMT))
+			if swapped := atomic.CompareAndSwapUint32(&a.mode, old, new); swapped {
+				break
+			}
+		}
+	}
+
+	if stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&a.uid, stat.UID)
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&a.gid, stat.GID)
+	}
+
+	// Note that not all fields are modifiable. For example, the file type and
+	// inode numbers are immutable after node creation.
+
+	// TODO: Implement other stat fields like timestamps.
+
+	return nil
+}
+
+// CheckPermissions implements Inode.CheckPermissions.
+func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := a.Mode()
+	return vfs.GenericCheckPermissions(
+		creds,
+		ats,
+		mode.FileType() == linux.ModeDirectory,
+		uint16(mode),
+		auth.KUID(atomic.LoadUint32(&a.uid)),
+		auth.KGID(atomic.LoadUint32(&a.gid)),
+	)
+}
+
+// IncLinks implements Inode.IncLinks.
+func (a *InodeAttrs) IncLinks(n uint32) {
+	if atomic.AddUint32(&a.nlink, n) <= n {
+		panic("InodeLink.IncLinks called with no existing links")
+	}
+}
+
+// DecLinks implements Inode.DecLinks.
+func (a *InodeAttrs) DecLinks() {
+	if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) {
+		// Negative overflow
+		panic("Inode.DecLinks called at 0 links")
+	}
+}
+
+type slot struct {
+	Name   string
+	Dentry *vfs.Dentry
+	slotEntry
+}
+
+// OrderedChildrenOptions contains initialization options for OrderedChildren.
+type OrderedChildrenOptions struct {
+	// Writable indicates whether vfs.FilesystemImpl methods implemented by
+	// OrderedChildren may modify the tracked children. This applies to
+	// operations related to rename, unlink and rmdir. If an OrderedChildren is
+	// not writable, these operations all fail with EPERM.
+	Writable bool
+}
+
+// OrderedChildren partially implements the Inode interface. OrderedChildren can
+// be embedded in directory inodes to keep track of the children in the
+// directory, and can then be used to implement a generic directory FD -- see
+// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
+// directories.
+//
+// Must be initialize with Init before first use.
+type OrderedChildren struct {
+	refs.AtomicRefCount
+
+	// Can children be modified by user syscalls? It set to false, interface
+	// methods that would modify the children return EPERM. Immutable.
+	writable bool
+
+	mu    sync.RWMutex
+	order slotList
+	set   map[string]*slot
+}
+
+// Init initializes an OrderedChildren.
+func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
+	o.writable = opts.Writable
+	o.set = make(map[string]*slot)
+}
+
+// DecRef implements Inode.DecRef.
+func (o *OrderedChildren) DecRef() {
+	o.AtomicRefCount.DecRefWithDestructor(o.Destroy)
+}
+
+// Destroy cleans up resources referenced by this OrderedChildren.
+func (o *OrderedChildren) Destroy() {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	o.order.Reset()
+	o.set = nil
+}
+
+// Populate inserts children into this OrderedChildren, and d's dentry
+// cache. Populate returns the number of directories inserted, which the caller
+// may use to update the link count for the parent directory.
+//
+// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory
+// inode. children must not contain any conflicting entries already in o.
+func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+	var links uint32
+	for name, child := range children {
+		if child.isDir() {
+			links++
+		}
+		if err := o.Insert(name, child.VFSDentry()); err != nil {
+			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+		}
+		d.InsertChild(name, child.VFSDentry())
+	}
+	return links
+}
+
+// HasChildren implements Inode.HasChildren.
+func (o *OrderedChildren) HasChildren() bool {
+	o.mu.RLock()
+	defer o.mu.RUnlock()
+	return len(o.set) > 0
+}
+
+// Insert inserts child into o. This ignores the writability of o, as this is
+// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if _, ok := o.set[name]; ok {
+		return syserror.EEXIST
+	}
+	s := &slot{
+		Name:   name,
+		Dentry: child,
+	}
+	o.order.PushBack(s)
+	o.set[name] = s
+	return nil
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) removeLocked(name string) {
+	if s, ok := o.set[name]; ok {
+		delete(o.set, name)
+		o.order.Remove(s)
+	}
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+	if s, ok := o.set[name]; ok {
+		// Existing slot with given name, simply replace the dentry.
+		var old *vfs.Dentry
+		old, s.Dentry = s.Dentry, new
+		return old
+	}
+
+	// No existing slot with given name, create and hash new slot.
+	s := &slot{
+		Name:   name,
+		Dentry: new,
+	}
+	o.order.PushBack(s)
+	o.set[name] = s
+	return nil
+}
+
+// Precondition: caller must hold o.mu for reading or writing.
+func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+	s, ok := o.set[name]
+	if !ok {
+		return syserror.ENOENT
+	}
+	if s.Dentry != child {
+		panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+	}
+	return nil
+}
+
+// Unlink implements Inode.Unlink.
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+	if !o.writable {
+		return syserror.EPERM
+	}
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if err := o.checkExistingLocked(name, child); err != nil {
+		return err
+	}
+	o.removeLocked(name)
+	return nil
+}
+
+// Rmdir implements Inode.Rmdir.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+	// We're not responsible for checking that child is a directory, that it's
+	// empty, or updating any link counts; so this is the same as unlink.
+	return o.Unlink(ctx, name, child)
+}
+
+type renameAcrossDifferentImplementationsError struct{}
+
+func (renameAcrossDifferentImplementationsError) Error() string {
+	return "rename across inodes with different implementations"
+}
+
+// Rename implements Inode.Rename.
+//
+// Precondition: Rename may only be called across two directory inodes with
+// identical implementations of Rename. Practically, this means filesystems that
+// implement Rename by embedding OrderedChildren for any directory
+// implementation must use OrderedChildren for all directory implementations
+// that will support Rename.
+//
+// Postcondition: reference on any replaced dentry transferred to caller.
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
+	dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+	if !ok {
+		return nil, renameAcrossDifferentImplementationsError{}
+	}
+	if !o.writable || !dst.writable {
+		return nil, syserror.EPERM
+	}
+	// Note: There's a potential deadlock below if concurrent calls to Rename
+	// refer to the same src and dst directories in reverse. We avoid any
+	// ordering issues because the caller is required to serialize concurrent
+	// calls to Rename in accordance with the interface declaration.
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if dst != o {
+		dst.mu.Lock()
+		defer dst.mu.Unlock()
+	}
+	if err := o.checkExistingLocked(oldname, child); err != nil {
+		return nil, err
+	}
+	replaced := dst.replaceChildLocked(newname, child)
+	return replaced, nil
+}
+
+// nthLocked returns an iterator to the nth child tracked by this object. The
+// iterator is valid until the caller releases o.mu. Returns nil if the
+// requested index falls out of bounds.
+//
+// Preconditon: Caller must hold o.mu for reading.
+func (o *OrderedChildren) nthLocked(i int64) *slot {
+	for it := o.order.Front(); it != nil && i >= 0; it = it.Next() {
+		if i == 0 {
+			return it
+		}
+		i--
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
new file mode 100644
index 000000000..bb01c3d01
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -0,0 +1,405 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kernfs provides the tools to implement inode-based filesystems.
+// Kernfs has two main features:
+//
+// 1. The Inode interface, which maps VFS2's path-based filesystem operations to
+//    specific filesystem nodes. Kernfs uses the Inode interface to provide a
+//    blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as
+//    the synchronization mechanism for all filesystem operations by holding a
+//    filesystem-wide lock across all operations.
+//
+// 2. Various utility types which provide generic implementations for various
+//    parts of the Inode and vfs.FileDescription interfaces. Client filesystems
+//    based on kernfs can embed the appropriate set of these to avoid having to
+//    reimplement common filesystem operations. See inode_impl_util.go and
+//    fd_impl_util.go.
+//
+// Reference Model:
+//
+// Kernfs dentries represents named pointers to inodes. Dentries and inode have
+// independent lifetimes and reference counts. A child dentry unconditionally
+// holds a reference on its parent directory's dentry. A dentry also holds a
+// reference on the inode it points to. Multiple dentries can point to the same
+// inode (for example, in the case of hardlinks). File descriptors hold a
+// reference to the dentry they're opened on.
+//
+// Dentries are guaranteed to exist while holding Filesystem.mu for
+// reading. Dropping dentries require holding Filesystem.mu for writing. To
+// queue dentries for destruction from a read critical section, see
+// Filesystem.deferDecRef.
+//
+// Lock ordering:
+//
+// kernfs.Filesystem.mu
+//   kernfs.Dentry.dirMu
+//     vfs.VirtualFilesystem.mountMu
+//       vfs.Dentry.mu
+//   kernfs.Filesystem.droppedDentriesMu
+//   (inode implementation locks, if any)
+package kernfs
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
+// filesystem. Concrete implementations are expected to embed this in their own
+// Filesystem type.
+type Filesystem struct {
+	vfsfs vfs.Filesystem
+
+	droppedDentriesMu sync.Mutex
+
+	// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
+	// used to defer dentry destruction until mu can be acquired for
+	// writing. Protected by droppedDentriesMu.
+	droppedDentries []*vfs.Dentry
+
+	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
+	// for reading guarantees continued existence of any resolved dentries, but
+	// the dentry tree may be modified.
+	//
+	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
+	// example:
+	//
+	//   fs.mu.Lock()
+	//   defer fs.mu.Unlock()
+	//   ...
+	//   dentry1.DecRef()
+	//   defer dentry2.DecRef() // Ok, will run before Unlock.
+	//
+	// If discarding dentries in a read context, use Filesystem.deferDecRef. For
+	// example:
+	//
+	//   fs.mu.RLock()
+	//   fs.mu.processDeferredDecRefs()
+	//   defer fs.mu.RUnlock()
+	//   ...
+	//   fs.deferDecRef(dentry)
+	mu sync.RWMutex
+
+	// nextInoMinusOne is used to to allocate inode numbers on this
+	// filesystem. Must be accessed by atomic operations.
+	nextInoMinusOne uint64
+}
+
+// deferDecRef defers dropping a dentry ref until the next call to
+// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
+//
+// Precondition: d must not already be pending destruction.
+func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+	fs.droppedDentriesMu.Lock()
+	fs.droppedDentries = append(fs.droppedDentries, d)
+	fs.droppedDentriesMu.Unlock()
+}
+
+// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
+// droppedDentries list. See comment on Filesystem.mu.
+func (fs *Filesystem) processDeferredDecRefs() {
+	fs.mu.Lock()
+	fs.processDeferredDecRefsLocked()
+	fs.mu.Unlock()
+}
+
+// Precondition: fs.mu must be held for writing.
+func (fs *Filesystem) processDeferredDecRefsLocked() {
+	fs.droppedDentriesMu.Lock()
+	for _, d := range fs.droppedDentries {
+		d.DecRef()
+	}
+	fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
+	fs.droppedDentriesMu.Unlock()
+}
+
+// Init initializes a kernfs filesystem. This should be called from during
+// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding
+// kernfs.
+func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) {
+	fs.vfsfs.Init(vfsObj, fs)
+}
+
+// VFSFilesystem returns the generic vfs filesystem object.
+func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
+	return &fs.vfsfs
+}
+
+// NextIno allocates a new inode number on this filesystem.
+func (fs *Filesystem) NextIno() uint64 {
+	return atomic.AddUint64(&fs.nextInoMinusOne, 1)
+}
+
+// These consts are used in the Dentry.flags field.
+const (
+	// Dentry points to a directory inode.
+	dflagsIsDir = 1 << iota
+
+	// Dentry points to a symlink inode.
+	dflagsIsSymlink
+)
+
+// Dentry implements vfs.DentryImpl.
+//
+// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
+// named reference to an inode. A dentry generally lives as long as it's part of
+// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
+// to them are removed. Dentries hold a single reference to the inode they point
+// to, and child dentries hold a reference on their parent.
+//
+// Must be initialized by Init prior to first use.
+type Dentry struct {
+	refs.AtomicRefCount
+
+	vfsd  vfs.Dentry
+	inode Inode
+
+	refs uint64
+
+	// flags caches useful information about the dentry from the inode. See the
+	// dflags* consts above. Must be accessed by atomic ops.
+	flags uint32
+
+	// dirMu protects vfsd.children for directory dentries.
+	dirMu sync.Mutex
+}
+
+// Init initializes this dentry.
+//
+// Precondition: Caller must hold a reference on inode.
+//
+// Postcondition: Caller's reference on inode is transferred to the dentry.
+func (d *Dentry) Init(inode Inode) {
+	d.vfsd.Init(d)
+	d.inode = inode
+	ftype := inode.Mode().FileType()
+	if ftype == linux.ModeDirectory {
+		d.flags |= dflagsIsDir
+	}
+	if ftype == linux.ModeSymlink {
+		d.flags |= dflagsIsSymlink
+	}
+}
+
+// VFSDentry returns the generic vfs dentry for this kernfs dentry.
+func (d *Dentry) VFSDentry() *vfs.Dentry {
+	return &d.vfsd
+}
+
+// isDir checks whether the dentry points to a directory inode.
+func (d *Dentry) isDir() bool {
+	return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0
+}
+
+// isSymlink checks whether the dentry points to a symlink inode.
+func (d *Dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *Dentry) DecRef() {
+	d.AtomicRefCount.DecRefWithDestructor(d.destroy)
+}
+
+// Precondition: Dentry must be removed from VFS' dentry cache.
+func (d *Dentry) destroy() {
+	d.inode.DecRef() // IncRef from Init.
+	d.inode = nil
+	if parent := d.vfsd.Parent(); parent != nil {
+		parent.DecRef() // IncRef from Dentry.InsertChild.
+	}
+}
+
+// InsertChild inserts child into the vfs dentry cache with the given name under
+// this dentry. This does not update the directory inode, so calling this on
+// it's own isn't sufficient to insert a child into a directory. InsertChild
+// updates the link count on d if required.
+//
+// Precondition: d must represent a directory inode.
+func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
+	if !d.isDir() {
+		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+	}
+	vfsDentry := d.VFSDentry()
+	vfsDentry.IncRef() // DecRef in child's Dentry.destroy.
+	d.dirMu.Lock()
+	vfsDentry.InsertChild(child, name)
+	d.dirMu.Unlock()
+}
+
+// The Inode interface maps filesystem-level operations that operate on paths to
+// equivalent operations on specific filesystem nodes.
+//
+// The interface methods are groups into logical categories as sub interfaces
+// below. Generally, an implementation for each sub interface can be provided by
+// embedding an appropriate type from inode_impl_utils.go. The sub interfaces
+// are purely organizational. Methods declared directly in the main interface
+// have no generic implementations, and should be explicitly provided by the
+// client filesystem.
+//
+// Generally, implementations are not responsible for tasks that are common to
+// all filesystems. These include:
+//
+// - Checking that dentries passed to methods are of the appropriate file type.
+// - Checking permissions.
+// - Updating link and reference counts.
+//
+// Specific responsibilities of implementations are documented below.
+type Inode interface {
+	// Methods related to reference counting. A generic implementation is
+	// provided by InodeNoopRefCount. These methods are generally called by the
+	// equivalent Dentry methods.
+	inodeRefs
+
+	// Methods related to node metadata. A generic implementation is provided by
+	// InodeAttrs.
+	inodeMetadata
+
+	// Method for inodes that represent symlink. InodeNotSymlink provides a
+	// blanket implementation for all non-symlink inodes.
+	inodeSymlink
+
+	// Method for inodes that represent directories. InodeNotDirectory provides
+	// a blanket implementation for all non-directory inodes.
+	inodeDirectory
+
+	// Method for inodes that represent dynamic directories and their
+	// children. InodeNoDynamicLookup provides a blanket implementation for all
+	// non-dynamic-directory inodes.
+	inodeDynamicLookup
+
+	// Open creates a file description for the filesystem object represented by
+	// this inode. The returned file description should hold a reference on the
+	// inode for its lifetime.
+	//
+	// Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
+	Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error)
+}
+
+type inodeRefs interface {
+	IncRef()
+	DecRef()
+	TryIncRef() bool
+	// Destroy is called when the inode reaches zero references. Destroy release
+	// all resources (references) on objects referenced by the inode, including
+	// any child dentries.
+	Destroy()
+}
+
+type inodeMetadata interface {
+	// CheckPermissions checks that creds may access this inode for the
+	// requested access type, per the the rules of
+	// fs/namei.c:generic_permission().
+	CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error
+
+	// Mode returns the (struct stat)::st_mode value for this inode. This is
+	// separated from Stat for performance.
+	Mode() linux.FileMode
+
+	// Stat returns the metadata for this inode. This corresponds to
+	// vfs.FilesystemImpl.StatAt.
+	Stat(fs *vfs.Filesystem) linux.Statx
+
+	// SetStat updates the metadata for this inode. This corresponds to
+	// vfs.FilesystemImpl.SetStatAt.
+	SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error
+}
+
+// Precondition: All methods in this interface may only be called on directory
+// inodes.
+type inodeDirectory interface {
+	// The New{File,Dir,Node,Symlink} methods below should return a new inode
+	// hashed into this inode.
+	//
+	// These inode constructors are inode-level operations rather than
+	// filesystem-level operations to allow client filesystems to mix different
+	// implementations based on the new node's location in the
+	// filesystem.
+
+	// HasChildren returns true if the directory inode has any children.
+	HasChildren() bool
+
+	// NewFile creates a new regular file inode.
+	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+
+	// NewDir creates a new directory inode.
+	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+
+	// NewLink creates a new hardlink to a specified inode in this
+	// directory. Implementations should create a new kernfs Dentry pointing to
+	// target, and update target's link count.
+	NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+
+	// NewSymlink creates a new symbolic link inode.
+	NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+
+	// NewNode creates a new filesystem node for a mknod syscall.
+	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+
+	// Unlink removes a child dentry from this directory inode.
+	Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+
+	// RmDir removes an empty child directory from this directory
+	// inode. Implementations must update the parent directory's link count,
+	// if required. Implementations are not responsible for checking that child
+	// is a directory, checking for an empty directory.
+	RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+
+	// Rename is called on the source directory containing an inode being
+	// renamed. child should point to the resolved child in the source
+	// directory. If Rename replaces a dentry in the destination directory, it
+	// should return the replaced dentry or nil otherwise.
+	//
+	// Precondition: Caller must serialize concurrent calls to Rename.
+	Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
+}
+
+type inodeDynamicLookup interface {
+	// Lookup should return an appropriate dentry if name should resolve to a
+	// child of this dynamic directory inode. This gives the directory an
+	// opportunity on every lookup to resolve additional entries that aren't
+	// hashed into the directory. This is only called when the inode is a
+	// directory. If the inode is not a directory, or if the directory only
+	// contains a static set of children, the implementer can unconditionally
+	// return an appropriate error (ENOTDIR and ENOENT respectively).
+	//
+	// The child returned by Lookup will be hashed into the VFS dentry tree. Its
+	// lifetime can be controlled by the filesystem implementation with an
+	// appropriate implementation of Valid.
+	//
+	// Lookup returns the child with an extra reference and the caller owns this
+	// reference.
+	Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
+
+	// Valid should return true if this inode is still valid, or needs to
+	// be resolved again by a call to Lookup.
+	Valid(ctx context.Context) bool
+}
+
+type inodeSymlink interface {
+	// Readlink resolves the target of a symbolic link. If an inode is not a
+	// symlink, the implementation should return EINVAL.
+	Readlink(ctx context.Context) (string, error)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
new file mode 100644
index 000000000..f78bb7b04
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -0,0 +1,423 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs_test
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"runtime"
+	"sync"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const defaultMode linux.FileMode = 01777
+const staticFileContent = "This is sample content for a static test file."
+
+// RootDentryFn is a generator function for creating the root dentry of a test
+// filesystem. See newTestSystem.
+type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+
+// TestSystem represents the context for a single test.
+type TestSystem struct {
+	t     *testing.T
+	ctx   context.Context
+	creds *auth.Credentials
+	vfs   *vfs.VirtualFilesystem
+	mns   *vfs.MountNamespace
+	root  vfs.VirtualDentry
+}
+
+// newTestSystem sets up a minimal environment for running a test, including an
+// instance of a test filesystem. Tests can control the contents of the
+// filesystem by providing an appropriate rootFn, which should return a
+// pre-populated root dentry.
+func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+	v := vfs.New()
+	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn})
+	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create testfs root mount: %v", err)
+	}
+
+	s := &TestSystem{
+		t:     t,
+		ctx:   ctx,
+		creds: creds,
+		vfs:   v,
+		mns:   mns,
+		root:  mns.Root(),
+	}
+	runtime.SetFinalizer(s, func(s *TestSystem) { s.root.DecRef() })
+	return s
+}
+
+// PathOpAtRoot constructs a vfs.PathOperation for a path from the
+// root of the test filesystem.
+//
+// Precondition: path should be relative path.
+func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation {
+	return vfs.PathOperation{
+		Root:     s.root,
+		Start:    s.root,
+		Pathname: path,
+	}
+}
+
+// GetDentryOrDie attempts to resolve a dentry referred to by the
+// provided path operation. If unsuccessful, the test fails.
+func (s *TestSystem) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry {
+	vd, err := s.vfs.GetDentryAt(s.ctx, s.creds, &pop, &vfs.GetDentryOptions{})
+	if err != nil {
+		s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err)
+	}
+	return vd
+}
+
+func (s *TestSystem) ReadToEnd(fd *vfs.FileDescription) (string, error) {
+	buf := make([]byte, usermem.PageSize)
+	bufIOSeq := usermem.BytesIOSequence(buf)
+	opts := vfs.ReadOptions{}
+
+	var content bytes.Buffer
+	for {
+		n, err := fd.Impl().Read(s.ctx, bufIOSeq, opts)
+		if n == 0 || err != nil {
+			if err == io.EOF {
+				err = nil
+			}
+			return content.String(), err
+		}
+		content.Write(buf[:n])
+	}
+}
+
+type fsType struct {
+	rootFn RootDentryFn
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+type file struct {
+	kernfs.DynamicBytesFile
+	content string
+}
+
+func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+	f := &file{}
+	f.content = content
+	f.DynamicBytesFile.Init(creds, fs.NextIno(), f)
+
+	d := &kernfs.Dentry{}
+	d.Init(f)
+	return d
+}
+
+func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%s", f.content)
+	return nil
+}
+
+type attrs struct {
+	kernfs.InodeAttrs
+}
+
+func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+type readonlyDir struct {
+	attrs
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeDirectoryNoNewChildren
+
+	kernfs.OrderedChildren
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &readonlyDir{}
+	dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	dir.dentry.Init(dir)
+
+	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+	return &dir.dentry
+}
+
+func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
+
+type dir struct {
+	attrs
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoDynamicLookup
+
+	fs     *filesystem
+	dentry kernfs.Dentry
+	kernfs.OrderedChildren
+}
+
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &dir{}
+	dir.fs = fs
+	dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+	dir.dentry.Init(dir)
+
+	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+	return &dir.dentry
+}
+
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	fd := &kernfs.GenericDirectoryFD{}
+	fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+	return fd.VFSFileDescription(), nil
+}
+
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	dir := d.fs.newDir(creds, opts.Mode, nil)
+	dirVFSD := dir.VFSDentry()
+	if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+		dir.DecRef()
+		return nil, err
+	}
+	d.IncLinks(1)
+	return dirVFSD, nil
+}
+
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	f := d.fs.newFile(creds, "")
+	fVFSD := f.VFSDentry()
+	if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+		f.DecRef()
+		return nil, err
+	}
+	return fVFSD, nil
+}
+
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fs := &filesystem{}
+	fs.Init(vfsObj)
+	root := fst.rootFn(creds, fs)
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// -------------------- Remainder of the file are test cases --------------------
+
+func TestBasic(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+	sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef()
+}
+
+func TestMkdirGetDentry(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"dir1": fs.newDir(creds, 0755, nil),
+		})
+	})
+
+	pop := sys.PathOpAtRoot("dir1/a new directory")
+	if err := sys.vfs.MkdirAt(sys.ctx, sys.creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
+		t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
+	}
+	sys.GetDentryOrDie(pop).DecRef()
+}
+
+func TestReadStaticFile(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+
+	pop := sys.PathOpAtRoot("file1")
+	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	if err != nil {
+		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef()
+
+	content, err := sys.ReadToEnd(fd)
+	if err != nil {
+		sys.t.Fatalf("Read failed: %v", err)
+	}
+	if diff := cmp.Diff(staticFileContent, content); diff != "" {
+		sys.t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
+	}
+}
+
+func TestCreateNewFileInStaticDir(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"dir1": fs.newDir(creds, 0755, nil),
+		})
+	})
+
+	pop := sys.PathOpAtRoot("dir1/newfile")
+	opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode}
+	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, opts)
+	if err != nil {
+		sys.t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err)
+	}
+
+	// Close the file. The file should persist.
+	fd.DecRef()
+
+	fd, err = sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	if err != nil {
+		sys.t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
+	}
+	fd.DecRef()
+}
+
+// direntCollector provides an implementation for vfs.IterDirentsCallback for
+// testing. It simply iterates to the end of a given directory FD and collects
+// all dirents emitted by the callback.
+type direntCollector struct {
+	mu      sync.Mutex
+	dirents map[string]vfs.Dirent
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (d *direntCollector) Handle(dirent vfs.Dirent) bool {
+	d.mu.Lock()
+	if d.dirents == nil {
+		d.dirents = make(map[string]vfs.Dirent)
+	}
+	d.dirents[dirent.Name] = dirent
+	d.mu.Unlock()
+	return true
+}
+
+// count returns the number of dirents currently in the collector.
+func (d *direntCollector) count() int {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return len(d.dirents)
+}
+
+// contains checks whether the collector has a dirent with the given name and
+// type.
+func (d *direntCollector) contains(name string, typ uint8) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	dirent, ok := d.dirents[name]
+	if !ok {
+		return fmt.Errorf("No dirent named %q found", name)
+	}
+	if dirent.Type != typ {
+		return fmt.Errorf("Dirent named %q found, but was expecting type %d, got: %+v", name, typ, dirent)
+	}
+	return nil
+}
+
+func TestDirFDReadWrite(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, nil)
+	})
+
+	pop := sys.PathOpAtRoot("/")
+	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	if err != nil {
+		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef()
+
+	// Read/Write should fail for directory FDs.
+	if _, err := fd.Read(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
+		sys.t.Fatalf("Read for directory FD failed with unexpected error: %v", err)
+	}
+	if _, err := fd.Write(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR {
+		sys.t.Fatalf("Wrire for directory FD failed with unexpected error: %v", err)
+	}
+}
+
+func TestDirFDIterDirents(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			// Fill root with nodes backed by various inode implementations.
+			"dir1": fs.newReadonlyDir(creds, 0755, nil),
+			"dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
+				"dir3": fs.newDir(creds, 0755, nil),
+			}),
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+
+	pop := sys.PathOpAtRoot("/")
+	fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+	if err != nil {
+		sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef()
+
+	collector := &direntCollector{}
+	if err := fd.IterDirents(sys.ctx, collector); err != nil {
+		sys.t.Fatalf("IterDirent failed: %v", err)
+	}
+
+	// Root directory should contain ".", ".." and 3 children:
+	if collector.count() != 5 {
+		sys.t.Fatalf("IterDirent returned too many dirents")
+	}
+	for _, dirName := range []string{".", "..", "dir1", "dir2"} {
+		if err := collector.contains(dirName, linux.DT_DIR); err != nil {
+			sys.t.Fatalf("IterDirent had unexpected results: %v", err)
+		}
+	}
+	if err := collector.contains("file1", linux.DT_REG); err != nil {
+		sys.t.Fatalf("IterDirent had unexpected results: %v", err)
+	}
+
+}
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index bc5c0b591..0cc751eb8 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -32,6 +32,7 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/fspath",
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index ea6417ce7..4a7a94a52 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -394,7 +394,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef()
 			// Create and mount the submount.
-			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil {
+			if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 08a9cb8ef..1f2a5122a 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -19,6 +19,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -582,3 +583,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	inode.decLinksLocked()
 	return nil
 }
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index a3a870571..5bf527c80 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -194,7 +194,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
 	readData := make([]byte, 1)
 	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
 	if err != syserror.ErrWouldBlock {
 		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
 	}
@@ -207,7 +207,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
 func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
 	writeData := []byte(msg)
 	src := usermem.BytesIOSequence(writeData)
-	bytesWritten, err := fd.Impl().Write(ctx, src, vfs.WriteOptions{})
+	bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
 	if err != nil {
 		t.Fatalf("error writing to pipe %q: %v", fileName, err)
 	}
@@ -220,7 +220,7 @@ func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg
 func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
 	readData := make([]byte, len(msg))
 	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
 	if err != nil {
 		t.Fatalf("error reading from pipe %q: %v", fileName, err)
 	}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 28ba950bd..bd3fb4c03 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -841,9 +841,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
 		ContainerID:             args.ContainerID,
 	}
-	if _, err := k.tasks.NewTask(config); err != nil {
+	t, err := k.tasks.NewTask(config)
+	if err != nil {
 		return nil, 0, err
 	}
+	t.traceExecEvent(tc) // Simulate exec for tracing.
 
 	// Success.
 	tgid := k.tasks.Root.IDOfThreadGroup(tg)
@@ -1118,6 +1120,22 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
 	return lastErr
 }
 
+// RebuildTraceContexts rebuilds the trace context for all tasks.
+//
+// Unfortunately, if these are built while tracing is not enabled, then we will
+// not have meaningful trace data. Rebuilding here ensures that we can do so
+// after tracing has been enabled.
+func (k *Kernel) RebuildTraceContexts() {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+
+	for t, tid := range k.tasks.Root.tids {
+		t.rebuildTraceContext(tid)
+	}
+}
+
 // FeatureSet returns the FeatureSet.
 func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
 	return k.featureSet
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 93fe68a3e..de9617e9d 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred
 		return syserror.ERANGE
 	}
 
-	// TODO(b/29354920): Clear undo entries in all processes
+	// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
 	sem.value = val
 	sem.pid = pid
 	s.changeTime = ktime.NowFromContext(ctx)
@@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
 	for i, val := range vals {
 		sem := &s.sems[i]
 
-		// TODO(b/29354920): Clear undo entries in all processes
+		// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
 		sem.value = int16(val)
 		sem.pid = pid
 		sem.wakeWaiters()
@@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch
 	}
 
 	// All operations succeeded, apply them.
-	// TODO(b/29354920): handle undo operations.
+	// TODO(gvisor.dev/issue/137): handle undo operations.
 	for i, v := range tmpVals {
 		s.sems[i].value = v
 		s.sems[i].wakeWaiters()
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 220fa73a2..2fdee0282 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -339,6 +339,14 @@ func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
 	return nil
 }
 
+// LookupName looks up a syscall name.
+func (s *SyscallTable) LookupName(sysno uintptr) string {
+	if sc, ok := s.Table[sysno]; ok {
+		return sc.Name
+	}
+	return fmt.Sprintf("sys_%d", sysno) // Unlikely.
+}
+
 // LookupEmulate looks up an emulation syscall number.
 func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
 	sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 80c8e5464..ab0c6c4aa 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -15,6 +15,8 @@
 package kernel
 
 import (
+	gocontext "context"
+	"runtime/trace"
 	"sync"
 	"sync/atomic"
 
@@ -390,7 +392,14 @@ type Task struct {
 
 	// logPrefix is a string containing the task's thread ID in the root PID
 	// namespace, and is prepended to log messages emitted by Task.Infof etc.
-	logPrefix atomic.Value `state:".(string)"`
+	logPrefix atomic.Value `state:"nosave"`
+
+	// traceContext and traceTask are both used for tracing, and are
+	// updated along with the logPrefix in updateInfoLocked.
+	//
+	// These are exclusive to the task goroutine.
+	traceContext gocontext.Context `state:"nosave"`
+	traceTask    *trace.Task       `state:"nosave"`
 
 	// creds is the task's credentials.
 	//
@@ -528,14 +537,6 @@ func (t *Task) loadPtraceTracer(tracer *Task) {
 	t.ptraceTracer.Store(tracer)
 }
 
-func (t *Task) saveLogPrefix() string {
-	return t.logPrefix.Load().(string)
-}
-
-func (t *Task) loadLogPrefix(prefix string) {
-	t.logPrefix.Store(prefix)
-}
-
 func (t *Task) saveSyscallFilters() []bpf.Program {
 	if f := t.syscallFilters.Load(); f != nil {
 		return f.([]bpf.Program)
@@ -549,6 +550,7 @@ func (t *Task) loadSyscallFilters(filters []bpf.Program) {
 
 // afterLoad is invoked by stateify.
 func (t *Task) afterLoad() {
+	t.updateInfoLocked()
 	t.interruptChan = make(chan struct{}, 1)
 	t.gosched.State = TaskGoroutineNonexistent
 	if t.stop != nil {
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index dd69939f9..4a4a69ee2 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"runtime"
+	"runtime/trace"
 	"time"
 
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -133,19 +134,24 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
 		runtime.Gosched()
 	}
 
+	region := trace.StartRegion(t.traceContext, blockRegion)
 	select {
 	case <-C:
+		region.End()
 		t.SleepFinish(true)
+		// Woken by event.
 		return nil
 
 	case <-interrupt:
+		region.End()
 		t.SleepFinish(false)
 		// Return the indicated error on interrupt.
 		return syserror.ErrInterrupted
 
 	case <-timerChan:
-		// We've timed out.
+		region.End()
 		t.SleepFinish(true)
+		// We've timed out.
 		return syserror.ETIMEDOUT
 	}
 }
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 0916fd658..3eadfedb4 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -299,6 +299,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	// nt that it must receive before its task goroutine starts running.
 	tid := nt.k.tasks.Root.IDOfTask(nt)
 	defer nt.Start(tid)
+	t.traceCloneEvent(tid)
 
 	// "If fork/clone and execve are allowed by @prog, any child processes will
 	// be constrained to the same filters and system call ABI as the parent." -
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 17a089b90..90a6190f1 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -129,6 +129,7 @@ type runSyscallAfterExecStop struct {
 }
 
 func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
+	t.traceExecEvent(r.tc)
 	t.tg.pidns.owner.mu.Lock()
 	t.tg.execing = nil
 	if t.killed() {
@@ -253,7 +254,7 @@ func (t *Task) promoteLocked() {
 
 	t.tg.leader = t
 	t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
-	t.updateLogPrefixLocked()
+	t.updateInfoLocked()
 	// Reap the original leader. If it has a tracer, detach it instead of
 	// waiting for it to acknowledge the original leader's death.
 	oldLeader.exitParentNotified = true
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 535f03e50..435761e5a 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -236,6 +236,7 @@ func (*runExit) execute(t *Task) taskRunState {
 type runExitMain struct{}
 
 func (*runExitMain) execute(t *Task) taskRunState {
+	t.traceExitEvent()
 	lastExiter := t.exitThreadGroup()
 
 	// If the task has a cleartid, and the thread group wasn't killed by a
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index a29e9b9eb..0fb3661de 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"fmt"
+	"runtime/trace"
 	"sort"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -127,11 +128,88 @@ func (t *Task) debugDumpStack() {
 	}
 }
 
-// updateLogPrefix updates the task's cached log prefix to reflect its
-// current thread ID.
+// trace definitions.
+//
+// Note that all region names are prefixed by ':' in order to ensure that they
+// are lexically ordered before all system calls, which use the naked system
+// call name (e.g. "read") for maximum clarity.
+const (
+	traceCategory = "task"
+	runRegion     = ":run"
+	blockRegion   = ":block"
+	cpuidRegion   = ":cpuid"
+	faultRegion   = ":fault"
+)
+
+// updateInfoLocked updates the task's cached log prefix and tracing
+// information to reflect its current thread ID.
 //
 // Preconditions: The task's owning TaskSet.mu must be locked.
-func (t *Task) updateLogPrefixLocked() {
+func (t *Task) updateInfoLocked() {
 	// Use the task's TID in the root PID namespace for logging.
-	t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t]))
+	tid := t.tg.pidns.owner.Root.tids[t]
+	t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid))
+	t.rebuildTraceContext(tid)
+}
+
+// rebuildTraceContext rebuilds the trace context.
+//
+// Precondition: the passed tid must be the tid in the root namespace.
+func (t *Task) rebuildTraceContext(tid ThreadID) {
+	// Re-initialize the trace context.
+	if t.traceTask != nil {
+		t.traceTask.End()
+	}
+
+	// Note that we define the "task type" to be the dynamic TID. This does
+	// not align perfectly with the documentation for "tasks" in the
+	// tracing package. Tasks may be assumed to be bounded by analysis
+	// tools. However, if we just use a generic "task" type here, then the
+	// "user-defined tasks" page on the tracing dashboard becomes nearly
+	// unusable, as it loads all traces from all tasks.
+	//
+	// We can assume that the number of tasks in the system is not
+	// arbitrarily large (in general it won't be, especially for cases
+	// where we're collecting a brief profile), so using the TID is a
+	// reasonable compromise in this case.
+	t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid))
+}
+
+// traceCloneEvent is called when a new task is spawned.
+//
+// ntid must be the new task's ThreadID in the root namespace.
+func (t *Task) traceCloneEvent(ntid ThreadID) {
+	if !trace.IsEnabled() {
+		return
+	}
+	trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid)
+}
+
+// traceExitEvent is called when a task exits.
+func (t *Task) traceExitEvent() {
+	if !trace.IsEnabled() {
+		return
+	}
+	trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status())
+}
+
+// traceExecEvent is called when a task calls exec.
+func (t *Task) traceExecEvent(tc *TaskContext) {
+	if !trace.IsEnabled() {
+		return
+	}
+	d := tc.MemoryManager.Executable()
+	if d == nil {
+		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
+		return
+	}
+	defer d.DecRef()
+	root := t.fsContext.RootDirectory()
+	if root == nil {
+		trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
+		return
+	}
+	defer root.DecRef()
+	n, _ := d.FullName(root)
+	trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
 }
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index c92266c59..d97f8c189 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"bytes"
 	"runtime"
+	"runtime/trace"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -205,9 +206,11 @@ func (*runApp) execute(t *Task) taskRunState {
 		t.tg.pidns.owner.mu.RUnlock()
 	}
 
+	region := trace.StartRegion(t.traceContext, runRegion)
 	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
 	info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
 	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
+	region.End()
 
 	if clearSinglestep {
 		t.Arch().ClearSingleStep()
@@ -225,6 +228,7 @@ func (*runApp) execute(t *Task) taskRunState {
 
 	case platform.ErrContextSignalCPUID:
 		// Is this a CPUID instruction?
+		region := trace.StartRegion(t.traceContext, cpuidRegion)
 		expected := arch.CPUIDInstruction[:]
 		found := make([]byte, len(expected))
 		_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
@@ -232,10 +236,12 @@ func (*runApp) execute(t *Task) taskRunState {
 			// Skip the cpuid instruction.
 			t.Arch().CPUIDEmulate(t)
 			t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+			region.End()
 
 			// Resume execution.
 			return (*runApp)(nil)
 		}
+		region.End() // Not an actual CPUID, but required copy-in.
 
 		// The instruction at the given RIP was not a CPUID, and we
 		// fallthrough to the default signal deliver behavior below.
@@ -251,8 +257,10 @@ func (*runApp) execute(t *Task) taskRunState {
 		// an application-generated signal and we should continue execution
 		// normally.
 		if at.Any() {
+			region := trace.StartRegion(t.traceContext, faultRegion)
 			addr := usermem.Addr(info.Addr())
 			err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
+			region.End()
 			if err == nil {
 				// The fault was handled appropriately.
 				// We can resume running the application.
@@ -260,6 +268,12 @@ func (*runApp) execute(t *Task) taskRunState {
 			}
 
 			// Is this a vsyscall that we need emulate?
+			//
+			// Note that we don't track vsyscalls as part of a
+			// specific trace region. This is because regions don't
+			// stack, and the actual system call will count as a
+			// region. We should be able to easily identify
+			// vsyscalls by having a <fault><syscall> pair.
 			if at.Execute {
 				if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
 					return t.doVsyscall(addr, sysno)
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index ae6fc4025..3522a4ae5 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -154,10 +154,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 	// Below this point, newTask is expected not to fail (there is no rollback
 	// of assignTIDsLocked or any of the following).
 
-	// Logging on t's behalf will panic if t.logPrefix hasn't been initialized.
-	// This is the earliest point at which we can do so (since t now has thread
-	// IDs).
-	t.updateLogPrefixLocked()
+	// Logging on t's behalf will panic if t.logPrefix hasn't been
+	// initialized. This is the earliest point at which we can do so
+	// (since t now has thread IDs).
+	t.updateInfoLocked()
 
 	if cfg.InheritParent != nil {
 		t.parent = cfg.InheritParent.parent
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index b543d536a..3180f5560 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"fmt"
 	"os"
+	"runtime/trace"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -160,6 +161,10 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 		ctrl = ctrlStopAndReinvokeSyscall
 	} else {
 		fn := s.Lookup(sysno)
+		var region *trace.Region // Only non-nil if tracing == true.
+		if trace.IsEnabled() {
+			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
+		}
 		if fn != nil {
 			// Call our syscall implementation.
 			rval, ctrl, err = fn(t, args)
@@ -167,6 +172,9 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 			// Use the missing function if not found.
 			rval, err = t.SyscallTable().Missing(t, sysno, args)
 		}
+		if region != nil {
+			region.End()
+		}
 	}
 
 	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
index 34f84487a..048de26dc 100644
--- a/pkg/sentry/kernel/tty.go
+++ b/pkg/sentry/kernel/tty.go
@@ -21,8 +21,19 @@ import "sync"
 //
 // +stateify savable
 type TTY struct {
+	// Index is the terminal index. It is immutable.
+	Index uint32
+
 	mu sync.Mutex `state:"nosave"`
 
 	// tg is protected by mu.
 	tg *ThreadGroup
 }
+
+// TTY returns the thread group's controlling terminal. If nil, there is no
+// controlling terminal.
+func (tg *ThreadGroup) TTY() *TTY {
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+	return tg.tty
+}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index c2c3ec06e..6299a3e2f 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -408,6 +408,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
 				start = vaddr
 			}
 			if vaddr < end {
+				// NOTE(b/37474556): Linux allows out-of-order
+				// segments, in violation of the spec.
 				ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end)
 				return loadedELF{}, syserror.ENOEXEC
 			}
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index 3ef84245b..112794e9c 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -41,7 +41,6 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
-        "//pkg/refs",
         "//pkg/sentry/context",
         "//pkg/sentry/platform",
         "//pkg/sentry/usermem",
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index 03b99aaea..16a722a13 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -18,7 +18,6 @@ package memmap
 import (
 	"fmt"
 
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -235,8 +234,11 @@ type InvalidateOpts struct {
 // coincidental; fs.File implements MappingIdentity, and some
 // fs.InodeOperations implement Mappable.)
 type MappingIdentity interface {
-	// MappingIdentity is reference-counted.
-	refs.RefCounter
+	// IncRef increments the MappingIdentity's reference count.
+	IncRef()
+
+	// DecRef decrements the MappingIdentity's reference count.
+	DecRef()
 
 	// MappedName returns the application-visible name shown in
 	// /proc/[pid]/maps.
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 6803d488c..5c52d4007 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -12,18 +12,26 @@ go_library(
         "bluepill_amd64.go",
         "bluepill_amd64.s",
         "bluepill_amd64_unsafe.go",
+        "bluepill_arm64.go",
+        "bluepill_arm64.s",
+        "bluepill_arm64_unsafe.go",
         "bluepill_fault.go",
         "bluepill_unsafe.go",
         "context.go",
-        "filters.go",
+        "filters_amd64.go",
+        "filters_arm64.go",
         "kvm.go",
         "kvm_amd64.go",
         "kvm_amd64_unsafe.go",
+        "kvm_arm64.go",
+        "kvm_arm64_unsafe.go",
         "kvm_const.go",
+        "kvm_const_arm64.go",
         "machine.go",
         "machine_amd64.go",
         "machine_amd64_unsafe.go",
         "machine_arm64.go",
+        "machine_arm64_unsafe.go",
         "machine_unsafe.go",
         "physical_map.go",
         "virtual_map.go",
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 043de51b3..30dbb74d6 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -20,6 +20,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
 )
 
@@ -36,6 +37,18 @@ func sighandler()
 func dieTrampoline()
 
 var (
+	// bounceSignal is the signal used for bouncing KVM.
+	//
+	// We use SIGCHLD because it is not masked by the runtime, and
+	// it will be ignored properly by other parts of the kernel.
+	bounceSignal = syscall.SIGCHLD
+
+	// bounceSignalMask has only bounceSignal set.
+	bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
+
+	// bounce is the interrupt vector used to return to the kernel.
+	bounce = uint32(ring0.VirtualizationException)
+
 	// savedHandler is a pointer to the previous handler.
 	//
 	// This is called by bluepillHandler.
@@ -45,6 +58,13 @@ var (
 	dieTrampolineAddr uintptr
 )
 
+// redpill invokes a syscall with -1.
+//
+//go:nosplit
+func redpill() {
+	syscall.RawSyscall(^uintptr(0), 0, 0, 0)
+}
+
 // dieHandler is called by dieTrampoline.
 //
 //go:nosplit
@@ -73,8 +93,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
 
 func init() {
 	// Install the handler.
-	if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
-		panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err))
+	if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
+		panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
 	}
 
 	// Extract the address for the trampoline.
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index 421c88220..133c2203d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -24,26 +24,10 @@ import (
 )
 
 var (
-	// bounceSignal is the signal used for bouncing KVM.
-	//
-	// We use SIGCHLD because it is not masked by the runtime, and
-	// it will be ignored properly by other parts of the kernel.
-	bounceSignal = syscall.SIGCHLD
-
-	// bounceSignalMask has only bounceSignal set.
-	bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
-
-	// bounce is the interrupt vector used to return to the kernel.
-	bounce = uint32(ring0.VirtualizationException)
+	// The action for bluepillSignal is changed by sigaction().
+	bluepillSignal = syscall.SIGSEGV
 )
 
-// redpill on amd64 invokes a syscall with -1.
-//
-//go:nosplit
-func redpill() {
-	syscall.RawSyscall(^uintptr(0), 0, 0, 0)
-}
-
 // bluepillArchEnter is called during bluepillEnter.
 //
 //go:nosplit
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 9d8af143e..a63a6a071 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -23,13 +23,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
-// bluepillArchContext returns the arch-specific context.
-//
-//go:nosplit
-func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
-	return &((*arch.UContext64)(context).MContext)
-}
-
 // dieArchSetup initializes the state for dieTrampoline.
 //
 // The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
new file mode 100644
index 000000000..552341721
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -0,0 +1,79 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+)
+
+var (
+	// The action for bluepillSignal is changed by sigaction().
+	bluepillSignal = syscall.SIGILL
+)
+
+// bluepillArchEnter is called during bluepillEnter.
+//
+//go:nosplit
+func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
+	c = vCPUPtr(uintptr(context.Regs[8]))
+	regs := c.CPU.Registers()
+	regs.Regs = context.Regs
+	regs.Sp = context.Sp
+	regs.Pc = context.Pc
+	regs.Pstate = context.Pstate
+	regs.Pstate &^= uint64(ring0.KernelFlagsClear)
+	regs.Pstate |= ring0.KernelFlagsSet
+	return
+}
+
+// bluepillArchExit is called during bluepillEnter.
+//
+//go:nosplit
+func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
+	regs := c.CPU.Registers()
+	context.Regs = regs.Regs
+	context.Sp = regs.Sp
+	context.Pc = regs.Pc
+	context.Pstate = regs.Pstate
+	context.Pstate &^= uint64(ring0.UserFlagsClear)
+	context.Pstate |= ring0.UserFlagsSet
+}
+
+// KernelSyscall handles kernel syscalls.
+//
+//go:nosplit
+func (c *vCPU) KernelSyscall() {
+	regs := c.Registers()
+	if regs.Regs[8] != ^uint64(0) {
+		regs.Pc -= 4 // Rewind.
+	}
+	ring0.Halt()
+}
+
+// KernelException handles kernel exceptions.
+//
+//go:nosplit
+func (c *vCPU) KernelException(vector ring0.Vector) {
+	regs := c.Registers()
+	if vector == ring0.Vector(bounce) {
+		regs.Pc = 0
+	}
+	ring0.Halt()
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
new file mode 100644
index 000000000..c61700892
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -0,0 +1,87 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// VCPU_CPU is the location of the CPU in the vCPU struct.
+//
+// This is guaranteed to be zero.
+#define VCPU_CPU 0x0
+
+// CPU_SELF is the self reference in ring0's percpu.
+//
+// This is guaranteed to be zero.
+#define CPU_SELF 0x0
+
+// Context offsets.
+//
+// Only limited use of the context is done in the assembly stub below, most is
+// done in the Go handlers.
+#define SIGINFO_SIGNO 0x0
+#define CONTEXT_PC  0x1B8
+#define CONTEXT_R0 0xB8
+
+// See bluepill.go.
+TEXT ·bluepill(SB),NOSPLIT,$0
+begin:
+	MOVD	vcpu+0(FP), R8
+	MOVD	$VCPU_CPU(R8), R9
+	ORR	$0xffff000000000000, R9, R9
+	// Trigger sigill.
+	// In ring0.Start(), the value of R8 will be stored into tpidr_el1.
+	// When the context was loaded into vcpu successfully,
+	// we will check if the value of R10 and R9 are the same.
+	WORD	$0xd538d08a // MRS TPIDR_EL1, R10
+check_vcpu:
+	CMP	R10, R9
+	BEQ	right_vCPU
+wrong_vcpu:
+	CALL	·redpill(SB)
+	B	begin
+right_vCPU:
+	RET
+
+// sighandler: see bluepill.go for documentation.
+//
+// The arguments are the following:
+//
+// 	R0 - The signal number.
+// 	R1 - Pointer to siginfo_t structure.
+// 	R2 - Pointer to ucontext structure.
+//
+TEXT ·sighandler(SB),NOSPLIT,$0
+	// si_signo should be sigill.
+	MOVD	SIGINFO_SIGNO(R1), R7
+	CMPW	$4, R7
+	BNE	fallback
+
+	MOVD	CONTEXT_PC(R2), R7
+	CMPW	$0, R7
+	BEQ	fallback
+
+	MOVD	R2, 8(RSP)
+	BL	·bluepillHandler(SB)   // Call the handler.
+
+	RET
+
+fallback:
+	// Jump to the previous signal handler.
+	MOVD	·savedHandler(SB), R7
+	B	(R7)
+
+// dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation.
+TEXT ·dieTrampoline(SB),NOSPLIT,$0
+	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
+	MOVD	R9, 8(RSP)
+	BL	·dieHandler(SB)
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
new file mode 100644
index 000000000..e5fac0d6a
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+//go:nosplit
+func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+	// TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64.
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index ca011ef78..9add7c944 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -23,6 +23,8 @@ import (
 	"sync/atomic"
 	"syscall"
 	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
 //go:linkname throw runtime.throw
@@ -49,6 +51,13 @@ func uintptrValue(addr *byte) uintptr {
 	return (uintptr)(unsafe.Pointer(addr))
 }
 
+// bluepillArchContext returns the UContext64.
+//
+//go:nosplit
+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
+	return &((*arch.UContext64)(context).MContext)
+}
+
 // bluepillHandler is called from the signal stub.
 //
 // The world may be stopped while this is executing, and it executes on the
diff --git a/pkg/sentry/platform/kvm/filters.go b/pkg/sentry/platform/kvm/filters_amd64.go
index 7d949f1dd..7d949f1dd 100644
--- a/pkg/sentry/platform/kvm/filters.go
+++ b/pkg/sentry/platform/kvm/filters_amd64.go
diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go
new file mode 100644
index 000000000..9245d07c2
--- /dev/null
+++ b/pkg/sentry/platform/kvm/filters_arm64.go
@@ -0,0 +1,32 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// SyscallFilters returns syscalls made exclusively by the KVM platform.
+func (*KVM) SyscallFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_IOCTL:           {},
+		syscall.SYS_MMAP:            {},
+		syscall.SYS_RT_SIGSUSPEND:   {},
+		syscall.SYS_RT_SIGTIMEDWAIT: {},
+		0xffffffffffffffff:          {}, // KVM uses syscall -1 to transition to host.
+	}
+}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index ee4cd2f4d..f2c2c059e 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -21,7 +21,6 @@ import (
 	"sync"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
@@ -56,9 +55,7 @@ func New(deviceFile *os.File) (*KVM, error) {
 
 	// Ensure global initialization is done.
 	globalOnce.Do(func() {
-		physicalInit()
-		globalErr = updateSystemValues(int(fd))
-		ring0.Init(cpuid.HostFeatureSet())
+		globalErr = updateGlobalOnce(int(fd))
 	})
 	if globalErr != nil {
 		return nil, globalErr
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
index 5d8ef4761..c5a6f9c7d 100644
--- a/pkg/sentry/platform/kvm/kvm_amd64.go
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -17,6 +17,7 @@
 package kvm
 
 import (
+	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
 )
 
@@ -211,3 +212,11 @@ type cpuidEntries struct {
 	_       uint32
 	entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry
 }
+
+// updateGlobalOnce does global initialization. It has to be called only once.
+func updateGlobalOnce(fd int) error {
+	physicalInit()
+	err := updateSystemValues(int(fd))
+	ring0.Init(cpuid.HostFeatureSet())
+	return err
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
new file mode 100644
index 000000000..2319c86d3
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -0,0 +1,83 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"syscall"
+)
+
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+	slot          uint32
+	flags         uint32
+	guestPhysAddr uint64
+	memorySize    uint64
+	userspaceAddr uint64
+}
+
+type kvmOneReg struct {
+	id   uint64
+	addr uint64
+}
+
+const KVM_NR_SPSR = 5
+
+type userFpsimdState struct {
+	vregs    [64]uint64
+	fpsr     uint32
+	fpcr     uint32
+	reserved [2]uint32
+}
+
+type userRegs struct {
+	Regs    syscall.PtraceRegs
+	sp_el1  uint64
+	elr_el1 uint64
+	spsr    [KVM_NR_SPSR]uint64
+	fpRegs  userFpsimdState
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that doesn't appear to be supported by my kernel at least).
+//
+// This mirrors kvm_run.
+type runData struct {
+	requestInterruptWindow uint8
+	_                      [7]uint8
+
+	exitReason                 uint32
+	readyForInterruptInjection uint8
+	ifFlag                     uint8
+	_                          [2]uint8
+
+	cr8      uint64
+	apicBase uint64
+
+	// This is the union data for exits. Interpretation depends entirely on
+	// the exitReason above (see vCPU code for more information).
+	data [32]uint64
+}
+
+// updateGlobalOnce does global initialization. It has to be called only once.
+func updateGlobalOnce(fd int) error {
+	physicalInit()
+	err := updateSystemValues(int(fd))
+	updateVectorTable()
+	return err
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
new file mode 100644
index 000000000..6531bae1d
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"fmt"
+	"syscall"
+)
+
+var (
+	runDataSize int
+)
+
+func updateSystemValues(fd int) error {
+	// Extract the mmap size.
+	sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0)
+	if errno != 0 {
+		return fmt.Errorf("getting VCPU mmap size: %v", errno)
+	}
+	// Save the data.
+	runDataSize = int(sz)
+
+	// Success.
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 766131d60..1d5c77ff4 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -49,11 +49,13 @@ const (
 	_KVM_EXIT_SHUTDOWN        = 0x8
 	_KVM_EXIT_FAIL_ENTRY      = 0x9
 	_KVM_EXIT_INTERNAL_ERROR  = 0x11
+	_KVM_EXIT_SYSTEM_EVENT    = 0x18
 )
 
 // KVM capability options.
 const (
-	_KVM_CAP_MAX_VCPUS = 0x42
+	_KVM_CAP_MAX_VCPUS       = 0x42
+	_KVM_CAP_ARM_VM_IPA_SIZE = 0xa5
 )
 
 // KVM limits.
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
new file mode 100644
index 000000000..5a74c6e36
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -0,0 +1,132 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+// KVM ioctls for Arm64.
+const (
+	_KVM_GET_ONE_REG = 0x4010aeab
+	_KVM_SET_ONE_REG = 0x4010aeac
+
+	_KVM_ARM_PREFERRED_TARGET = 0x8020aeaf
+	_KVM_ARM_VCPU_INIT        = 0x4020aeae
+	_KVM_ARM64_REGS_PSTATE    = 0x6030000000100042
+	_KVM_ARM64_REGS_SP_EL1    = 0x6030000000100044
+	_KVM_ARM64_REGS_R0        = 0x6030000000100000
+	_KVM_ARM64_REGS_R1        = 0x6030000000100002
+	_KVM_ARM64_REGS_R2        = 0x6030000000100004
+	_KVM_ARM64_REGS_R3        = 0x6030000000100006
+	_KVM_ARM64_REGS_R8        = 0x6030000000100010
+	_KVM_ARM64_REGS_R18       = 0x6030000000100024
+	_KVM_ARM64_REGS_PC        = 0x6030000000100040
+	_KVM_ARM64_REGS_MAIR_EL1  = 0x603000000013c510
+	_KVM_ARM64_REGS_TCR_EL1   = 0x603000000013c102
+	_KVM_ARM64_REGS_TTBR0_EL1 = 0x603000000013c100
+	_KVM_ARM64_REGS_TTBR1_EL1 = 0x603000000013c101
+	_KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080
+	_KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082
+	_KVM_ARM64_REGS_VBAR_EL1  = 0x603000000013c600
+)
+
+// Arm64: Architectural Feature Access Control Register EL1.
+const (
+	_FPEN_NOTRAP = 0x3
+	_FPEN_SHIFT  = 0x20
+)
+
+// Arm64: System Control Register EL1.
+const (
+	_SCTLR_M = 1 << 0
+	_SCTLR_C = 1 << 2
+	_SCTLR_I = 1 << 12
+)
+
+// Arm64: Translation Control Register EL1.
+const (
+	_TCR_IPS_40BITS = 2 << 32 // PA=40
+	_TCR_IPS_48BITS = 5 << 32 // PA=48
+
+	_TCR_T0SZ_OFFSET = 0
+	_TCR_T1SZ_OFFSET = 16
+	_TCR_IRGN0_SHIFT = 8
+	_TCR_IRGN1_SHIFT = 24
+	_TCR_ORGN0_SHIFT = 10
+	_TCR_ORGN1_SHIFT = 26
+	_TCR_SH0_SHIFT   = 12
+	_TCR_SH1_SHIFT   = 28
+	_TCR_TG0_SHIFT   = 14
+	_TCR_TG1_SHIFT   = 30
+
+	_TCR_T0SZ_VA48 = 64 - 48 // VA=48
+	_TCR_T1SZ_VA48 = 64 - 48 // VA=48
+
+	_TCR_ASID16 = 1 << 36
+	_TCR_TBI0   = 1 << 37
+
+	_TCR_TXSZ_VA48 = (_TCR_T0SZ_VA48 << _TCR_T0SZ_OFFSET) | (_TCR_T1SZ_VA48 << _TCR_T1SZ_OFFSET)
+
+	_TCR_TG0_4K  = 0 << _TCR_TG0_SHIFT // 4K
+	_TCR_TG0_64K = 1 << _TCR_TG0_SHIFT // 64K
+
+	_TCR_TG1_4K = 2 << _TCR_TG1_SHIFT
+
+	_TCR_TG_FLAGS = _TCR_TG0_4K | _TCR_TG1_4K
+
+	_TCR_IRGN0_WBWA = 1 << _TCR_IRGN0_SHIFT
+	_TCR_IRGN1_WBWA = 1 << _TCR_IRGN1_SHIFT
+	_TCR_IRGN_WBWA  = _TCR_IRGN0_WBWA | _TCR_IRGN1_WBWA
+
+	_TCR_ORGN0_WBWA = 1 << _TCR_ORGN0_SHIFT
+	_TCR_ORGN1_WBWA = 1 << _TCR_ORGN1_SHIFT
+
+	_TCR_ORGN_WBWA = _TCR_ORGN0_WBWA | _TCR_ORGN1_WBWA
+
+	_TCR_SHARED = (3 << _TCR_SH0_SHIFT) | (3 << _TCR_SH1_SHIFT)
+
+	_TCR_CACHE_FLAGS = _TCR_IRGN_WBWA | _TCR_ORGN_WBWA
+)
+
+// Arm64: Memory Attribute Indirection Register EL1.
+const (
+	_MT_DEVICE_nGnRnE = 0
+	_MT_DEVICE_nGnRE  = 1
+	_MT_DEVICE_GRE    = 2
+	_MT_NORMAL_NC     = 3
+	_MT_NORMAL        = 4
+	_MT_NORMAL_WT     = 5
+	_MT_EL1_INIT      = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8)
+)
+
+const (
+	_KVM_ARM_VCPU_POWER_OFF = 0 // CPU is started in OFF state
+	_KVM_ARM_VCPU_PSCI_0_2  = 2 // CPU uses PSCI v0.2
+)
+
+// Arm64: Exception Syndrome Register EL1.
+const (
+	_ESR_ELx_FSC = 0x3F
+
+	_ESR_SEGV_MAPERR_L0 = 0x4
+	_ESR_SEGV_MAPERR_L1 = 0x5
+	_ESR_SEGV_MAPERR_L2 = 0x6
+	_ESR_SEGV_MAPERR_L3 = 0x7
+
+	_ESR_SEGV_ACCERR_L1 = 0x9
+	_ESR_SEGV_ACCERR_L2 = 0xa
+	_ESR_SEGV_ACCERR_L3 = 0xb
+
+	_ESR_SEGV_PEMERR_L1 = 0xd
+	_ESR_SEGV_PEMERR_L2 = 0xe
+	_ESR_SEGV_PEMERR_L3 = 0xf
+)
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 61227cafb..7156c245f 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -135,3 +135,43 @@ func (c *vCPU) setSignalMask() error {
 	}
 	return nil
 }
+
+// setUserRegisters sets user registers in the vCPU.
+func (c *vCPU) setUserRegisters(uregs *userRegs) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_REGS,
+		uintptr(unsafe.Pointer(uregs))); errno != 0 {
+		return fmt.Errorf("error setting user registers: %v", errno)
+	}
+	return nil
+}
+
+// getUserRegisters reloads user registers in the vCPU.
+//
+// This is safe to call from a nosplit context.
+//
+//go:nosplit
+func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_GET_REGS,
+		uintptr(unsafe.Pointer(uregs))); errno != 0 {
+		return errno
+	}
+	return 0
+}
+
+// setSystemRegisters sets system registers.
+func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SREGS,
+		uintptr(unsafe.Pointer(sregs))); errno != 0 {
+		return fmt.Errorf("error setting system registers: %v", errno)
+	}
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index b7e2cfb9d..7ae47f291 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -12,8 +12,38 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build arm64
+
 package kvm
 
+import (
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+type vCPUArchState struct {
+	// PCIDs is the set of PCIDs for this vCPU.
+	//
+	// This starts above fixedKernelPCID.
+	PCIDs *pagetables.PCIDs
+}
+
+const (
+	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
+	// tables. We must start allocating user PCIDs above this in order to
+	// avoid any conflict (see below).
+	fixedKernelPCID = 1
+
+	// poolPCIDs is the number of PCIDs to record in the database. As this
+	// grows, assignment can take longer, since it is a simple linear scan.
+	// Beyond a relatively small number, there are likely few perform
+	// benefits, since the TLB has likely long since lost any translations
+	// from more than a few PCIDs past.
+	poolPCIDs = 8
+)
+
 // Get all read-only physicalRegions.
 func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
 	var rdonlyRegions []region
@@ -59,3 +89,95 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
 
 	return phyRegions
 }
+
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	// Clear from all PCIDs.
+	for _, c := range m.vCPUs {
+		c.PCIDs.Drop(pt)
+	}
+}
+
+// nonCanonical generates a canonical address return.
+//
+//go:nosplit
+func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+	*info = arch.SignalInfo{
+		Signo: signal,
+		Code:  arch.SignalInfoKernel,
+	}
+	info.SetAddr(addr) // Include address.
+	return usermem.NoAccess, platform.ErrContextSignal
+}
+
+// fault generates an appropriate fault return.
+//
+//go:nosplit
+func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+	faultAddr := c.GetFaultAddr()
+	code, user := c.ErrorCode()
+
+	// Reset the pointed SignalInfo.
+	*info = arch.SignalInfo{Signo: signal}
+	info.SetAddr(uint64(faultAddr))
+
+	read := true
+	write := false
+	execute := true
+
+	ret := code & _ESR_ELx_FSC
+	switch ret {
+	case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
+		info.Code = 1 //SEGV_MAPERR
+		read = false
+		write = true
+		execute = false
+	case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
+		info.Code = 2 // SEGV_ACCERR.
+		read = true
+		write = false
+		execute = false
+	default:
+		info.Code = 2
+	}
+
+	if !user {
+		read = true
+		write = false
+		execute = true
+
+	}
+	accessType := usermem.AccessType{
+		Read:    read,
+		Write:   write,
+		Execute: execute,
+	}
+
+	return accessType, platform.ErrContextSignal
+}
+
+// retryInGuest runs the given function in guest mode.
+//
+// If the function does not complete in guest mode (due to execution of a
+// system call due to a GC stall, for example), then it will be retried. The
+// given function must be idempotent as a result of the retry mechanism.
+func (m *machine) retryInGuest(fn func()) {
+	c := m.Get()
+	defer m.Put(c)
+	for {
+		c.ClearErrorCode() // See below.
+		bluepill(c)        // Force guest mode.
+		fn()               // Execute the given function.
+		_, user := c.ErrorCode()
+		if user {
+			// If user is set, then we haven't bailed back to host
+			// mode via a kernel exception or system call. We
+			// consider the full function to have executed in guest
+			// mode and we can return.
+			break
+		}
+	}
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
new file mode 100644
index 000000000..3f2f97a6b
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -0,0 +1,362 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+	"fmt"
+	"reflect"
+	"sync/atomic"
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// setMemoryRegion initializes a region.
+//
+// This may be called from bluepillHandler, and therefore returns an errno
+// directly (instead of wrapping in an error) to avoid allocations.
+//
+//go:nosplit
+func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
+	userRegion := userMemoryRegion{
+		slot:          uint32(slot),
+		flags:         0,
+		guestPhysAddr: uint64(physical),
+		memorySize:    uint64(length),
+		userspaceAddr: uint64(virtual),
+	}
+
+	// Set the region.
+	_, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(m.fd),
+		_KVM_SET_USER_MEMORY_REGION,
+		uintptr(unsafe.Pointer(&userRegion)))
+	return errno
+}
+
+type kvmVcpuInit struct {
+	target   uint32
+	features [7]uint32
+}
+
+var vcpuInit kvmVcpuInit
+
+// initArchState initializes architecture-specific state.
+func (m *machine) initArchState() error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(m.fd),
+		_KVM_ARM_PREFERRED_TARGET,
+		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
+		panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno))
+	}
+	return nil
+}
+
+func getPageWithReflect(p uintptr) []byte {
+	return (*(*[0xFFFFFF]byte)(unsafe.Pointer(p & ^uintptr(syscall.Getpagesize()-1))))[:syscall.Getpagesize()]
+}
+
+// Work around: move ring0.Vectors() into a specific address with 11-bits alignment.
+//
+// According to the design documentation of Arm64,
+// the start address of exception vector table should be 11-bits aligned.
+// Please see the code in linux kernel as reference: arch/arm64/kernel/entry.S
+// But, we can't align a function's start address to a specific address by using golang.
+// We have raised this question in golang community:
+// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
+// This function will be removed when golang supports this feature.
+//
+// There are 2 jobs were implemented in this function:
+// 1, move the start address of exception vector table into the specific address.
+// 2, modify the offset of each instruction.
+func updateVectorTable() {
+	fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
+	offset := fromLocation & (1<<11 - 1)
+	if offset != 0 {
+		offset = 1<<11 - offset
+	}
+
+	toLocation := fromLocation + offset
+	page := getPageWithReflect(toLocation)
+	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
+		panic(err)
+	}
+
+	page = getPageWithReflect(toLocation + 4096)
+	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
+		panic(err)
+	}
+
+	// Move exception-vector-table into the specific address.
+	var entry *uint32
+	var entryFrom *uint32
+	for i := 1; i <= 0x800; i++ {
+		entry = (*uint32)(unsafe.Pointer(toLocation + 0x800 - uintptr(i)))
+		entryFrom = (*uint32)(unsafe.Pointer(fromLocation + 0x800 - uintptr(i)))
+		*entry = *entryFrom
+	}
+
+	// The offset from the address of each unconditionally branch is changed.
+	// We should modify the offset of each instruction.
+	nums := []uint32{0x0, 0x80, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, 0x480, 0x500, 0x580, 0x600, 0x680, 0x700, 0x780}
+	for _, num := range nums {
+		entry = (*uint32)(unsafe.Pointer(toLocation + uintptr(num)))
+		*entry = *entry - (uint32)(offset/4)
+	}
+
+	page = getPageWithReflect(toLocation)
+	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
+		panic(err)
+	}
+
+	page = getPageWithReflect(toLocation + 4096)
+	if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
+		panic(err)
+	}
+}
+
+// initArchState initializes architecture-specific state.
+func (c *vCPU) initArchState() error {
+	var (
+		reg     kvmOneReg
+		data    uint64
+		regGet  kvmOneReg
+		dataGet uint64
+	)
+
+	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
+	regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())
+
+	vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_ARM_VCPU_INIT,
+		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
+		panic(fmt.Sprintf("error setting KVM_ARM_VCPU_INIT failed: %v", errno))
+	}
+
+	// cpacr_el1
+	reg.id = _KVM_ARM64_REGS_CPACR_EL1
+	data = (_FPEN_NOTRAP << _FPEN_SHIFT)
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// sctlr_el1
+	regGet.id = _KVM_ARM64_REGS_SCTLR_EL1
+	if err := c.getOneRegister(&regGet); err != nil {
+		return err
+	}
+
+	dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I)
+	data = dataGet
+	reg.id = _KVM_ARM64_REGS_SCTLR_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// tcr_el1
+	data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
+	reg.id = _KVM_ARM64_REGS_TCR_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// mair_el1
+	data = _MT_EL1_INIT
+	reg.id = _KVM_ARM64_REGS_MAIR_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// ttbr0_el1
+	data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0)
+
+	reg.id = _KVM_ARM64_REGS_TTBR0_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	c.SetTtbr0Kvm(uintptr(data))
+
+	// ttbr1_el1
+	data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
+
+	reg.id = _KVM_ARM64_REGS_TTBR1_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// sp_el1
+	data = c.CPU.StackTop()
+	reg.id = _KVM_ARM64_REGS_SP_EL1
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// pc
+	reg.id = _KVM_ARM64_REGS_PC
+	data = uint64(reflect.ValueOf(ring0.Start).Pointer())
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// r8
+	reg.id = _KVM_ARM64_REGS_R8
+	data = uint64(reflect.ValueOf(&c.CPU).Pointer())
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	// vbar_el1
+	reg.id = _KVM_ARM64_REGS_VBAR_EL1
+
+	fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
+	offset := fromLocation & (1<<11 - 1)
+	if offset != 0 {
+		offset = 1<<11 - offset
+	}
+
+	toLocation := fromLocation + offset
+	data = uint64(ring0.KernelStartAddress | toLocation)
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	data = ring0.PsrDefaultSet | ring0.KernelFlagsSet
+	reg.id = _KVM_ARM64_REGS_PSTATE
+	if err := c.setOneRegister(&reg); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+//go:nosplit
+func (c *vCPU) loadSegments(tid uint64) {
+	// TODO(gvisor.dev/issue/1238):  TLS is not supported.
+	// Get TLS from tpidr_el0.
+	atomic.StoreUint64(&c.tid, tid)
+}
+
+func (c *vCPU) setOneRegister(reg *kvmOneReg) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_ONE_REG,
+		uintptr(unsafe.Pointer(reg))); errno != 0 {
+		return fmt.Errorf("error setting one register: %v", errno)
+	}
+	return nil
+}
+
+func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_GET_ONE_REG,
+		uintptr(unsafe.Pointer(reg))); errno != 0 {
+		return fmt.Errorf("error setting one register: %v", errno)
+	}
+	return nil
+}
+
+// setCPUID sets the CPUID to be used by the guest.
+func (c *vCPU) setCPUID() error {
+	return nil
+}
+
+// setSystemTime sets the TSC for the vCPU.
+func (c *vCPU) setSystemTime() error {
+	return nil
+}
+
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+	// The layout of this structure implies that it will not necessarily be
+	// the same layout chosen by the Go compiler. It gets fudged here.
+	var data struct {
+		length uint32
+		mask1  uint32
+		mask2  uint32
+		_      uint32
+	}
+	data.length = 8 // Fixed sigset size.
+	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+	data.mask2 = ^uint32(bounceSignalMask >> 32)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SIGNAL_MASK,
+		uintptr(unsafe.Pointer(&data))); errno != 0 {
+		return fmt.Errorf("error setting signal mask: %v", errno)
+	}
+
+	return nil
+}
+
+// SwitchToUser unpacks architectural-details.
+func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
+	// Check for canonical addresses.
+	if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) {
+		return nonCanonical(regs.Pc, int32(syscall.SIGSEGV), info)
+	} else if !ring0.IsCanonical(regs.Sp) {
+		return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info)
+	}
+
+	var vector ring0.Vector
+	ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
+	c.SetTtbr0App(uintptr(ttbr0App))
+
+	// TODO(gvisor.dev/issue/1238): full context-switch supporting for Arm64.
+	// The Arm64 user-mode execution state consists of:
+	// x0-x30
+	// PC, SP, PSTATE
+	// V0-V31: 32 128-bit registers for floating point, and simd
+	// FPSR
+	// TPIDR_EL0, used for TLS
+	appRegs := switchOpts.Registers
+	c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs)))
+
+	entersyscall()
+	bluepill(c)
+	vector = c.CPU.SwitchToUser(switchOpts)
+	exitsyscall()
+
+	switch vector {
+	case ring0.Syscall:
+		// Fast path: system call executed.
+		return usermem.NoAccess, nil
+
+	case ring0.PageFault:
+		return c.fault(int32(syscall.SIGSEGV), info)
+	case 0xaa:
+		return usermem.NoAccess, nil
+	default:
+		return usermem.NoAccess, platform.ErrContextSignal
+	}
+
+}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index ed9433311..f04be2ab5 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -87,46 +87,6 @@ func unmapRunData(r *runData) error {
 	return nil
 }
 
-// setUserRegisters sets user registers in the vCPU.
-func (c *vCPU) setUserRegisters(uregs *userRegs) error {
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_SET_REGS,
-		uintptr(unsafe.Pointer(uregs))); errno != 0 {
-		return fmt.Errorf("error setting user registers: %v", errno)
-	}
-	return nil
-}
-
-// getUserRegisters reloads user registers in the vCPU.
-//
-// This is safe to call from a nosplit context.
-//
-//go:nosplit
-func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_GET_REGS,
-		uintptr(unsafe.Pointer(uregs))); errno != 0 {
-		return errno
-	}
-	return 0
-}
-
-// setSystemRegisters sets system registers.
-func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
-	if _, _, errno := syscall.RawSyscall(
-		syscall.SYS_IOCTL,
-		uintptr(c.fd),
-		_KVM_SET_SREGS,
-		uintptr(unsafe.Pointer(sregs))); errno != 0 {
-		return fmt.Errorf("error setting system registers: %v", errno)
-	}
-	return nil
-}
-
 // atomicAddressSpace is an atomic address space pointer.
 type atomicAddressSpace struct {
 	pointer unsafe.Pointer
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 4649a94a7..a55cff507 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -21,6 +21,8 @@ import (
 	"strings"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
@@ -143,3 +145,43 @@ func (t *thread) adjustInitRegsRip() {
 func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 	initregs.R15 = uint64(ppid)
 }
+
+// patchSignalInfo patches the signal info to account for hitting the seccomp
+// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
+// synchronous trap, but patch the structure to appear like a SIGSEGV with the
+// Rip as the faulting address.
+//
+// Note that this should only be called after verifying that the signalInfo has
+// been generated by the kernel.
+func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
+		signalInfo.Signo = int32(linux.SIGSEGV)
+
+		// Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
+		// with the si_call_addr field pointing to the current RIP. This field
+		// aligns with the si_addr field for a SIGSEGV, so we don't need to touch
+		// anything there. We do need to unwind emulation however, so we set the
+		// instruction pointer to the faulting value, and "unpop" the stack.
+		regs.Rip = signalInfo.Addr()
+		regs.Rsp -= 8
+	}
+}
+
+// enableCpuidFault enable cpuid-faulting; this may fail on older kernels or hardware,
+// so we just disregard the result. Host CPUID will be enabled.
+func enableCpuidFault() {
+	syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0)
+}
+
+// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
+// Ref attachedThread() for more detail.
+func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+	return append(rules, seccomp.RuleSet{
+		Rules: seccomp.SyscallRules{
+			syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+				{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+			},
+		},
+		Action: linux.SECCOMP_RET_ALLOW,
+	})
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index bec884ba5..aed34e7ee 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -17,8 +17,12 @@
 package ptrace
 
 import (
+	"fmt"
+	"strings"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 )
 
@@ -37,7 +41,7 @@ const (
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
-func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
+func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
 }
 
 // createSyscallRegs sets up syscall registers.
@@ -124,3 +128,34 @@ func (t *thread) adjustInitRegsRip() {
 func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
 	initregs.Regs[7] = uint64(ppid)
 }
+
+// patchSignalInfo patches the signal info to account for hitting the seccomp
+// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
+// synchronous trap, but patch the structure to appear like a SIGSEGV with the
+// Rip as the faulting address.
+//
+// Note that this should only be called after verifying that the signalInfo has
+// been generated by the kernel.
+func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
+		signalInfo.Signo = int32(linux.SIGSEGV)
+
+		// Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
+		// with the si_call_addr field pointing to the current RIP. This field
+		// aligns with the si_addr field for a SIGSEGV, so we don't need to touch
+		// anything there. We do need to unwind emulation however, so we set the
+		// instruction pointer to the faulting value, and "unpop" the stack.
+		regs.Pc = signalInfo.Addr()
+		regs.Sp -= 8
+	}
+}
+
+// Noop on arm64.
+func enableCpuidFault() {
+}
+
+// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program.
+// Ref attachedThread() for more detail.
+func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+	return rules
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 3782d4332..cf13ea5e4 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"syscall"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/procid"
@@ -77,27 +78,6 @@ func probeSeccomp() bool {
 	}
 }
 
-// patchSignalInfo patches the signal info to account for hitting the seccomp
-// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
-// synchronous trap, but patch the structure to appear like a SIGSEGV with the
-// Rip as the faulting address.
-//
-// Note that this should only be called after verifying that the signalInfo has
-// been generated by the kernel.
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
-	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
-		signalInfo.Signo = int32(linux.SIGSEGV)
-
-		// Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
-		// with the si_call_addr field pointing to the current RIP. This field
-		// aligns with the si_addr field for a SIGSEGV, so we don't need to touch
-		// anything there. We do need to unwind emulation however, so we set the
-		// instruction pointer to the faulting value, and "unpop" the stack.
-		regs.Rip = signalInfo.Addr()
-		regs.Rsp -= 8
-	}
-}
-
 // createStub creates a fresh stub processes.
 //
 // Precondition: the runtime OS thread must be locked.
@@ -149,7 +129,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			Rules: seccomp.SyscallRules{
 				syscall.SYS_GETTIMEOFDAY: {},
 				syscall.SYS_TIME:         {},
-				309:                      {}, // SYS_GETCPU.
+				unix.SYS_GETCPU:          {}, // SYS_GETCPU was not defined in package syscall on amd64.
 			},
 			Action:   linux.SECCOMP_RET_TRAP,
 			Vsyscall: true,
@@ -173,10 +153,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 
 				// For the initial process creation.
 				syscall.SYS_WAIT4: {},
-				syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-					{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
-				},
-				syscall.SYS_EXIT: {},
+				syscall.SYS_EXIT:  {},
 
 				// For the stub prctl dance (all).
 				syscall.SYS_PRCTL: []seccomp.Rule{
@@ -196,6 +173,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 			},
 			Action: linux.SECCOMP_RET_ALLOW,
 		})
+
+		rules = appendArchSeccompRules(rules)
 	}
 	instrs, err := seccomp.BuildProgram(rules, defaultAction)
 	if err != nil {
@@ -267,9 +246,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
 		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
 	}
 
-	// Enable cpuid-faulting; this may fail on older kernels or hardware,
-	// so we just disregard the result. Host CPUID will be enabled.
-	syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0)
+	// Enable cpuid-faulting.
+	enableCpuidFault()
 
 	// Call the stub; should not return.
 	stubCall(stubStart, ppid)
diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go
index 2f65db70b..ba1f9043d 100644
--- a/pkg/sentry/sighandling/sighandling.go
+++ b/pkg/sentry/sighandling/sighandling.go
@@ -16,7 +16,6 @@
 package sighandling
 
 import (
-	"fmt"
 	"os"
 	"os/signal"
 	"reflect"
@@ -31,37 +30,25 @@ const numSignals = 32
 // handleSignals listens for incoming signals and calls the given handler
 // function.
 //
-// It starts when the start channel is closed, stops when the stop channel
-// is closed, and closes done once it will no longer deliver signals to k.
-func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, stop, done chan struct{}) {
+// It stops when the stop channel is closed. The done channel is closed once it
+// will no longer deliver signals to k.
+func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), stop, done chan struct{}) {
 	// Build a select case.
-	sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}}
+	sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}}
 	for _, sigchan := range sigchans {
 		sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)})
 	}
 
-	started := false
 	for {
 		// Wait for a notification.
 		index, _, ok := reflect.Select(sc)
 
-		// Was it the start / stop channel?
+		// Was it the stop channel?
 		if index == 0 {
 			if !ok {
-				if !started {
-					// start channel; start forwarding and
-					// swap this case for the stop channel
-					// to select stop requests.
-					started = true
-					sc[0] = reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}
-				} else {
-					// stop channel; stop forwarding and
-					// clear this case so it is never
-					// selected again.
-					started = false
-					close(done)
-					sc[0].Chan = reflect.Value{}
-				}
+				// Stop forwarding and notify that it's done.
+				close(done)
+				return
 			}
 			continue
 		}
@@ -73,44 +60,17 @@ func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start,
 
 		// Otherwise, it was a signal on channel N. Index 0 represents the stop
 		// channel, so index N represents the channel for signal N.
-		signal := linux.Signal(index)
-
-		if !started {
-			// Kernel cannot receive signals, either because it is
-			// not ready yet or is shutting down.
-			//
-			// Kill ourselves if this signal would have killed the
-			// process before PrepareForwarding was called. i.e., all
-			// _SigKill signals; see Go
-			// src/runtime/sigtab_linux_generic.go.
-			//
-			// Otherwise ignore the signal.
-			//
-			// TODO(b/114489875): Drop in Go 1.12, which uses tgkill
-			// in runtime.raise.
-			switch signal {
-			case linux.SIGHUP, linux.SIGINT, linux.SIGTERM:
-				dieFromSignal(signal)
-				panic(fmt.Sprintf("Failed to die from signal %d", signal))
-			default:
-				continue
-			}
-		}
-
-		// Pass the signal to the handler.
-		handler(signal)
+		handler(linux.Signal(index))
 	}
 }
 
-// PrepareHandler ensures that synchronous signals are passed to the given
-// handler function and returns a callback that starts signal delivery, which
-// itself returns a callback that stops signal handling.
+// StartSignalForwarding ensures that synchronous signals are passed to the
+// given handler function and returns a callback that stops signal delivery.
 //
 // Note that this function permanently takes over signal handling. After the
 // stop callback, signals revert to the default Go runtime behavior, which
 // cannot be overridden with external calls to signal.Notify.
-func PrepareHandler(handler func(linux.Signal)) func() func() {
-	start := make(chan struct{})
+func StartSignalForwarding(handler func(linux.Signal)) func() {
 	stop := make(chan struct{})
 	done := make(chan struct{})
 
@@ -128,13 +88,10 @@ func PrepareHandler(handler func(linux.Signal)) func() func() {
 		signal.Notify(sigchan, syscall.Signal(sig))
 	}
 	// Start up our listener.
-	go handleSignals(sigchans, handler, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu.
+	go handleSignals(sigchans, handler, stop, done) // S/R-SAFE: synchronized by Kernel.extMu.
 
-	return func() func() {
-		close(start)
-		return func() {
-			close(stop)
-			<-done
-		}
+	return func() {
+		close(stop)
+		<-done
 	}
 }
diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go
index c303435d5..1ebe22d34 100644
--- a/pkg/sentry/sighandling/sighandling_unsafe.go
+++ b/pkg/sentry/sighandling/sighandling_unsafe.go
@@ -15,8 +15,6 @@
 package sighandling
 
 import (
-	"fmt"
-	"runtime"
 	"syscall"
 	"unsafe"
 
@@ -48,27 +46,3 @@ func IgnoreChildStop() error {
 
 	return nil
 }
-
-// dieFromSignal kills the current process with sig.
-//
-// Preconditions: The default action of sig is termination.
-func dieFromSignal(sig linux.Signal) {
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-
-	sa := sigaction{handler: linux.SIG_DFL}
-	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 {
-		panic(fmt.Sprintf("rt_sigaction failed: %v", e))
-	}
-
-	set := linux.MakeSignalSet(sig)
-	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0); e != 0 {
-		panic(fmt.Sprintf("rt_sigprocmask failed: %v", e))
-	}
-
-	if err := syscall.Tgkill(syscall.Getpid(), syscall.Gettid(), syscall.Signal(sig)); err != nil {
-		panic(fmt.Sprintf("tgkill failed: %v", err))
-	}
-
-	panic("failed to die")
-}
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 782a3cb92..af1a4e95f 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -195,15 +195,15 @@ func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([
 	// the available space, we must align down.
 	//
 	// align must be >= 4 and each data int32 is 4 bytes. The length of the
-	// header is already aligned, so if we align to the with of the data there
+	// header is already aligned, so if we align to the width of the data there
 	// are two cases:
 	// 1. The aligned length is less than the length of the header. The
 	// unaligned length was also less than the length of the header, so we
 	// can't write anything.
 	// 2. The aligned length is greater than or equal to the length of the
-	// header. We can write the header plus zero or more datas. We can't write
-	// a partial int32, so the length of the message will be
-	// min(aligned length, header + datas).
+	// header. We can write the header plus zero or more bytes of data. We can't
+	// write a partial int32, so the length of the message will be
+	// min(aligned length, header + data).
 	if space < linux.SizeOfControlMessageHeader {
 		flags |= linux.MSG_CTRUNC
 		return buf, flags
@@ -240,12 +240,12 @@ func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interf
 
 	buf = binary.Marshal(buf, usermem.ByteOrder, data)
 
-	// Check if we went over.
+	// If the control message data brought us over capacity, omit it.
 	if cap(buf) != cap(ob) {
 		return hdrBuf
 	}
 
-	// Fix up length.
+	// Update control message length to include data.
 	putUint64(ob, uint64(len(buf)-len(ob)))
 
 	return alignSlice(buf, align)
@@ -348,43 +348,62 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
 	)
 }
 
-func addSpaceForCmsg(cmsgDataLen int, buf []byte) []byte {
-	newBuf := make([]byte, 0, len(buf)+linux.SizeOfControlMessageHeader+cmsgDataLen)
-	return append(newBuf, buf...)
-}
-
-// PackControlMessages converts the given ControlMessages struct into a buffer.
+// PackControlMessages packs control messages into the given buffer.
+//
 // We skip control messages specific to Unix domain sockets.
-func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages) []byte {
-	var buf []byte
-	// The use of t.Arch().Width() is analogous to Linux's use of sizeof(long) in
-	// CMSG_ALIGN.
-	width := t.Arch().Width()
-
+//
+// Note that some control messages may be truncated if they do not fit under
+// the capacity of buf.
+func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byte) []byte {
 	if cmsgs.IP.HasTimestamp {
-		buf = addSpaceForCmsg(int(width), buf)
 		buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf)
 	}
 
 	if cmsgs.IP.HasInq {
 		// In Linux, TCP_CM_INQ is added after SO_TIMESTAMP.
-		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageInq, width), buf)
 		buf = PackInq(t, cmsgs.IP.Inq, buf)
 	}
 
 	if cmsgs.IP.HasTOS {
-		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageTOS, width), buf)
 		buf = PackTOS(t, cmsgs.IP.TOS, buf)
 	}
 
 	if cmsgs.IP.HasTClass {
-		buf = addSpaceForCmsg(AlignUp(linux.SizeOfControlMessageTClass, width), buf)
 		buf = PackTClass(t, cmsgs.IP.TClass, buf)
 	}
 
 	return buf
 }
 
+// cmsgSpace is equivalent to CMSG_SPACE in Linux.
+func cmsgSpace(t *kernel.Task, dataLen int) int {
+	return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width())
+}
+
+// CmsgsSpace returns the number of bytes needed to fit the control messages
+// represented in cmsgs.
+func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int {
+	space := 0
+
+	if cmsgs.IP.HasTimestamp {
+		space += cmsgSpace(t, linux.SizeOfTimeval)
+	}
+
+	if cmsgs.IP.HasInq {
+		space += cmsgSpace(t, linux.SizeOfControlMessageInq)
+	}
+
+	if cmsgs.IP.HasTOS {
+		space += cmsgSpace(t, linux.SizeOfControlMessageTOS)
+	}
+
+	if cmsgs.IP.HasTClass {
+		space += cmsgSpace(t, linux.SizeOfControlMessageTClass)
+	}
+
+	return space
+}
+
 // Parse parses a raw socket control message into portable objects.
 func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
 	var (
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index a8c152b54..c957b0f1d 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -45,7 +45,7 @@ const (
 	sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in)
 
 	// maxControlLen is the maximum size of a control message buffer used in a
-	// recvmsg syscall.
+	// recvmsg or sendmsg syscall.
 	maxControlLen = 1024
 )
 
@@ -412,9 +412,12 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			msg.Namelen = uint32(len(senderAddrBuf))
 		}
 		if controlLen > 0 {
-			controlBuf = make([]byte, maxControlLen)
+			if controlLen > maxControlLen {
+				controlLen = maxControlLen
+			}
+			controlBuf = make([]byte, controlLen)
 			msg.Control = &controlBuf[0]
-			msg.Controllen = maxControlLen
+			msg.Controllen = controlLen
 		}
 		n, err := recvmsg(s.fd, &msg, sysflags)
 		if err != nil {
@@ -489,7 +492,14 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 		return 0, syserr.ErrInvalidArgument
 	}
 
-	controlBuf := control.PackControlMessages(t, controlMessages)
+	space := uint64(control.CmsgsSpace(t, controlMessages))
+	if space > maxControlLen {
+		space = maxControlLen
+	}
+	controlBuf := make([]byte, 0, space)
+	// PackControlMessages will append up to space bytes to controlBuf.
+	controlBuf = control.PackControlMessages(t, controlMessages, controlBuf)
+
 	sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) {
 		// Refuse to do anything if any part of src.Addrs was unusable.
 		if uint64(src.NumBytes()) != srcs.NumBytes() {
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index d92399efd..140851c17 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -151,6 +151,8 @@ var Metrics = tcpip.Stats{
 		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
 		CurrentEstablished:                 mustCreateMetric("/netstack/tcp/current_established", "Number of connections in either ESTABLISHED or CLOSE-WAIT state now."),
 		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
+		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "number of times established TCP connections made a transition to CLOSED state."),
+		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
 		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
 		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
 		ListenOverflowSynCookieSent:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
@@ -324,7 +326,7 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress,
 	}
 
 	family := usermem.ByteOrder.Uint16(addr)
-	if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
+	if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) {
 		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
 	}
 
@@ -1125,6 +1127,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_USER_TIMEOUT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPUserTimeoutOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Millisecond), nil
+
 	case linux.TCP_INFO:
 		var v tcpip.TCPInfoOption
 		if err := ep.GetSockOpt(&v); err != nil {
@@ -1561,6 +1575,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_USER_TIMEOUT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		if v < 0 {
+			return syserr.ErrInvalidArgument
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))))
+
 	case linux.TCP_CONGESTION:
 		v := tcpip.CongestionControlOption(optVal)
 		if err := ep.SetSockOpt(v); err != nil {
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 8c250c325..2389a9cdb 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -43,6 +43,11 @@ type ControlMessages struct {
 	IP   tcpip.ControlMessages
 }
 
+// Release releases Unix domain socket credentials and rights.
+func (c *ControlMessages) Release() {
+	c.Unix.Release()
+}
+
 // Socket is the interface containing socket syscalls used by the syscall layer
 // to redirect them to the appropriate implementation.
 type Socket interface {
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 1aaae8487..885758054 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -118,6 +118,9 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
 func extractPath(sockaddr []byte) (string, *syserr.Error) {
 	addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
 	if err != nil {
+		if err == syserr.ErrAddressFamilyNotSupported {
+			err = syserr.ErrInvalidArgument
+		}
 		return "", err
 	}
 
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 72ebf766d..d46421199 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -14,6 +14,7 @@ go_library(
         "open.go",
         "poll.go",
         "ptrace.go",
+        "select.go",
         "signal.go",
         "socket.go",
         "strace.go",
diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go
index 5d57b75af..e603f858f 100644
--- a/pkg/sentry/strace/linux64.go
+++ b/pkg/sentry/strace/linux64.go
@@ -40,7 +40,7 @@ var linuxAMD64 = SyscallMap{
 	20:  makeSyscallInfo("writev", FD, WriteIOVec, Hex),
 	21:  makeSyscallInfo("access", Path, Oct),
 	22:  makeSyscallInfo("pipe", PipeFDs),
-	23:  makeSyscallInfo("select", Hex, Hex, Hex, Hex, Timeval),
+	23:  makeSyscallInfo("select", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timeval),
 	24:  makeSyscallInfo("sched_yield"),
 	25:  makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex),
 	26:  makeSyscallInfo("msync", Hex, Hex, Hex),
@@ -287,7 +287,7 @@ var linuxAMD64 = SyscallMap{
 	267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex),
 	268: makeSyscallInfo("fchmodat", FD, Path, Mode),
 	269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex),
-	270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex),
+	270: makeSyscallInfo("pselect6", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timespec, SigSet),
 	271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex),
 	272: makeSyscallInfo("unshare", CloneFlags),
 	273: makeSyscallInfo("set_robust_list", Hex, Hex),
@@ -335,5 +335,33 @@ var linuxAMD64 = SyscallMap{
 	315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
 	316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex),
 	317: makeSyscallInfo("seccomp", Hex, Hex, Hex),
+	318: makeSyscallInfo("getrandom", Hex, Hex, Hex),
+	319: makeSyscallInfo("memfd_create", Path, Hex), // Not quite a path, but close.
+	320: makeSyscallInfo("kexec_file_load", FD, FD, Hex, Hex, Hex),
+	321: makeSyscallInfo("bpf", Hex, Hex, Hex),
+	322: makeSyscallInfo("execveat", FD, Path, ExecveStringVector, ExecveStringVector, Hex),
+	323: makeSyscallInfo("userfaultfd", Hex),
+	324: makeSyscallInfo("membarrier", Hex, Hex),
+	325: makeSyscallInfo("mlock2", Hex, Hex, Hex),
+	326: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex),
+	327: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex),
+	328: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex),
+	329: makeSyscallInfo("pkey_mprotect", Hex, Hex, Hex, Hex),
+	330: makeSyscallInfo("pkey_alloc", Hex, Hex),
+	331: makeSyscallInfo("pkey_free", Hex),
 	332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex),
+	333: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet),
+	334: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex),
+	424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex),
+	425: makeSyscallInfo("io_uring_setup", Hex, Hex),
+	426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex),
+	427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex),
+	428: makeSyscallInfo("open_tree", FD, Path, Hex),
+	429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex),
+	430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close.
+	431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex),
+	432: makeSyscallInfo("fsmount", FD, Hex, Hex),
+	433: makeSyscallInfo("fspick", FD, Path, Hex),
+	434: makeSyscallInfo("pidfd_open", Hex, Hex),
+	435: makeSyscallInfo("clone3", Hex, Hex),
 }
diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go
new file mode 100644
index 000000000..dea309fda
--- /dev/null
+++ b/pkg/sentry/strace/select.go
@@ -0,0 +1,53 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package strace
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+func fdsFromSet(t *kernel.Task, set []byte) []int {
+	var fds []int
+	// Append n if the n-th bit is 1.
+	for i, v := range set {
+		for j := 0; j < 8; j++ {
+			if (v>>j)&1 == 1 {
+				fds = append(fds, i*8+j)
+			}
+		}
+	}
+	return fds
+}
+
+func fdSet(t *kernel.Task, nfds int, addr usermem.Addr) string {
+	if addr == 0 {
+		return "null"
+	}
+
+	// Calculate the size of the fd set (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := nfds % 8
+
+	set, err := linux.CopyInFDSet(t, addr, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return fmt.Sprintf("%#x (error decoding fdset: %s)", addr, err)
+	}
+
+	return fmt.Sprintf("%#x %v", addr, fdsFromSet(t, set))
+}
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 311389547..629c1f308 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -439,6 +439,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
 			output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer()))
 		case PollFDs:
 			output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false))
+		case SelectFDSet:
+			output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer()))
 		case Oct:
 			output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8))
 		case Hex:
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index 3c389d375..e5d486c4e 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -206,6 +206,10 @@ const (
 	// PollFDs is an array of struct pollfd. The number of entries in the
 	// array is in the next argument.
 	PollFDs
+
+	// SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of
+	// fds represented must be the first argument.
+	SelectFDSet
 )
 
 // defaultFormat is the syscall argument format to use if the actual format is
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 4c0bf96e4..6766ba587 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -49,6 +49,7 @@ go_library(
         "sys_tls.go",
         "sys_utsname.go",
         "sys_write.go",
+        "sys_xattr.go",
         "timespec.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux",
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 81e4f93a6..272ae9991 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,10 +228,10 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
 		189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
 		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
@@ -260,7 +260,7 @@ var AMD64 = &kernel.SyscallTable{
 		217: syscalls.Supported("getdents64", Getdents64),
 		218: syscalls.Supported("set_tid_address", SetTidAddress),
 		219: syscalls.Supported("restart_syscall", RestartSyscall),
-		220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+		220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
 		221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
 		222: syscalls.Supported("timer_create", TimerCreate),
 		223: syscalls.Supported("timer_settime", TimerSettime),
@@ -367,11 +367,31 @@ var AMD64 = &kernel.SyscallTable{
 		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(gvisor.dev/issue/267)
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
 
-		// Syscalls after 325 are "backports" from versions of Linux after 4.4.
+		// Syscalls implemented after 325 are "backports" from versions
+		// of Linux after 4.4.
 		326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
 		327: syscalls.Supported("preadv2", Preadv2),
 		328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+		329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+		330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+		331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
 		332: syscalls.Supported("statx", Statx),
+		333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+		334: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+
+		// Linux skips ahead to syscall 424 to sync numbers between arches.
+		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
 	},
 
 	Emulate: map[usermem.Addr]uintptr{
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index f1dd4b0c0..3b584eed9 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -41,10 +41,10 @@ var ARM64 = &kernel.SyscallTable{
 		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		5:   syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
 		6:   syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		7:   syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		8:   syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
 		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
@@ -224,7 +224,7 @@ var ARM64 = &kernel.SyscallTable{
 		189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}),          // TODO(b/29354921)
 		190: syscalls.Supported("semget", Semget),
 		191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
-		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+		192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
 		193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
 		194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
 		195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
@@ -302,7 +302,26 @@ var ARM64 = &kernel.SyscallTable{
 		285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
 		286: syscalls.Supported("preadv2", Preadv2),
 		287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+		288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+		289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+		290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
 		291: syscalls.Supported("statx", Statx),
+		292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+		293: syscalls.ErrorWithEvent("rseq", syserror.ENOSYS, "", nil),
+
+		// Linux skips ahead to syscall 424 to sync numbers between arches.
+		424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+		425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+		426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+		427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+		428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+		429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+		430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+		431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+		432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+		433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+		434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+		435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
 	},
 	Emulate: map[usermem.Addr]uintptr{},
 
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 3b9181002..9bc2445a5 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -840,25 +840,42 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	return uintptr(newfd), nil, nil
 }
 
-func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx {
 	ma := file.Async(nil)
 	if ma == nil {
-		return 0
+		return linux.FOwnerEx{}
 	}
 	a := ma.(*fasync.FileAsync)
 	ot, otg, opg := a.Owner()
 	switch {
 	case ot != nil:
-		return int32(t.PIDNamespace().IDOfTask(ot))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_TID,
+			PID:  int32(t.PIDNamespace().IDOfTask(ot)),
+		}
 	case otg != nil:
-		return int32(t.PIDNamespace().IDOfThreadGroup(otg))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_PID,
+			PID:  int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+		}
 	case opg != nil:
-		return int32(-t.PIDNamespace().IDOfProcessGroup(opg))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_PGRP,
+			PID:  int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+		}
 	default:
-		return 0
+		return linux.FOwnerEx{}
 	}
 }
 
+func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+	owner := fGetOwnEx(t, file)
+	if owner.Type == linux.F_OWNER_PGRP {
+		return -owner.PID
+	}
+	return owner.PID
+}
+
 // fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
 //
 // If who is positive, it represents a PID. If negative, it represents a PGID.
@@ -901,11 +918,13 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		t.FDTable().SetFlags(fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
+		return 0, nil, nil
 	case linux.F_GETFL:
 		return uintptr(file.Flags().ToLinux()), nil, nil
 	case linux.F_SETFL:
 		flags := uint(args[2].Uint())
 		file.SetFlags(linuxToFlags(flags).Settable())
+		return 0, nil, nil
 	case linux.F_SETLK, linux.F_SETLKW:
 		// In Linux the file system can choose to provide lock operations for an inode.
 		// Normally pipe and socket types lack lock operations. We diverge and use a heavy
@@ -1008,6 +1027,44 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.F_SETOWN:
 		fSetOwn(t, file, args[2].Int())
 		return 0, nil, nil
+	case linux.F_GETOWN_EX:
+		addr := args[2].Pointer()
+		owner := fGetOwnEx(t, file)
+		_, err := t.CopyOut(addr, &owner)
+		return 0, nil, err
+	case linux.F_SETOWN_EX:
+		addr := args[2].Pointer()
+		var owner linux.FOwnerEx
+		n, err := t.CopyIn(addr, &owner)
+		if err != nil {
+			return 0, nil, err
+		}
+		a := file.Async(fasync.New).(*fasync.FileAsync)
+		switch owner.Type {
+		case linux.F_OWNER_TID:
+			task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
+			if task == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerTask(t, task)
+			return uintptr(n), nil, nil
+		case linux.F_OWNER_PID:
+			tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
+			if tg == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerThreadGroup(t, tg)
+			return uintptr(n), nil, nil
+		case linux.F_OWNER_PGRP:
+			pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
+			if pg == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerProcessGroup(t, pg)
+			return uintptr(n), nil, nil
+		default:
+			return 0, nil, syserror.EINVAL
+		}
 	case linux.F_GET_SEALS:
 		val, err := tmpfs.GetSeals(file.Dirent.Inode)
 		return uintptr(val), nil, err
@@ -1035,7 +1092,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		// Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
 	}
-	return 0, nil, nil
 }
 
 const (
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 7a13beac2..2b2df989a 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -197,53 +197,51 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
 	return remainingTimeout, n, err
 }
 
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+	set := make([]byte, nBytes)
+
+	if addr != 0 {
+		if _, err := t.CopyIn(addr, &set); err != nil {
+			return nil, err
+		}
+		// If we only use part of the last byte, mask out the extraneous bits.
+		//
+		// N.B. This only works on little-endian architectures.
+		if nBitsInLastPartialByte != 0 {
+			set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+		}
+	}
+	return set, nil
+}
+
 func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
 	if nfds < 0 || nfds > fileCap {
 		return 0, syserror.EINVAL
 	}
 
-	// Capture all the provided input vectors.
-	//
-	// N.B. This only works on little-endian architectures.
-	byteCount := (nfds + 7) / 8
-
-	bitsInLastPartialByte := uint(nfds % 8)
-	r := make([]byte, byteCount)
-	w := make([]byte, byteCount)
-	e := make([]byte, byteCount)
+	// Calculate the size of the fd sets (one bit per fd).
+	nBytes := (nfds + 7) / 8
+	nBitsInLastPartialByte := nfds % 8
 
-	if readFDs != 0 {
-		if _, err := t.CopyIn(readFDs, &r); err != nil {
-			return 0, err
-		}
-		// Mask out bits above nfds.
-		if bitsInLastPartialByte != 0 {
-			r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
-		}
+	// Capture all the provided input vectors.
+	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
 	}
-
-	if writeFDs != 0 {
-		if _, err := t.CopyIn(writeFDs, &w); err != nil {
-			return 0, err
-		}
-		if bitsInLastPartialByte != 0 {
-			w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
-		}
+	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
 	}
-
-	if exceptFDs != 0 {
-		if _, err := t.CopyIn(exceptFDs, &e); err != nil {
-			return 0, err
-		}
-		if bitsInLastPartialByte != 0 {
-			e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
-		}
+	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+	if err != nil {
+		return 0, err
 	}
 
 	// Count how many FDs are actually being requested so that we can build
 	// a PollFD array.
 	fdCount := 0
-	for i := 0; i < byteCount; i++ {
+	for i := 0; i < nBytes; i++ {
 		v := r[i] | w[i] | e[i]
 		for v != 0 {
 			v &= (v - 1)
@@ -254,7 +252,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
 	// Build the PollFD array.
 	pfd := make([]linux.PollFD, 0, fdCount)
 	var fd int32
-	for i := 0; i < byteCount; i++ {
+	for i := 0; i < nBytes; i++ {
 		rV, wV, eV := r[i], w[i], e[i]
 		v := rV | wV | eV
 		m := byte(1)
@@ -295,8 +293,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
 	}
 
 	// Do the syscall, then count the number of bits set.
-	_, _, err := pollBlock(t, pfd, timeout)
-	if err != nil {
+	if _, _, err = pollBlock(t, pfd, timeout); err != nil {
 		return 0, syserror.ConvertIntr(err, syserror.EINTR)
 	}
 
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index d8acae063..4b5aafcc0 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -764,7 +764,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 		}
 		if !cms.Unix.Empty() {
 			mflags |= linux.MSG_CTRUNC
-			cms.Unix.Release()
+			cms.Release()
 		}
 
 		if int(msg.Flags) != mflags {
@@ -784,32 +784,16 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
 	if e != nil {
 		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
 	}
-	defer cms.Unix.Release()
+	defer cms.Release()
 
 	controlData := make([]byte, 0, msg.ControlLen)
+	controlData = control.PackControlMessages(t, cms, controlData)
 
 	if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
 		creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
 		controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
 	}
 
-	if cms.IP.HasTimestamp {
-		controlData = control.PackTimestamp(t, cms.IP.Timestamp, controlData)
-	}
-
-	if cms.IP.HasInq {
-		// In Linux, TCP_CM_INQ is added after SO_TIMESTAMP.
-		controlData = control.PackInq(t, cms.IP.Inq, controlData)
-	}
-
-	if cms.IP.HasTOS {
-		controlData = control.PackTOS(t, cms.IP.TOS, controlData)
-	}
-
-	if cms.IP.HasTClass {
-		controlData = control.PackTClass(t, cms.IP.TClass, controlData)
-	}
-
 	if cms.Unix.Rights != nil {
 		controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
 	}
@@ -885,7 +869,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
 	}
 
 	n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
-	cm.Unix.Release()
+	cm.Release()
 	if e != nil {
 		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
 	}
@@ -1071,7 +1055,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
 	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
 	err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
 	if err != nil {
-		controlMessages.Unix.Release()
+		controlMessages.Release()
 	}
 	return uintptr(n), err
 }
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
new file mode 100644
index 000000000..97d9a65ea
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -0,0 +1,169 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getxattr implements linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	valueLen := 0
+	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		value, err := getxattr(t, d, dirPath, nameAddr)
+		if err != nil {
+			return err
+		}
+
+		valueLen = len(value)
+		if size == 0 {
+			return nil
+		}
+		if size > linux.XATTR_SIZE_MAX {
+			size = linux.XATTR_SIZE_MAX
+		}
+		if valueLen > int(size) {
+			return syserror.ERANGE
+		}
+
+		_, err = t.CopyOutBytes(valueAddr, []byte(value))
+		return err
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(valueLen), nil, nil
+}
+
+// getxattr implements getxattr from the given *fs.Dirent.
+func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) {
+	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+		return "", syserror.ENOTDIR
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
+		return "", err
+	}
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return "", err
+	}
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.Getxattr(name)
+}
+
+// Setxattr implements linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Uint()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags)
+	})
+}
+
+// setxattr implements setxattr from the given *fs.Dirent.
+func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error {
+	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+		return syserror.ENOTDIR
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+		return err
+	}
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	if size > linux.XATTR_SIZE_MAX {
+		return syserror.E2BIG
+	}
+	buf := make([]byte, size)
+	if _, err = t.CopyInBytes(valueAddr, buf); err != nil {
+		return err
+	}
+	value := string(buf)
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.Setxattr(name, value)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	if err != nil {
+		if err == syserror.ENAMETOOLONG {
+			return "", syserror.ERANGE
+		}
+		return "", err
+	}
+	if len(name) == 0 {
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
+	// Restrict xattrs to regular files and directories.
+	//
+	// In Linux, this restriction technically only applies to xattrs in the
+	// "user.*" namespace, but we don't allow any other xattr prefixes anyway.
+	if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) {
+		if perms.Write {
+			return syserror.EPERM
+		}
+		return syserror.ENODATA
+	}
+
+	return i.CheckPermission(t, perms)
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 74a325309..e3e554b88 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -17,9 +17,9 @@ go_library(
         "mount.go",
         "mount_unsafe.go",
         "options.go",
+        "pathname.go",
         "permissions.go",
         "resolving_path.go",
-        "syscalls.go",
         "testutil.go",
         "vfs.go",
     ],
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index 32cf9151b..705194ebc 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -24,6 +24,9 @@ type contextID int
 const (
 	// CtxMountNamespace is a Context.Value key for a MountNamespace.
 	CtxMountNamespace contextID = iota
+
+	// CtxRoot is a Context.Value key for a VFS root.
+	CtxRoot
 )
 
 // MountNamespaceFromContext returns the MountNamespace used by ctx. It does
@@ -35,3 +38,13 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
 	}
 	return nil
 }
+
+// RootFromContext returns the VFS root used by ctx. It takes a reference on
+// the returned VirtualDentry. If ctx does not have a specific VFS root,
+// RootFromContext returns a zero-value VirtualDentry.
+func RootFromContext(ctx context.Context) VirtualDentry {
+	if v := ctx.Value(CtxRoot); v != nil {
+		return v.(VirtualDentry)
+	}
+	return VirtualDentry{}
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 34007eb57..6575afd16 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -241,3 +241,140 @@ type IterDirentsCallback interface {
 	// called.
 	Handle(dirent Dirent) bool
 }
+
+// OnClose is called when a file descriptor representing the FileDescription is
+// closed. Returning a non-nil error should not prevent the file descriptor
+// from being closed.
+func (fd *FileDescription) OnClose(ctx context.Context) error {
+	return fd.impl.OnClose(ctx)
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+	flags, err := fd.impl.StatusFlags(ctx)
+	flags |= linux.O_LARGEFILE
+	return flags, err
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+	return fd.impl.SetStatusFlags(ctx, flags)
+}
+
+// Stat returns metadata for the file represented by fd.
+func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	return fd.impl.Stat(ctx, opts)
+}
+
+// SetStat updates metadata for the file represented by fd.
+func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+	return fd.impl.SetStat(ctx, opts)
+}
+
+// StatFS returns metadata for the filesystem containing the file represented
+// by fd.
+func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return fd.impl.StatFS(ctx)
+}
+
+// PRead reads from the file represented by fd into dst, starting at the given
+// offset, and returns the number of bytes read. PRead is permitted to return
+// partial reads with a nil error.
+func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return fd.impl.PRead(ctx, dst, offset, opts)
+}
+
+// Read is similar to PRead, but does not specify an offset.
+func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	return fd.impl.Read(ctx, dst, opts)
+}
+
+// PWrite writes src to the file represented by fd, starting at the given
+// offset, and returns the number of bytes written. PWrite is permitted to
+// return partial writes with a nil error.
+func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return fd.impl.PWrite(ctx, src, offset, opts)
+}
+
+// Write is similar to PWrite, but does not specify an offset.
+func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return fd.impl.Write(ctx, src, opts)
+}
+
+// IterDirents invokes cb on each entry in the directory represented by fd. If
+// IterDirents has been called since the last call to Seek, it continues
+// iteration from the end of the last call.
+func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
+	return fd.impl.IterDirents(ctx, cb)
+}
+
+// Seek changes fd's offset (assuming one exists) and returns its new value.
+func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.impl.Seek(ctx, offset, whence)
+}
+
+// Sync has the semantics of fsync(2).
+func (fd *FileDescription) Sync(ctx context.Context) error {
+	return fd.impl.Sync(ctx)
+}
+
+// ConfigureMMap mutates opts to implement mmap(2) for the file represented by
+// fd.
+func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.impl.ConfigureMMap(ctx, opts)
+}
+
+// Ioctl implements the ioctl(2) syscall.
+func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return fd.impl.Ioctl(ctx, uio, args)
+}
+
+// SyncFS instructs the filesystem containing fd to execute the semantics of
+// syncfs(2).
+func (fd *FileDescription) SyncFS(ctx context.Context) error {
+	return fd.vd.mount.fs.impl.Sync(ctx)
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (fd *FileDescription) MappedName(ctx context.Context) string {
+	vfsroot := RootFromContext(ctx)
+	s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
+	if vfsroot.Ok() {
+		vfsroot.DecRef()
+	}
+	return s
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (fd *FileDescription) DeviceID() uint64 {
+	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+		// There is no STATX_DEV; we assume that Stat will return it if it's
+		// available regardless of mask.
+		Mask: 0,
+		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev
+		// directly.
+		Sync: linux.AT_STATX_DONT_SYNC,
+	})
+	if err != nil {
+		return 0
+	}
+	return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor))
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (fd *FileDescription) InodeID() uint64 {
+	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+		Mask: linux.STATX_INO,
+		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
+		Sync: linux.AT_STATX_DONT_SYNC,
+	})
+	if err != nil || stat.Mask&linux.STATX_INO == 0 {
+		return 0
+	}
+	return stat.Ino
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
+	return fd.impl.Sync(ctx)
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 4fbad7840..aae023254 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -252,3 +252,12 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6
 	fd.off = offset
 	return offset, nil
 }
+
+// GenericConfigureMMap may be used by most implementations of
+// FileDescriptionImpl.ConfigureMMap.
+func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error {
+	opts.Mappable = m
+	opts.MappingIdentity = fd
+	fd.IncRef()
+	return nil
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index a5561dcbe..ac7799296 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -103,7 +103,7 @@ func TestGenCountFD(t *testing.T) {
 	// The first read causes Generate to be called to fill the FD's buffer.
 	buf := make([]byte, 2)
 	ioseq := usermem.BytesIOSequence(buf)
-	n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err := fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
@@ -112,17 +112,17 @@ func TestGenCountFD(t *testing.T) {
 	}
 
 	// A second read without seeking is still at EOF.
-	n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err = fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 0 || err != io.EOF {
 		t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err)
 	}
 
 	// Seeking to the beginning of the file causes it to be regenerated.
-	n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET)
+	n, err = fd.Seek(ctx, 0, linux.SEEK_SET)
 	if n != 0 || err != nil {
 		t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err)
 	}
-	n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err = fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
@@ -131,7 +131,7 @@ func TestGenCountFD(t *testing.T) {
 	}
 
 	// PRead at the beginning of the file also causes it to be regenerated.
-	n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{})
+	n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 76ff8cf51..8011eba3f 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 )
 
@@ -47,6 +48,9 @@ func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
 	fs.refs = 1
 	fs.vfs = vfsObj
 	fs.impl = impl
+	vfsObj.filesystemsMu.Lock()
+	vfsObj.filesystems[fs] = struct{}{}
+	vfsObj.filesystemsMu.Unlock()
 }
 
 // VirtualFilesystem returns the containing VirtualFilesystem.
@@ -66,9 +70,28 @@ func (fs *Filesystem) IncRef() {
 	}
 }
 
+// TryIncRef increments fs' reference count and returns true. If fs' reference
+// count is zero, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fs.
+func (fs *Filesystem) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&fs.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
 // DecRef decrements fs' reference count.
 func (fs *Filesystem) DecRef() {
 	if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+		fs.vfs.filesystemsMu.Lock()
+		delete(fs.vfs.filesystems, fs)
+		fs.vfs.filesystemsMu.Unlock()
 		fs.impl.Release()
 	} else if refs < 0 {
 		panic("Filesystem.decRef() called without holding a reference")
@@ -163,5 +186,56 @@ type FilesystemImpl interface {
 	// UnlinkAt removes the non-directory file at rp.
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
-	// TODO: d_path(); extended attributes; inotify_add_watch(); bind()
+	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
+	//
+	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
+	// before vd.Mount().Root(), PrependPath should stop prepending path
+	// components and return a PrependPathAtVFSRootError.
+	//
+	// If traversal of vd.Dentry()'s ancestors encounters an independent
+	// ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a
+	// descendant of vd.Mount().Root()), PrependPath should stop prepending
+	// path components and return a PrependPathAtNonMountRootError.
+	//
+	// Filesystems for which Dentries do not have meaningful paths may prepend
+	// an arbitrary descriptive string to b and then return a
+	// PrependPathSyntheticError.
+	//
+	// Most implementations can acquire the appropriate locks to ensure that
+	// Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of
+	// its ancestors, then call GenericPrependPath.
+	//
+	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
+	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
+
+	// TODO: extended attributes; inotify_add_watch(); bind()
+}
+
+// PrependPathAtVFSRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+type PrependPathAtVFSRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtVFSRootError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() reached VFS root"
+}
+
+// PrependPathAtNonMountRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter an independent ancestor
+// Dentry that is not the Mount root.
+type PrependPathAtNonMountRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtNonMountRootError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root"
+}
+
+// PrependPathSyntheticError is returned by implementations of
+// FilesystemImpl.PrependPath() for which prepended names do not represent real
+// paths.
+type PrependPathSyntheticError struct{}
+
+// Error implements error.Error.
+func (PrependPathSyntheticError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() prepended synthetic name"
 }
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 465e610e0..7315a588e 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,6 +16,8 @@ package vfs
 
 import (
 	"strings"
+
+	"gvisor.dev/gvisor/pkg/fspath"
 )
 
 // GenericParseMountOptions parses a comma-separated list of options of the
@@ -41,3 +43,27 @@ func GenericParseMountOptions(str string) map[string]string {
 	}
 	return m
 }
+
+// GenericPrependPath may be used by implementations of
+// FilesystemImpl.PrependPath() for which a single statically-determined lock
+// or set of locks is sufficient to ensure its preconditions (as opposed to
+// e.g. per-Dentry locks).
+//
+// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for
+// vd.Dentry() and all of its ancestors.
+func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+	mnt, d := vd.mount, vd.dentry
+	for {
+		if mnt == vfsroot.mount && d == vfsroot.dentry {
+			return PrependPathAtVFSRootError{}
+		}
+		if d == mnt.root {
+			return nil
+		}
+		if d.parent == nil {
+			return PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1c3b2e987..ec23ab0dd 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -18,6 +18,7 @@ import (
 	"math"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -133,13 +134,13 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	return mntns, nil
 }
 
-// NewMount creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error {
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
 	fsType := vfs.getFilesystemType(fsTypeName)
 	if fsType == nil {
 		return syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
 		return err
 	}
@@ -207,6 +208,68 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 	return nil
 }
 
+// UmountAt removes the Mount at the given path.
+func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
+	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
+		return syserror.EINVAL
+	}
+
+	// MNT_FORCE is currently unimplemented except for the permission check.
+	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
+		return syserror.EPERM
+	}
+
+	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	defer vd.DecRef()
+	if vd.dentry != vd.mount.root {
+		return syserror.EINVAL
+	}
+	vfs.mountMu.Lock()
+	if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
+		vfs.mountMu.Unlock()
+		return syserror.EINVAL
+	}
+
+	// TODO(jamieliu): Linux special-cases umount of the caller's root, which
+	// we don't implement yet (we'll just fail it since the caller holds a
+	// reference on it).
+
+	vfs.mounts.seq.BeginWrite()
+	if opts.Flags&linux.MNT_DETACH == 0 {
+		if len(vd.mount.children) != 0 {
+			vfs.mounts.seq.EndWrite()
+			vfs.mountMu.Unlock()
+			return syserror.EBUSY
+		}
+		// We are holding a reference on vd.mount.
+		expectedRefs := int64(1)
+		if !vd.mount.umounted {
+			expectedRefs = 2
+		}
+		if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB
+			vfs.mounts.seq.EndWrite()
+			vfs.mountMu.Unlock()
+			return syserror.EBUSY
+		}
+	}
+	vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{
+		eager:               opts.Flags&linux.MNT_DETACH == 0,
+		disconnectHierarchy: true,
+	}, nil, nil)
+	vfs.mounts.seq.EndWrite()
+	vfs.mountMu.Unlock()
+	for _, vd := range vdsToDecRef {
+		vd.DecRef()
+	}
+	for _, mnt := range mountsToDecRef {
+		mnt.DecRef()
+	}
+	return nil
+}
+
 type umountRecursiveOptions struct {
 	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
 	// on umounted mounts fail.
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3aa73d911..3ecbc8fc1 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -46,6 +46,12 @@ type MknodOptions struct {
 	DevMinor uint32
 }
 
+// MountOptions contains options to VirtualFilesystem.MountAt().
+type MountOptions struct {
+	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
+	GetFilesystemOptions GetFilesystemOptions
+}
+
 // OpenOptions contains options to VirtualFilesystem.OpenAt() and
 // FilesystemImpl.OpenAt().
 type OpenOptions struct {
@@ -114,6 +120,12 @@ type StatOptions struct {
 	Sync uint32
 }
 
+// UmountOptions contains options to VirtualFilesystem.UmountAt().
+type UmountOptions struct {
+	// Flags contains flags as specified for umount2(2).
+	Flags uint32
+}
+
 // WriteOptions contains options to FileDescription.PWrite(),
 // FileDescriptionImpl.PWrite(), FileDescription.Write(), and
 // FileDescriptionImpl.Write().
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
new file mode 100644
index 000000000..8e155654f
--- /dev/null
+++ b/pkg/sentry/vfs/pathname.go
@@ -0,0 +1,153 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+var fspathBuilderPool = sync.Pool{
+	New: func() interface{} {
+		return &fspath.Builder{}
+	},
+}
+
+func getFSPathBuilder() *fspath.Builder {
+	return fspathBuilderPool.Get().(*fspath.Builder)
+}
+
+func putFSPathBuilder(b *fspath.Builder) {
+	// No methods can be called on b after b.String(), so reset it to its zero
+	// value (as returned by fspathBuilderPool.New) instead.
+	*b = fspath.Builder{}
+	fspathBuilderPool.Put(b)
+}
+
+// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	b := getFSPathBuilder()
+	defer putFSPathBuilder(b)
+	haveRef := false
+	defer func() {
+		if haveRef {
+			vd.DecRef()
+		}
+	}()
+
+	origD := vd.dentry
+loop:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				// GenericPrependPath() will have returned
+				// PrependPathAtVFSRootError in this case since it checks
+				// against vfsroot before mnt.root, but other implementations
+				// of FilesystemImpl.PrependPath() may return nil instead.
+				break loop
+			}
+			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !nextVD.Ok() {
+				break loop
+			}
+			if haveRef {
+				vd.DecRef()
+			}
+			vd = nextVD
+			haveRef = true
+			// continue loop
+		case PrependPathSyntheticError:
+			// Skip prepending "/" and appending " (deleted)".
+			return b.String(), nil
+		case PrependPathAtVFSRootError, PrependPathAtNonMountRootError:
+			break loop
+		default:
+			return "", err
+		}
+	}
+	b.PrependByte('/')
+	if origD.IsDisowned() {
+		b.AppendString(" (deleted)")
+	}
+	return b.String(), nil
+}
+
+// PathnameForGetcwd returns an absolute pathname to vd, consistent with
+// Linux's sys_getcwd().
+func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	if vd.dentry.IsDisowned() {
+		return "", syserror.ENOENT
+	}
+
+	b := getFSPathBuilder()
+	defer putFSPathBuilder(b)
+	haveRef := false
+	defer func() {
+		if haveRef {
+			vd.DecRef()
+		}
+	}()
+	unreachable := false
+loop:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				break loop
+			}
+			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !nextVD.Ok() {
+				unreachable = true
+				break loop
+			}
+			if haveRef {
+				vd.DecRef()
+			}
+			vd = nextVD
+			haveRef = true
+		case PrependPathAtVFSRootError:
+			break loop
+		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+			unreachable = true
+			break loop
+		default:
+			return "", err
+		}
+	}
+	b.PrependByte('/')
+	if unreachable {
+		b.PrependString("(unreachable)")
+	}
+	return b.String(), nil
+}
+
+// As of this writing, we do not have equivalents to:
+//
+// - d_absolute_path(), which returns EINVAL if (effectively) any call to
+// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError.
+//
+// - dentry_path(), which does not walk up mounts (and only returns the path
+// relative to Filesystem root), but also appends "//deleted" for disowned
+// Dentries.
+//
+// These should be added as necessary.
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f8e74355c..f1edb0680 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -119,3 +119,65 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
 		return false
 	}
 }
+
+// CheckSetStat checks that creds has permission to change the metadata of a
+// file with the given permissions, UID, and GID as specified by stat, subject
+// to the rules of Linux's fs/attr.c:setattr_prepare().
+func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+	if stat.Mask&linux.STATX_MODE != 0 {
+		if !CanActAsOwner(creds, kuid) {
+			return syserror.EPERM
+		}
+		// TODO(b/30815691): "If the calling process is not privileged (Linux:
+		// does not have the CAP_FSETID capability), and the group of the file
+		// does not match the effective group ID of the process or one of its
+		// supplementary group IDs, the S_ISGID bit will be turned off, but
+		// this will not cause an error to be returned." - chmod(2)
+	}
+	if stat.Mask&linux.STATX_UID != 0 {
+		if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) ||
+			HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
+			return syserror.EPERM
+		}
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) ||
+			HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
+			return syserror.EPERM
+		}
+	}
+	if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 {
+		if !CanActAsOwner(creds, kuid) {
+			if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
+				(stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) ||
+				(stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
+				return syserror.EPERM
+			}
+			// isDir is irrelevant in the following call to
+			// GenericCheckPermissions since ats == MayWrite means that
+			// CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE
+			// applies, regardless of isDir.
+			if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// CanActAsOwner returns true if creds can act as the owner of a file with the
+// given owning UID, consistent with Linux's
+// fs/inode.c:inode_owner_or_capable().
+func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
+	if creds.EffectiveKUID == kuid {
+		return true
+	}
+	return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok()
+}
+
+// HasCapabilityOnFile returns true if creds has the given capability with
+// respect to a file with the given owning UID and GID, consistent with Linux's
+// kernel/capability.c:capable_wrt_inode_uidgid().
+func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
+	return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
+}
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
deleted file mode 100644
index 436151afa..000000000
--- a/pkg/sentry/vfs/syscalls.go
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// PathOperation specifies the path operated on by a VFS method.
-//
-// PathOperation is passed to VFS methods by pointer to reduce memory copying:
-// it's somewhat large and should never escape. (Options structs are passed by
-// pointer to VFS and FileDescription methods for the same reason.)
-type PathOperation struct {
-	// Root is the VFS root. References on Root are borrowed from the provider
-	// of the PathOperation.
-	//
-	// Invariants: Root.Ok().
-	Root VirtualDentry
-
-	// Start is the starting point for the path traversal. References on Start
-	// are borrowed from the provider of the PathOperation (i.e. the caller of
-	// the VFS method to which the PathOperation was passed).
-	//
-	// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
-	Start VirtualDentry
-
-	// Path is the pathname traversed by this operation.
-	Pathname string
-
-	// If FollowFinalSymlink is true, and the Dentry traversed by the final
-	// path component represents a symbolic link, the symbolic link should be
-	// followed.
-	FollowFinalSymlink bool
-}
-
-// GetDentryAt returns a VirtualDentry representing the given path, at which a
-// file must exist. A reference is taken on the returned VirtualDentry.
-func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return VirtualDentry{}, err
-	}
-	for {
-		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
-		if err == nil {
-			vd := VirtualDentry{
-				mount:  rp.mount,
-				dentry: d,
-			}
-			rp.mount.IncRef()
-			vfs.putResolvingPath(rp)
-			return vd, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return VirtualDentry{}, err
-		}
-	}
-}
-
-// MkdirAt creates a directory at the given path.
-func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
-	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
-	// also honored." - mkdir(2)
-	opts.Mode &= 01777
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
-	for {
-		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return err
-		}
-	}
-}
-
-// MknodAt creates a file of the given mode at the given path. It returns an
-// error from the syserror package.
-func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil
-	}
-	for {
-		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
-			vfs.putResolvingPath(rp)
-			return nil
-		}
-		// Handle mount traversals.
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return err
-		}
-	}
-}
-
-// OpenAt returns a FileDescription providing access to the file at the given
-// path. A reference is taken on the returned FileDescription.
-func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
-	// Remove:
-	//
-	// - O_LARGEFILE, which we always report in FileDescription status flags
-	// since only 64-bit architectures are supported at this time.
-	//
-	// - O_CLOEXEC, which affects file descriptors and therefore must be
-	// handled outside of VFS.
-	//
-	// - Unknown flags.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
-	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
-	if opts.Flags&linux.O_SYNC != 0 {
-		opts.Flags |= linux.O_DSYNC
-	}
-	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
-	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
-	// filesystem implementations that do not support it).
-	if opts.Flags&linux.O_TMPFILE != 0 {
-		if opts.Flags&linux.O_DIRECTORY == 0 {
-			return nil, syserror.EINVAL
-		}
-		if opts.Flags&linux.O_CREAT != 0 {
-			return nil, syserror.EINVAL
-		}
-		if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
-			return nil, syserror.EINVAL
-		}
-	}
-	// O_PATH causes most other flags to be ignored.
-	if opts.Flags&linux.O_PATH != 0 {
-		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
-	}
-	// "On Linux, the following bits are also honored in mode: [S_ISUID,
-	// S_ISGID, S_ISVTX]" - open(2)
-	opts.Mode &= 07777
-
-	if opts.Flags&linux.O_NOFOLLOW != 0 {
-		pop.FollowFinalSymlink = false
-	}
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil, err
-	}
-	if opts.Flags&linux.O_DIRECTORY != 0 {
-		rp.mustBeDir = true
-		rp.mustBeDirOrig = true
-	}
-	for {
-		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return fd, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return nil, err
-		}
-	}
-}
-
-// StatAt returns metadata for the file at the given path.
-func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	for {
-		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return stat, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return linux.Statx{}, err
-		}
-	}
-}
-
-// StatusFlags returns file description status flags.
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	flags, err := fd.impl.StatusFlags(ctx)
-	flags |= linux.O_LARGEFILE
-	return flags, err
-}
-
-// SetStatusFlags sets file description status flags.
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	return fd.impl.SetStatusFlags(ctx, flags)
-}
-
-// TODO:
-//
-// - VFS.SyncAllFilesystems() for sync(2)
-//
-// - Something for syncfs(2)
-//
-// - VFS.LinkAt()
-//
-// - VFS.ReadlinkAt()
-//
-// - VFS.RenameAt()
-//
-// - VFS.RmdirAt()
-//
-// - VFS.SetStatAt()
-//
-// - VFS.StatFSAt()
-//
-// - VFS.SymlinkAt()
-//
-// - VFS.UmountAt()
-//
-// - VFS.UnlinkAt()
-//
-// - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 593144cb7..7a1d9e383 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -15,7 +15,10 @@
 package vfs
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -114,6 +117,12 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err
 	return syserror.EPERM
 }
 
+// PrependPath implements FilesystemImpl.PrependPath.
+func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+	b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
+	return PrependPathSyntheticError{}
+}
+
 type fdTestDentry struct {
 	vfsd Dentry
 }
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index f0cd3ffe5..7262b0d0a 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -20,6 +20,7 @@
 //   VirtualFilesystem.mountMu
 //     Dentry.mu
 //       Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+//     VirtualFilesystem.filesystemsMu
 // VirtualFilesystem.fsTypesMu
 //
 // Locking Dentry.mu in multiple Dentries requires holding
@@ -28,6 +29,11 @@ package vfs
 
 import (
 	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
@@ -67,6 +73,11 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// filesystems contains all Filesystems. filesystems is protected by
+	// filesystemsMu.
+	filesystemsMu sync.Mutex
+	filesystems   map[*Filesystem]struct{}
+
 	// fsTypes contains all FilesystemTypes that are usable in the
 	// VirtualFilesystem. fsTypes is protected by fsTypesMu.
 	fsTypesMu sync.RWMutex
@@ -77,12 +88,379 @@ type VirtualFilesystem struct {
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
 		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+		filesystems: make(map[*Filesystem]struct{}),
 		fsTypes:     make(map[string]FilesystemType),
 	}
 	vfs.mounts.Init()
 	return vfs
 }
 
+// PathOperation specifies the path operated on by a VFS method.
+//
+// PathOperation is passed to VFS methods by pointer to reduce memory copying:
+// it's somewhat large and should never escape. (Options structs are passed by
+// pointer to VFS and FileDescription methods for the same reason.)
+type PathOperation struct {
+	// Root is the VFS root. References on Root are borrowed from the provider
+	// of the PathOperation.
+	//
+	// Invariants: Root.Ok().
+	Root VirtualDentry
+
+	// Start is the starting point for the path traversal. References on Start
+	// are borrowed from the provider of the PathOperation (i.e. the caller of
+	// the VFS method to which the PathOperation was passed).
+	//
+	// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
+	Start VirtualDentry
+
+	// Path is the pathname traversed by this operation.
+	Pathname string
+
+	// If FollowFinalSymlink is true, and the Dentry traversed by the final
+	// path component represents a symbolic link, the symbolic link should be
+	// followed.
+	FollowFinalSymlink bool
+}
+
+// GetDentryAt returns a VirtualDentry representing the given path, at which a
+// file must exist. A reference is taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return VirtualDentry{}, err
+	}
+	for {
+		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
+		if err == nil {
+			vd := VirtualDentry{
+				mount:  rp.mount,
+				dentry: d,
+			}
+			rp.mount.IncRef()
+			vfs.putResolvingPath(rp)
+			return vd, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, err
+		}
+	}
+}
+
+// LinkAt creates a hard link at newpop representing the existing file at
+// oldpop.
+func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
+	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	rp, err := vfs.getResolvingPath(creds, newpop)
+	if err != nil {
+		oldVD.DecRef()
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
+		if err == nil {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// MkdirAt creates a directory at the given path.
+func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
+	// also honored." - mkdir(2)
+	opts.Mode &= 0777 | linux.S_ISVTX
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// MknodAt creates a file of the given mode at the given path. It returns an
+// error from the syserror package.
+func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return nil
+	}
+	for {
+		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		// Handle mount traversals.
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// OpenAt returns a FileDescription providing access to the file at the given
+// path. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
+	// Remove:
+	//
+	// - O_LARGEFILE, which we always report in FileDescription status flags
+	// since only 64-bit architectures are supported at this time.
+	//
+	// - O_CLOEXEC, which affects file descriptors and therefore must be
+	// handled outside of VFS.
+	//
+	// - Unknown flags.
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
+	if opts.Flags&linux.O_SYNC != 0 {
+		opts.Flags |= linux.O_DSYNC
+	}
+	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
+	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
+	// filesystem implementations that do not support it).
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		if opts.Flags&linux.O_DIRECTORY == 0 {
+			return nil, syserror.EINVAL
+		}
+		if opts.Flags&linux.O_CREAT != 0 {
+			return nil, syserror.EINVAL
+		}
+		if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
+			return nil, syserror.EINVAL
+		}
+	}
+	// O_PATH causes most other flags to be ignored.
+	if opts.Flags&linux.O_PATH != 0 {
+		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
+	}
+	// "On Linux, the following bits are also honored in mode: [S_ISUID,
+	// S_ISGID, S_ISVTX]" - open(2)
+	opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+	if opts.Flags&linux.O_NOFOLLOW != 0 {
+		pop.FollowFinalSymlink = false
+	}
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return nil, err
+	}
+	if opts.Flags&linux.O_DIRECTORY != 0 {
+		rp.mustBeDir = true
+		rp.mustBeDirOrig = true
+	}
+	for {
+		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return fd, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
+// ReadlinkAt returns the target of the symbolic link at the given path.
+func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return "", err
+	}
+	for {
+		target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return target, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return "", err
+		}
+	}
+}
+
+// RenameAt renames the file at oldpop to newpop.
+func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
+	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	rp, err := vfs.getResolvingPath(creds, newpop)
+	if err != nil {
+		oldVD.DecRef()
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts)
+		if err == nil {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// RmdirAt removes the directory at the given path.
+func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// SetStatAt changes metadata for the file at the given path.
+func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// StatAt returns metadata for the file at the given path.
+func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	for {
+		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return stat, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return linux.Statx{}, err
+		}
+	}
+}
+
+// StatFSAt returns metadata for the filesystem containing the file at the
+// given path.
+func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	for {
+		statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return statfs, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return linux.Statfs{}, err
+		}
+	}
+}
+
+// SymlinkAt creates a symbolic link at the given path with the given target.
+func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// UnlinkAt deletes the non-directory file at the given path.
+func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// SyncAllFilesystems has the semantics of Linux's sync(2).
+func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
+	fss := make(map[*Filesystem]struct{})
+	vfs.filesystemsMu.Lock()
+	for fs := range vfs.filesystems {
+		if !fs.TryIncRef() {
+			continue
+		}
+		fss[fs] = struct{}{}
+	}
+	vfs.filesystemsMu.Unlock()
+	var retErr error
+	for fs := range fss {
+		if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
+			retErr = err
+		}
+		fs.DecRef()
+	}
+	return retErr
+}
+
 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry
 // (which represents a node in a Filesystem's tree) and a Mount (which
 // represents the Filesystem's position in a VFS mount tree).
diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD
index b06a90bef..cb1f41628 100644
--- a/pkg/syncutil/BUILD
+++ b/pkg/syncutil/BUILD
@@ -31,8 +31,6 @@ go_template(
 go_library(
     name = "syncutil",
     srcs = [
-        "downgradable_rwmutex_1_12_unsafe.go",
-        "downgradable_rwmutex_1_13_unsafe.go",
         "downgradable_rwmutex_unsafe.go",
         "memmove_unsafe.go",
         "norace_unsafe.go",
diff --git a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
deleted file mode 100644
index 7c6336e62..000000000
--- a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.13
-
-// TODO(b/133868570): Delete once Go 1.12 is no longer supported.
-
-package syncutil
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease112 sync.runtime_Semrelease
-func runtimeSemrelease112(s *uint32, handoff bool)
-
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int) {
-	// 'skipframes' is only available starting from 1.13.
-	runtimeSemrelease112(s, handoff)
-}
diff --git a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
deleted file mode 100644
index 3c3673119..000000000
--- a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.13
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-package syncutil
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease sync.runtime_Semrelease
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go
index 07feca402..51e11555d 100644
--- a/pkg/syncutil/downgradable_rwmutex_unsafe.go
+++ b/pkg/syncutil/downgradable_rwmutex_unsafe.go
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build go1.12
+// +build go1.13
 // +build !go1.15
 
 // Check go:linkname function signatures when updating Go version.
@@ -27,6 +27,9 @@ import (
 //go:linkname runtimeSemacquire sync.runtime_Semacquire
 func runtimeSemacquire(s *uint32)
 
+//go:linkname runtimeSemrelease sync.runtime_Semrelease
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
+
 // DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
 // method.
 type DowngradableRWMutex struct {
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index a3485b35c..f1d837196 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -38,12 +38,15 @@ go_test(
     size = "small",
     srcs = [
         "checksum_test.go",
+        "ipv6_test.go",
         "ipversion_test.go",
         "tcp_test.go",
     ],
     deps = [
         ":header",
+        "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "@com_github_google_go-cmp//cmp:go_default_library",
     ],
 )
 
@@ -55,5 +58,8 @@ go_test(
         "ndp_test.go",
     ],
     embed = [":header"],
-    deps = ["//pkg/tcpip"],
+    deps = [
+        "//pkg/tcpip",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
 )
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 0caa51c1e..fc671e439 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -90,6 +90,18 @@ const (
 	// IPv6Any is the non-routable IPv6 "any" meta address. It is also
 	// known as the unspecified address.
 	IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+
+	// IIDSize is the size of an interface identifier (IID), in bytes, as
+	// defined by RFC 4291 section 2.5.1.
+	IIDSize = 8
+
+	// IIDOffsetInIPv6Address is the offset, in bytes, from the start
+	// of an IPv6 address to the beginning of the interface identifier
+	// (IID) for auto-generated addresses. That is, all bytes before
+	// the IIDOffsetInIPv6Address-th byte are the prefix bytes, and all
+	// bytes including and after the IIDOffsetInIPv6Address-th byte are
+	// for the IID.
+	IIDOffsetInIPv6Address = 8
 )
 
 // IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
@@ -266,27 +278,43 @@ func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address {
 	return solicitedNodeMulticastPrefix + addr[len(addr)-3:]
 }
 
+// EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64
+// from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1.
+//
+// buf MUST be at least 8 bytes.
+func EthernetAdddressToModifiedEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) {
+	buf[0] = linkAddr[0] ^ 2
+	buf[1] = linkAddr[1]
+	buf[2] = linkAddr[2]
+	buf[3] = 0xFF
+	buf[4] = 0xFE
+	buf[5] = linkAddr[3]
+	buf[6] = linkAddr[4]
+	buf[7] = linkAddr[5]
+}
+
+// EthernetAddressToModifiedEUI64 computes a modified EUI-64 from a 48-bit
+// Ethernet/MAC address, as per RFC 4291 section 2.5.1.
+func EthernetAddressToModifiedEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte {
+	var buf [IIDSize]byte
+	EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:])
+	return buf
+}
+
 // LinkLocalAddr computes the default IPv6 link-local address from a link-layer
 // (MAC) address.
 func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address {
-	// Convert a 48-bit MAC to an EUI-64 and then prepend the link-local
-	// header, FE80::.
+	// Convert a 48-bit MAC to a modified EUI-64 and then prepend the
+	// link-local header, FE80::.
 	//
 	// The conversion is very nearly:
 	//	aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff
 	// Note the capital A. The conversion aa->Aa involves a bit flip.
-	lladdrb := [16]byte{
-		0:  0xFE,
-		1:  0x80,
-		8:  linkAddr[0] ^ 2,
-		9:  linkAddr[1],
-		10: linkAddr[2],
-		11: 0xFF,
-		12: 0xFE,
-		13: linkAddr[3],
-		14: linkAddr[4],
-		15: linkAddr[5],
+	lladdrb := [IPv6AddressSize]byte{
+		0: 0xFE,
+		1: 0x80,
 	}
+	EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:])
 	return tcpip.Address(lladdrb[:])
 }
 
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
new file mode 100644
index 000000000..42c5c6fc1
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header_test
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+const linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+
+func TestEthernetAdddressToModifiedEUI64(t *testing.T) {
+	expectedIID := [header.IIDSize]byte{0, 2, 3, 255, 254, 4, 5, 6}
+
+	if diff := cmp.Diff(expectedIID, header.EthernetAddressToModifiedEUI64(linkAddr)); diff != "" {
+		t.Errorf("EthernetAddressToModifiedEUI64(%s) mismatch (-want +got):\n%s", linkAddr, diff)
+	}
+
+	var buf [header.IIDSize]byte
+	header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:])
+	if diff := cmp.Diff(expectedIID, buf); diff != "" {
+		t.Errorf("EthernetAddressToModifiedEUI64IntoBuf(%s, _) mismatch (-want +got):\n%s", linkAddr, diff)
+	}
+}
+
+func TestLinkLocalAddr(t *testing.T) {
+	if got, want := header.LinkLocalAddr(linkAddr), tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x02\x03\xff\xfe\x04\x05\x06"); got != want {
+		t.Errorf("got LinkLocalAddr(%s) = %s, want = %s", linkAddr, got, want)
+	}
+}
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 1ca6199ef..06e0bace2 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -17,6 +17,7 @@ package header
 import (
 	"encoding/binary"
 	"errors"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -85,6 +86,23 @@ const (
 	// within an NDPPrefixInformation.
 	ndpPrefixInformationPrefixOffset = 14
 
+	// NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
+	// Server option, as per RFC 8106 section 5.1.
+	NDPRecursiveDNSServerOptionType = 25
+
+	// ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte
+	// Lifetime field within an NDPRecursiveDNSServer.
+	ndpRecursiveDNSServerLifetimeOffset = 2
+
+	// ndpRecursiveDNSServerAddressesOffset is the start of the addresses
+	// for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer.
+	ndpRecursiveDNSServerAddressesOffset = 6
+
+	// minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS
+	// Server option's length field value when it contains at least one
+	// IPv6 address.
+	minNDPRecursiveDNSServerLength = 3
+
 	// lengthByteUnits is the multiplier factor for the Length field of an
 	// NDP option. That is, the length field for NDP options is in units of
 	// 8 octets, as per RFC 4861 section 4.6.
@@ -92,13 +110,13 @@ const (
 )
 
 var (
-	// NDPPrefixInformationInfiniteLifetime is a value that represents
-	// infinity for the Valid and Preferred Lifetime fields in a NDP Prefix
-	// Information option. Its value is (2^32 - 1)s = 4294967295s
+	// NDPInfiniteLifetime is a value that represents infinity for the
+	// 4-byte lifetime fields found in various NDP options. Its value is
+	// (2^32 - 1)s = 4294967295s.
 	//
 	// This is a variable instead of a constant so that tests can change
 	// this value to a smaller value. It should only be modified by tests.
-	NDPPrefixInformationInfiniteLifetime = time.Second * 4294967295
+	NDPInfiniteLifetime = time.Second * math.MaxUint32
 )
 
 // NDPOptionIterator is an iterator of NDPOption.
@@ -118,6 +136,7 @@ var (
 	ErrNDPOptBufExhausted  = errors.New("Buffer unexpectedly exhausted")
 	ErrNDPOptZeroLength    = errors.New("NDP option has zero-valued Length field")
 	ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body")
+	ErrNDPInvalidLength    = errors.New("NDP option's Length value is invalid as per relevant RFC")
 )
 
 // Next returns the next element in the backing NDPOptions, or true if we are
@@ -182,6 +201,22 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
 			}
 
 			return NDPPrefixInformation(body), false, nil
+
+		case NDPRecursiveDNSServerOptionType:
+			// RFC 8106 section 5.3.1 outlines that the RDNSS option
+			// must have a minimum length of 3 so it contains at
+			// least one IPv6 address.
+			if l < minNDPRecursiveDNSServerLength {
+				return nil, true, ErrNDPInvalidLength
+			}
+
+			opt := NDPRecursiveDNSServer(body)
+			if len(opt.Addresses()) == 0 {
+				return nil, true, ErrNDPOptMalformedBody
+			}
+
+			return opt, false, nil
+
 		default:
 			// We do not yet recognize the option, just skip for
 			// now. This is okay because RFC 4861 allows us to
@@ -434,7 +469,7 @@ func (o NDPPrefixInformation) AutonomousAddressConfigurationFlag() bool {
 //
 // Note, a value of 0 implies the prefix should not be considered as on-link,
 // and a value of infinity/forever is represented by
-// NDPPrefixInformationInfiniteLifetime.
+// NDPInfiniteLifetime.
 func (o NDPPrefixInformation) ValidLifetime() time.Duration {
 	// The field is the time in seconds, as per RFC 4861 section 4.6.2.
 	return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationValidLifetimeOffset:]))
@@ -447,7 +482,7 @@ func (o NDPPrefixInformation) ValidLifetime() time.Duration {
 //
 // Note, a value of 0 implies that addresses generated from the prefix should
 // no longer remain preferred, and a value of infinity is represented by
-// NDPPrefixInformationInfiniteLifetime.
+// NDPInfiniteLifetime.
 //
 // Also note that the value of this field MUST NOT exceed the Valid Lifetime
 // field to avoid preferring addresses that are no longer valid, for the
@@ -476,3 +511,79 @@ func (o NDPPrefixInformation) Subnet() tcpip.Subnet {
 	}
 	return addrWithPrefix.Subnet()
 }
+
+// NDPRecursiveDNSServer is the NDP Recursive DNS Server option, as defined by
+// RFC 8106 section 5.1.
+//
+// To make sure that the option meets its minimum length and does not end in the
+// middle of a DNS server's IPv6 address, the length of a valid
+// NDPRecursiveDNSServer must meet the following constraint:
+//   (Length - ndpRecursiveDNSServerAddressesOffset) % IPv6AddressSize == 0
+type NDPRecursiveDNSServer []byte
+
+// Type returns the type of an NDP Recursive DNS Server option.
+//
+// Type implements NDPOption.Type.
+func (NDPRecursiveDNSServer) Type() uint8 {
+	return NDPRecursiveDNSServerOptionType
+}
+
+// Length implements NDPOption.Length.
+func (o NDPRecursiveDNSServer) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPRecursiveDNSServer) serializeInto(b []byte) int {
+	used := copy(b, o)
+
+	// Zero out the reserved bytes that are before the Lifetime field.
+	for i := 0; i < ndpRecursiveDNSServerLifetimeOffset; i++ {
+		b[i] = 0
+	}
+
+	return used
+}
+
+// Lifetime returns the length of time that the DNS server addresses
+// in this option may be used for name resolution.
+//
+// Note, a value of 0 implies the addresses should no longer be used,
+// and a value of infinity/forever is represented by NDPInfiniteLifetime.
+//
+// Lifetime may panic if o does not have enough bytes to hold the Lifetime
+// field.
+func (o NDPRecursiveDNSServer) Lifetime() time.Duration {
+	// The field is the time in seconds, as per RFC 8106 section 5.1.
+	return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRecursiveDNSServerLifetimeOffset:]))
+}
+
+// Addresses returns the recursive DNS server IPv6 addresses that may be
+// used for name resolution.
+//
+// Note, some of the addresses returned MAY be link-local addresses.
+//
+// Addresses may panic if o does not hold valid IPv6 addresses.
+func (o NDPRecursiveDNSServer) Addresses() []tcpip.Address {
+	l := len(o)
+	if l < ndpRecursiveDNSServerAddressesOffset {
+		return nil
+	}
+
+	l -= ndpRecursiveDNSServerAddressesOffset
+	if l%IPv6AddressSize != 0 {
+		return nil
+	}
+
+	buf := o[ndpRecursiveDNSServerAddressesOffset:]
+	var addrs []tcpip.Address
+	for len(buf) > 0 {
+		addr := tcpip.Address(buf[:IPv6AddressSize])
+		if !IsV6UnicastAddress(addr) {
+			return nil
+		}
+		addrs = append(addrs, addr)
+		buf = buf[IPv6AddressSize:]
+	}
+	return addrs
+}
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index ad6daafcd..2c439d70c 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -19,6 +19,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
@@ -369,6 +370,175 @@ func TestNDPPrefixInformationOption(t *testing.T) {
 	}
 }
 
+func TestNDPRecursiveDNSServerOptionSerialize(t *testing.T) {
+	b := []byte{
+		9, 8,
+		1, 2, 4, 8,
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	}
+	targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	expected := []byte{
+		25, 3, 0, 0,
+		1, 2, 4, 8,
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	}
+	opts := NDPOptions(targetBuf)
+	serializer := NDPOptionsSerializer{
+		NDPRecursiveDNSServer(b),
+	}
+	if got, want := opts.Serialize(serializer), len(expected); got != want {
+		t.Errorf("got Serialize = %d, want = %d", got, want)
+	}
+	if !bytes.Equal(targetBuf, expected) {
+		t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expected)
+	}
+
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+	}
+
+	opt, ok := next.(NDPRecursiveDNSServer)
+	if !ok {
+		t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next)
+	}
+	if got := opt.Type(); got != 25 {
+		t.Errorf("got Type = %d, want = 31", got)
+	}
+	if got := opt.Length(); got != 22 {
+		t.Errorf("got Length = %d, want = 22", got)
+	}
+	if got, want := opt.Lifetime(), 16909320*time.Second; got != want {
+		t.Errorf("got Lifetime = %s, want = %s", got, want)
+	}
+	want := []tcpip.Address{
+		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+	}
+	if got := opt.Addresses(); !cmp.Equal(got, want) {
+		t.Errorf("got Addresses = %v, want = %v", got, want)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+	}
+}
+
+func TestNDPRecursiveDNSServerOption(t *testing.T) {
+	tests := []struct {
+		name     string
+		buf      []byte
+		lifetime time.Duration
+		addrs    []tcpip.Address
+	}{
+		{
+			"Valid1Addr",
+			[]byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+			},
+			0,
+			[]tcpip.Address{
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+			},
+		},
+		{
+			"Valid2Addr",
+			[]byte{
+				25, 5, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
+			},
+			0,
+			[]tcpip.Address{
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10",
+			},
+		},
+		{
+			"Valid3Addr",
+			[]byte{
+				25, 7, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17,
+			},
+			0,
+			[]tcpip.Address{
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x11",
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := NDPOptions(test.buf)
+			it, err := opts.Iter(true)
+			if err != nil {
+				t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+			}
+
+			// Iterator should get our option.
+			next, done, err := it.Next()
+			if err != nil {
+				t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if done {
+				t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+			}
+			if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
+				t.Fatalf("got Type %= %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+			}
+
+			opt, ok := next.(NDPRecursiveDNSServer)
+			if !ok {
+				t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next)
+			}
+			if got := opt.Lifetime(); got != test.lifetime {
+				t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime)
+			}
+			if got := opt.Addresses(); !cmp.Equal(got, test.addrs) {
+				t.Errorf("got Addresses = %v, want = %v", got, test.addrs)
+			}
+
+			// Iterator should not return anything else.
+			next, done, err = it.Next()
+			if err != nil {
+				t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if !done {
+				t.Error("got Next = (_, false, _), want = (_, true, _)")
+			}
+			if next != nil {
+				t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+			}
+		})
+	}
+}
+
 // TestNDPOptionsIterCheck tests that Iter will return false if the NDPOptions
 // the iterator was returned for is malformed.
 func TestNDPOptionsIterCheck(t *testing.T) {
@@ -473,6 +643,51 @@ func TestNDPOptionsIterCheck(t *testing.T) {
 			},
 			nil,
 		},
+		{
+			"InvalidRecursiveDNSServerCutsOffAddress",
+			[]byte{
+				25, 4, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+				0, 1, 2, 3, 4, 5, 6, 7,
+			},
+			ErrNDPOptMalformedBody,
+		},
+		{
+			"InvalidRecursiveDNSServerInvalidLengthField",
+			[]byte{
+				25, 2, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8,
+			},
+			ErrNDPInvalidLength,
+		},
+		{
+			"RecursiveDNSServerTooSmall",
+			[]byte{
+				25, 1, 0, 0,
+				0, 0, 0,
+			},
+			ErrNDPOptBufExhausted,
+		},
+		{
+			"RecursiveDNSServerMulticast",
+			[]byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+			},
+			ErrNDPOptMalformedBody,
+		},
+		{
+			"RecursiveDNSServerUnspecified",
+			[]byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			ErrNDPOptMalformedBody,
+		},
 	}
 
 	for _, test := range tests {
diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index 4839f0a65..e156b01f6 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -1,5 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index 30cea8996..6c5e19e8f 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -41,6 +41,30 @@ type portDescriptor struct {
 	port      uint16
 }
 
+// Flags represents the type of port reservation.
+//
+// +stateify savable
+type Flags struct {
+	// MostRecent represents UDP SO_REUSEADDR.
+	MostRecent bool
+
+	// LoadBalanced indicates SO_REUSEPORT.
+	//
+	// LoadBalanced takes precidence over MostRecent.
+	LoadBalanced bool
+}
+
+func (f Flags) bits() reuseFlag {
+	var rf reuseFlag
+	if f.MostRecent {
+		rf |= mostRecentFlag
+	}
+	if f.LoadBalanced {
+		rf |= loadBalancedFlag
+	}
+	return rf
+}
+
 // PortManager manages allocating, reserving and releasing ports.
 type PortManager struct {
 	mu             sync.RWMutex
@@ -54,9 +78,59 @@ type PortManager struct {
 	hint uint32
 }
 
+type reuseFlag int
+
+const (
+	mostRecentFlag reuseFlag = 1 << iota
+	loadBalancedFlag
+	nextFlag
+
+	flagMask = nextFlag - 1
+)
+
 type portNode struct {
-	reuse bool
-	refs  int
+	// refs stores the count for each possible flag combination.
+	refs [nextFlag]int
+}
+
+func (p portNode) totalRefs() int {
+	var total int
+	for _, r := range p.refs {
+		total += r
+	}
+	return total
+}
+
+// flagRefs returns the number of references with all specified flags.
+func (p portNode) flagRefs(flags reuseFlag) int {
+	var total int
+	for i, r := range p.refs {
+		if reuseFlag(i)&flags == flags {
+			total += r
+		}
+	}
+	return total
+}
+
+// allRefsHave returns if all references have all specified flags.
+func (p portNode) allRefsHave(flags reuseFlag) bool {
+	for i, r := range p.refs {
+		if reuseFlag(i)&flags == flags && r > 0 {
+			return false
+		}
+	}
+	return true
+}
+
+// intersectionRefs returns the set of flags shared by all references.
+func (p portNode) intersectionRefs() reuseFlag {
+	intersection := flagMask
+	for i, r := range p.refs {
+		if r > 0 {
+			intersection &= reuseFlag(i)
+		}
+	}
+	return intersection
 }
 
 // deviceNode is never empty. When it has no elements, it is removed from the
@@ -66,30 +140,44 @@ type deviceNode map[tcpip.NICID]portNode
 // isAvailable checks whether binding is possible by device. If not binding to a
 // device, check against all portNodes. If binding to a specific device, check
 // against the unspecified device and the provided device.
-func (d deviceNode) isAvailable(reuse bool, bindToDevice tcpip.NICID) bool {
+//
+// If either of the port reuse flags is enabled on any of the nodes, all nodes
+// sharing a port must share at least one reuse flag. This matches Linux's
+// behavior.
+func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool {
+	flagBits := flags.bits()
 	if bindToDevice == 0 {
 		// Trying to binding all devices.
-		if !reuse {
+		if flagBits == 0 {
 			// Can't bind because the (addr,port) is already bound.
 			return false
 		}
+		intersection := flagMask
 		for _, p := range d {
-			if !p.reuse {
-				// Can't bind because the (addr,port) was previously bound without reuse.
+			i := p.intersectionRefs()
+			intersection &= i
+			if intersection&flagBits == 0 {
+				// Can't bind because the (addr,port) was
+				// previously bound without reuse.
 				return false
 			}
 		}
 		return true
 	}
 
+	intersection := flagMask
+
 	if p, ok := d[0]; ok {
-		if !reuse || !p.reuse {
+		intersection = p.intersectionRefs()
+		if intersection&flagBits == 0 {
 			return false
 		}
 	}
 
 	if p, ok := d[bindToDevice]; ok {
-		if !reuse || !p.reuse {
+		i := p.intersectionRefs()
+		intersection &= i
+		if intersection&flagBits == 0 {
 			return false
 		}
 	}
@@ -103,12 +191,12 @@ type bindAddresses map[tcpip.Address]deviceNode
 // isAvailable checks whether an IP address is available to bind to. If the
 // address is the "any" address, check all other addresses. Otherwise, just
 // check against the "any" address and the provided address.
-func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice tcpip.NICID) bool {
+func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice tcpip.NICID) bool {
 	if addr == anyIPAddress {
 		// If binding to the "any" address then check that there are no conflicts
 		// with all addresses.
 		for _, d := range b {
-			if !d.isAvailable(reuse, bindToDevice) {
+			if !d.isAvailable(flags, bindToDevice) {
 				return false
 			}
 		}
@@ -117,14 +205,14 @@ func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice
 
 	// Check that there is no conflict with the "any" address.
 	if d, ok := b[anyIPAddress]; ok {
-		if !d.isAvailable(reuse, bindToDevice) {
+		if !d.isAvailable(flags, bindToDevice) {
 			return false
 		}
 	}
 
 	// Check that this is no conflict with the provided address.
 	if d, ok := b[addr]; ok {
-		if !d.isAvailable(reuse, bindToDevice) {
+		if !d.isAvailable(flags, bindToDevice) {
 			return false
 		}
 	}
@@ -190,17 +278,17 @@ func (s *PortManager) pickEphemeralPort(offset, count uint32, testPort func(p ui
 }
 
 // IsPortAvailable tests if the given port is available on all given protocols.
-func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	return s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice)
+	return s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice)
 }
 
-func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
 		if addrs, ok := s.allocatedPorts[desc]; ok {
-			if !addrs.isAvailable(addr, reuse, bindToDevice) {
+			if !addrs.isAvailable(addr, flags, bindToDevice) {
 				return false
 			}
 		}
@@ -212,14 +300,14 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb
 // reserved by another endpoint. If port is zero, ReservePort will search for
 // an unreserved ephemeral port and reserve it, returning its value in the
 // "port" return value.
-func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
 	// If a port is specified, just try to reserve it for all network
 	// protocols.
 	if port != 0 {
-		if !s.reserveSpecificPort(networks, transport, addr, port, reuse, bindToDevice) {
+		if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice) {
 			return 0, tcpip.ErrPortInUse
 		}
 		return port, nil
@@ -227,15 +315,16 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp
 
 	// A port wasn't specified, so try to find one.
 	return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
-		return s.reserveSpecificPort(networks, transport, addr, p, reuse, bindToDevice), nil
+		return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice), nil
 	})
 }
 
 // reserveSpecificPort tries to reserve the given port on all given protocols.
-func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
-	if !s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice) {
+func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
+	if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) {
 		return false
 	}
+	flagBits := flags.bits()
 
 	// Reserve port on all network protocols.
 	for _, network := range networks {
@@ -250,12 +339,9 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 			d = make(deviceNode)
 			m[addr] = d
 		}
-		if n, ok := d[bindToDevice]; ok {
-			n.refs++
-			d[bindToDevice] = n
-		} else {
-			d[bindToDevice] = portNode{reuse: reuse, refs: 1}
-		}
+		n := d[bindToDevice]
+		n.refs[flagBits]++
+		d[bindToDevice] = n
 	}
 
 	return true
@@ -263,10 +349,12 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 
 // ReleasePort releases the reservation on a port/IP combination so that it can
 // be reserved by other endpoints.
-func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, bindToDevice tcpip.NICID) {
+func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
+	flagBits := flags.bits()
+
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
 		if m, ok := s.allocatedPorts[desc]; ok {
@@ -278,9 +366,9 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp
 			if !ok {
 				continue
 			}
-			n.refs--
+			n.refs[flagBits]--
 			d[bindToDevice] = n
-			if n.refs == 0 {
+			if n.refs == [nextFlag]int{} {
 				delete(d, bindToDevice)
 			}
 			if len(d) == 0 {
diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go
index 19f4833fc..d6969d050 100644
--- a/pkg/tcpip/ports/ports_test.go
+++ b/pkg/tcpip/ports/ports_test.go
@@ -33,7 +33,7 @@ type portReserveTestAction struct {
 	port    uint16
 	ip      tcpip.Address
 	want    *tcpip.Error
-	reuse   bool
+	flags   Flags
 	release bool
 	device  tcpip.NICID
 }
@@ -50,7 +50,7 @@ func TestPortReservation(t *testing.T) {
 				{port: 80, ip: fakeIPAddress1, want: nil},
 				/* N.B. Order of tests matters! */
 				{port: 80, ip: anyIPAddress, want: tcpip.ErrPortInUse},
-				{port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true},
+				{port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
 			},
 		},
 		{
@@ -61,7 +61,7 @@ func TestPortReservation(t *testing.T) {
 				/* release fakeIPAddress, but anyIPAddress is still inuse */
 				{port: 22, ip: fakeIPAddress, release: true},
 				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true},
+				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
 				/* Release port 22 from any IP address, then try to reserve fake IP address on 22 */
 				{port: 22, ip: anyIPAddress, want: nil, release: true},
 				{port: 22, ip: fakeIPAddress, want: nil},
@@ -71,36 +71,36 @@ func TestPortReservation(t *testing.T) {
 			actions: []portReserveTestAction{
 				{port: 00, ip: fakeIPAddress, want: nil},
 				{port: 00, ip: fakeIPAddress, want: nil},
-				{port: 00, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 00, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
 			tname: "bind to ip with reuseport",
 			actions: []portReserveTestAction{
-				{port: 25, ip: fakeIPAddress, reuse: true, want: nil},
-				{port: 25, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 
-				{port: 25, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 25, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 25, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 25, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 25, ip: anyIPAddress, reuse: true, want: nil},
+				{port: 25, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
 			tname: "bind to inaddr any with reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: anyIPAddress, reuse: true, want: nil},
-				{port: 24, ip: anyIPAddress, reuse: true, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 
-				{port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 24, ip: fakeIPAddress, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, release: true, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, release: true, want: nil},
 
-				{port: 24, ip: anyIPAddress, release: true},
-				{port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 24, ip: anyIPAddress, release: true},
-				{port: 24, ip: anyIPAddress, reuse: false, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: nil},
 			},
 		}, {
 			tname: "bind twice with device fails",
@@ -125,88 +125,152 @@ func TestPortReservation(t *testing.T) {
 			actions: []portReserveTestAction{
 				{port: 24, ip: fakeIPAddress, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "bind with device",
 			actions: []portReserveTestAction{
 				{port: 24, ip: fakeIPAddress, device: 123, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 789, want: nil},
 				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
-			tname: "bind with reuse",
+			tname: "bind with reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
-			tname: "binding with reuse and device",
+			tname: "binding with reuseport and device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 999, want: tcpip.ErrPortInUse},
 			},
 		}, {
-			tname: "mixing reuse and not reuse by binding to device",
+			tname: "mixing reuseport and not reuseport by binding to device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 999, want: nil},
 			},
 		}, {
-			tname: "can't bind to 0 after mixing reuse and not reuse",
+			tname: "can't bind to 0 after mixing reuseport and not reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "bind and release",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 
 				// Release the bind to device 0 and try again.
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil, release: true},
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil},
 			},
 		}, {
-			tname: "bind twice with reuse once",
+			tname: "bind twice with reuseport once",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "release an unreserved device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil},
 				// The below don't exist.
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil, release: true},
-				{port: 9999, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil, release: true},
+				{port: 9999, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
 				// Release all.
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil, release: true},
+			},
+		}, {
+			tname: "bind with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: nil},
+			},
+		}, {
+			tname: "bind twice with reuseaddr once",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport, and then reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport, and then reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport twice, and then reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport twice, and then reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr, and then reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseport, and then reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
 			},
 		},
 	} {
@@ -216,12 +280,12 @@ func TestPortReservation(t *testing.T) {
 
 			for _, test := range test.actions {
 				if test.release {
-					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.device)
+					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device)
 					continue
 				}
-				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.reuse, test.device)
+				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device)
 				if err != test.want {
-					t.Fatalf("ReservePort(.., .., %s, %d, %t, %d) = %v, want %v", test.ip, test.port, test.reuse, test.device, err, test.want)
+					t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d) = %v, want %v", test.ip, test.port, test.flags, test.device, err, test.want)
 				}
 				if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) {
 					t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral)
diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD
index a57752a7c..d7496fde6 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_binary(
     name = "tun_tcp_connect",
     srcs = ["main.go"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD
index dad8ef399..875561566 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_binary(
     name = "tun_tcp_echo",
     srcs = ["main.go"],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/link/fdbased",
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index cfdd0496e..27bd02e76 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -58,6 +58,14 @@ const (
 	// Default = true.
 	defaultDiscoverOnLinkPrefixes = true
 
+	// defaultAutoGenGlobalAddresses is the default configuration for
+	// whether or not to generate global IPv6 addresses in response to
+	// receiving a new Prefix Information option with its Autonomous
+	// Address AutoConfiguration flag set, as a host.
+	//
+	// Default = true.
+	defaultAutoGenGlobalAddresses = true
+
 	// minimumRetransmitTimer is the minimum amount of time to wait between
 	// sending NDP Neighbor solicitation messages. Note, RFC 4861 does
 	// not impose a minimum Retransmit Timer, but we do here to make sure
@@ -87,6 +95,24 @@ const (
 	//
 	// Max = 10.
 	MaxDiscoveredOnLinkPrefixes = 10
+
+	// validPrefixLenForAutoGen is the expected prefix length that an
+	// address can be generated for. Must be 64 bits as the interface
+	// identifier (IID) is 64 bits and an IPv6 address is 128 bits, so
+	// 128 - 64 = 64.
+	validPrefixLenForAutoGen = 64
+)
+
+var (
+	// MinPrefixInformationValidLifetimeForUpdate is the minimum Valid
+	// Lifetime to update the valid lifetime of a generated address by
+	// SLAAC.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// Min = 2hrs.
+	MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour
 )
 
 // NDPDispatcher is the interface integrators of netstack must implement to
@@ -139,6 +165,33 @@ type NDPDispatcher interface {
 	// This function is not permitted to block indefinitely. This function
 	// is also not permitted to call into the stack.
 	OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route
+
+	// OnAutoGenAddress will be called when a new prefix with its
+	// autonomous address-configuration flag set has been received and SLAAC
+	// has been performed. Implementations may prevent the stack from
+	// assigning the address to the NIC by returning false.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool
+
+	// OnAutoGenAddressInvalidated will be called when an auto-generated
+	// address (as part of SLAAC) has been invalidated.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix)
+
+	// OnRecursiveDNSServerOption will be called when an NDP option with
+	// recursive DNS servers has been received. Note, addrs may contain
+	// link-local addresses.
+	//
+	// It is up to the caller to use the DNS Servers only for their valid
+	// lifetime. OnRecursiveDNSServerOption may be called for new or
+	// already known DNS servers. If called with known DNS servers, their
+	// valid lifetimes must be refreshed to lifetime (it may be increased,
+	// decreased, or completely invalidated when lifetime = 0).
+	OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration)
 }
 
 // NDPConfigurations is the NDP configurations for the netstack.
@@ -168,6 +221,17 @@ type NDPConfigurations struct {
 	// will be discovered from Router Advertisements' Prefix Information
 	// option. This configuration is ignored if HandleRAs is false.
 	DiscoverOnLinkPrefixes bool
+
+	// AutoGenGlobalAddresses determines whether or not global IPv6
+	// addresses will be generated for a NIC in response to receiving a new
+	// Prefix Information option with its Autonomous Address
+	// AutoConfiguration flag set, as a host, as per RFC 4862 (SLAAC).
+	//
+	// Note, if an address was already generated for some unique prefix, as
+	// part of SLAAC, this option does not affect whether or not the
+	// lifetime(s) of the generated address changes; this option only
+	// affects the generation of new addresses as part of SLAAC.
+	AutoGenGlobalAddresses bool
 }
 
 // DefaultNDPConfigurations returns an NDPConfigurations populated with
@@ -179,6 +243,7 @@ func DefaultNDPConfigurations() NDPConfigurations {
 		HandleRAs:              defaultHandleRAs,
 		DiscoverDefaultRouters: defaultDiscoverDefaultRouters,
 		DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes,
+		AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses,
 	}
 }
 
@@ -210,6 +275,9 @@ type ndpState struct {
 	// The on-link prefixes discovered through Router Advertisements' Prefix
 	// Information option.
 	onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState
+
+	// The addresses generated by SLAAC.
+	autoGenAddresses map[tcpip.Address]autoGenAddressState
 }
 
 // dadState holds the Duplicate Address Detection timer and channel to signal
@@ -270,6 +338,32 @@ type onLinkPrefixState struct {
 	doNotInvalidate *bool
 }
 
+// autoGenAddressState holds data associated with an address generated via
+// SLAAC.
+type autoGenAddressState struct {
+	invalidationTimer *time.Timer
+
+	// Used to signal the timer not to invalidate the SLAAC address (A) in
+	// a race condition (T1 is a goroutine that handles a PI for A and T2
+	// is the goroutine that handles A's invalidation timer firing):
+	//   T1: Receive a new PI for A
+	//   T1: Obtain the NIC's lock before processing the PI
+	//   T2: A's invalidation timer fires, and gets blocked on obtaining the
+	//       NIC's lock
+	//   T1: Refreshes/extends A's lifetime & releases NIC's lock
+	//   T2: Obtains NIC's lock & invalidates A immediately
+	//
+	// To resolve this, T1 will check to see if the timer already fired, and
+	// inform the timer using doNotInvalidate to not invalidate A, so that
+	// once T2 obtains the lock, it will see that it is set to true and do
+	// nothing further.
+	doNotInvalidate *bool
+
+	// Nonzero only when the address is not valid forever (invalidationTimer
+	// is not nil).
+	validUntil time.Time
+}
+
 // startDuplicateAddressDetection performs Duplicate Address Detection.
 //
 // This function must only be called by IPv6 addresses that are currently
@@ -534,19 +628,21 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 	// we do not check the iterator for errors on calls to Next.
 	it, _ := ra.Options().Iter(false)
 	for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() {
-		switch opt.Type() {
-		case header.NDPPrefixInformationType:
-			if !ndp.configs.DiscoverOnLinkPrefixes {
+		switch opt := opt.(type) {
+		case header.NDPRecursiveDNSServer:
+			if ndp.nic.stack.ndpDisp == nil {
 				continue
 			}
 
-			pi := opt.(header.NDPPrefixInformation)
+			ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), opt.Addresses(), opt.Lifetime())
 
-			prefix := pi.Subnet()
+		case header.NDPPrefixInformation:
+			prefix := opt.Subnet()
 
 			// Is the prefix a link-local?
 			if header.IsV6LinkLocalAddress(prefix.ID()) {
-				// ...Yes, skip as per RFC 4861 section 6.3.4.
+				// ...Yes, skip as per RFC 4861 section 6.3.4,
+				// and RFC 4862 section 5.5.3.b (for SLAAC).
 				continue
 			}
 
@@ -557,82 +653,13 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
 				continue
 			}
 
-			if !pi.OnLinkFlag() {
-				// Not on-link so don't "discover" it as an
-				// on-link prefix.
-				continue
-			}
-
-			prefixState, ok := ndp.onLinkPrefixes[prefix]
-			vl := pi.ValidLifetime()
-			switch {
-			case !ok && vl == 0:
-				// Don't know about this prefix but has a zero
-				// valid lifetime, so just ignore.
-				continue
-
-			case !ok && vl != 0:
-				// This is a new on-link prefix we are
-				// discovering.
-				//
-				// Only remember it if we currently know about
-				// less than MaxDiscoveredOnLinkPrefixes on-link
-				// prefixes.
-				if len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
-					ndp.rememberOnLinkPrefix(prefix, vl)
-				}
-				continue
-
-			case ok && vl == 0:
-				// We know about the on-link prefix, but it is
-				// no longer to be considered on-link, so
-				// invalidate it.
-				ndp.invalidateOnLinkPrefix(prefix)
-				continue
-			}
-
-			// This is an already discovered on-link prefix with a
-			// new non-zero valid lifetime.
-			// Update the invalidation timer.
-			timer := prefixState.invalidationTimer
-
-			if timer == nil && vl >= header.NDPPrefixInformationInfiniteLifetime {
-				// Had infinite valid lifetime before and
-				// continues to have an invalid lifetime. Do
-				// nothing further.
-				continue
-			}
-
-			if timer != nil && !timer.Stop() {
-				// If we reach this point, then we know the
-				// timer already fired after we took the NIC
-				// lock. Inform the timer to not invalidate
-				// the prefix once it obtains the lock as we
-				// just got a new PI that refeshes its lifetime
-				// to a non-zero value. See
-				// onLinkPrefixState.doNotInvalidate for more
-				// details.
-				*prefixState.doNotInvalidate = true
+			if opt.OnLinkFlag() {
+				ndp.handleOnLinkPrefixInformation(opt)
 			}
 
-			if vl >= header.NDPPrefixInformationInfiniteLifetime {
-				// Prefix is now valid forever so we don't need
-				// an invalidation timer.
-				prefixState.invalidationTimer = nil
-				ndp.onLinkPrefixes[prefix] = prefixState
-				continue
+			if opt.AutonomousAddressConfigurationFlag() {
+				ndp.handleAutonomousPrefixInformation(opt)
 			}
-
-			if timer != nil {
-				// We already have a timer so just reset it to
-				// expire after the new valid lifetime.
-				timer.Reset(vl)
-				continue
-			}
-
-			// We do not have a timer so just create a new one.
-			prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate)
-			ndp.onLinkPrefixes[prefix] = prefixState
 		}
 
 		// TODO(b/141556115): Do (MTU) Parameter Discovery.
@@ -734,7 +761,7 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration)
 	var timer *time.Timer
 
 	// Only create a timer if the lifetime is not infinite.
-	if l < header.NDPPrefixInformationInfiniteLifetime {
+	if l < header.NDPInfiniteLifetime {
 		timer = ndp.prefixInvalidationCallback(prefix, l, &doNotInvalidate)
 	}
 
@@ -795,3 +822,345 @@ func (ndp *ndpState) prefixInvalidationCallback(prefix tcpip.Subnet, vl time.Dur
 		ndp.invalidateOnLinkPrefix(prefix)
 	})
 }
+
+// handleOnLinkPrefixInformation handles a Prefix Information option with
+// its on-link flag set, as per RFC 4861 section 6.3.4.
+//
+// handleOnLinkPrefixInformation assumes that the prefix this pi is for is
+// not the link-local prefix and the on-link flag is set.
+//
+// The NIC that ndp belongs to and its associated stack MUST be locked.
+func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) {
+	prefix := pi.Subnet()
+	prefixState, ok := ndp.onLinkPrefixes[prefix]
+	vl := pi.ValidLifetime()
+
+	if !ok && vl == 0 {
+		// Don't know about this prefix but it has a zero valid
+		// lifetime, so just ignore.
+		return
+	}
+
+	if !ok && vl != 0 {
+		// This is a new on-link prefix we are discovering
+		//
+		// Only remember it if we currently know about less than
+		// MaxDiscoveredOnLinkPrefixes on-link prefixes.
+		if ndp.configs.DiscoverOnLinkPrefixes && len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
+			ndp.rememberOnLinkPrefix(prefix, vl)
+		}
+		return
+	}
+
+	if ok && vl == 0 {
+		// We know about the on-link prefix, but it is
+		// no longer to be considered on-link, so
+		// invalidate it.
+		ndp.invalidateOnLinkPrefix(prefix)
+		return
+	}
+
+	// This is an already discovered on-link prefix with a
+	// new non-zero valid lifetime.
+	// Update the invalidation timer.
+	timer := prefixState.invalidationTimer
+
+	if timer == nil && vl >= header.NDPInfiniteLifetime {
+		// Had infinite valid lifetime before and
+		// continues to have an invalid lifetime. Do
+		// nothing further.
+		return
+	}
+
+	if timer != nil && !timer.Stop() {
+		// If we reach this point, then we know the timer alread fired
+		// after we took the NIC lock. Inform the timer to not
+		// invalidate the prefix once it obtains the lock as we just
+		// got a new PI that refreshes its lifetime to a non-zero value.
+		// See onLinkPrefixState.doNotInvalidate for more details.
+		*prefixState.doNotInvalidate = true
+	}
+
+	if vl >= header.NDPInfiniteLifetime {
+		// Prefix is now valid forever so we don't need
+		// an invalidation timer.
+		prefixState.invalidationTimer = nil
+		ndp.onLinkPrefixes[prefix] = prefixState
+		return
+	}
+
+	if timer != nil {
+		// We already have a timer so just reset it to
+		// expire after the new valid lifetime.
+		timer.Reset(vl)
+		return
+	}
+
+	// We do not have a timer so just create a new one.
+	prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate)
+	ndp.onLinkPrefixes[prefix] = prefixState
+}
+
+// handleAutonomousPrefixInformation handles a Prefix Information option with
+// its autonomous flag set, as per RFC 4862 section 5.5.3.
+//
+// handleAutonomousPrefixInformation assumes that the prefix this pi is for is
+// not the link-local prefix and the autonomous flag is set.
+//
+// The NIC that ndp belongs to and its associated stack MUST be locked.
+func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) {
+	vl := pi.ValidLifetime()
+	pl := pi.PreferredLifetime()
+
+	// If the preferred lifetime is greater than the valid lifetime,
+	// silently ignore the Prefix Information option, as per RFC 4862
+	// section 5.5.3.c.
+	if pl > vl {
+		return
+	}
+
+	prefix := pi.Subnet()
+
+	// Check if we already have an auto-generated address for prefix.
+	for _, ref := range ndp.nic.endpoints {
+		if ref.protocol != header.IPv6ProtocolNumber {
+			continue
+		}
+
+		if ref.configType != slaac {
+			continue
+		}
+
+		addr := ref.ep.ID().LocalAddress
+		refAddrWithPrefix := tcpip.AddressWithPrefix{Address: addr, PrefixLen: ref.ep.PrefixLen()}
+		if refAddrWithPrefix.Subnet() != prefix {
+			continue
+		}
+
+		//
+		// At this point, we know we are refreshing a SLAAC generated
+		// IPv6 address with the prefix, prefix. Do the work as outlined
+		// by RFC 4862 section 5.5.3.e.
+		//
+
+		addrState, ok := ndp.autoGenAddresses[addr]
+		if !ok {
+			panic(fmt.Sprintf("must have an autoGenAddressess entry for the SLAAC generated IPv6 address %s", addr))
+		}
+
+		// TODO(b/143713887): Handle deprecating auto-generated address
+		//                    after the preferred lifetime.
+
+		// As per RFC 4862 section 5.5.3.e, the valid lifetime of the
+		// address generated by SLAAC is as follows:
+		//
+		// 1) If the received Valid Lifetime is greater than 2 hours or
+		//    greater than RemainingLifetime, set the valid lifetime of
+		//    the address to the advertised Valid Lifetime.
+		//
+		// 2) If RemainingLifetime is less than or equal to 2 hours,
+		//    ignore the advertised Valid Lifetime.
+		//
+		// 3) Otherwise, reset the valid lifetime of the address to 2
+		//    hours.
+
+		// Handle the infinite valid lifetime separately as we do not
+		// keep a timer in this case.
+		if vl >= header.NDPInfiniteLifetime {
+			if addrState.invalidationTimer != nil {
+				// Valid lifetime was finite before, but now it
+				// is valid forever.
+				if !addrState.invalidationTimer.Stop() {
+					*addrState.doNotInvalidate = true
+				}
+				addrState.invalidationTimer = nil
+				addrState.validUntil = time.Time{}
+				ndp.autoGenAddresses[addr] = addrState
+			}
+
+			return
+		}
+
+		var effectiveVl time.Duration
+		var rl time.Duration
+
+		// If the address was originally set to be valid forever,
+		// assume the remaining time to be the maximum possible value.
+		if addrState.invalidationTimer == nil {
+			rl = header.NDPInfiniteLifetime
+		} else {
+			rl = time.Until(addrState.validUntil)
+		}
+
+		if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
+			effectiveVl = vl
+		} else if rl <= MinPrefixInformationValidLifetimeForUpdate {
+			ndp.autoGenAddresses[addr] = addrState
+			return
+		} else {
+			effectiveVl = MinPrefixInformationValidLifetimeForUpdate
+		}
+
+		if addrState.invalidationTimer == nil {
+			addrState.invalidationTimer = ndp.autoGenAddrInvalidationTimer(addr, effectiveVl, addrState.doNotInvalidate)
+		} else {
+			if !addrState.invalidationTimer.Stop() {
+				*addrState.doNotInvalidate = true
+			}
+			addrState.invalidationTimer.Reset(effectiveVl)
+		}
+
+		addrState.validUntil = time.Now().Add(effectiveVl)
+		ndp.autoGenAddresses[addr] = addrState
+		return
+	}
+
+	// We do not already have an address within the prefix, prefix. Do the
+	// work as outlined by RFC 4862 section 5.5.3.d if n is configured
+	// to auto-generated global addresses by SLAAC.
+
+	// Are we configured to auto-generate new global addresses?
+	if !ndp.configs.AutoGenGlobalAddresses {
+		return
+	}
+
+	// If we do not already have an address for this prefix and the valid
+	// lifetime is 0, no need to do anything further, as per RFC 4862
+	// section 5.5.3.d.
+	if vl == 0 {
+		return
+	}
+
+	// Make sure the prefix is valid (as far as its length is concerned) to
+	// generate a valid IPv6 address from an interface identifier (IID), as
+	// per RFC 4862 sectiion 5.5.3.d.
+	if prefix.Prefix() != validPrefixLenForAutoGen {
+		return
+	}
+
+	// Only attempt to generate an interface-specific IID if we have a valid
+	// link address.
+	//
+	// TODO(b/141011931): Validate a LinkEndpoint's link address
+	// (provided by LinkEndpoint.LinkAddress) before reaching this
+	// point.
+	linkAddr := ndp.nic.linkEP.LinkAddress()
+	if !header.IsValidUnicastEthernetAddress(linkAddr) {
+		return
+	}
+
+	// Generate an address within prefix from the modified EUI-64 of ndp's
+	// NIC's Ethernet MAC address.
+	addrBytes := make([]byte, header.IPv6AddressSize)
+	copy(addrBytes[:header.IIDOffsetInIPv6Address], prefix.ID()[:header.IIDOffsetInIPv6Address])
+	header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+	addr := tcpip.Address(addrBytes)
+	addrWithPrefix := tcpip.AddressWithPrefix{
+		Address:   addr,
+		PrefixLen: validPrefixLenForAutoGen,
+	}
+
+	// If the nic already has this address, do nothing further.
+	if ndp.nic.hasPermanentAddrLocked(addr) {
+		return
+	}
+
+	// Inform the integrator that we have a new SLAAC address.
+	if ndp.nic.stack.ndpDisp == nil {
+		return
+	}
+	if !ndp.nic.stack.ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addrWithPrefix) {
+		// Informed by the integrator not to add the address.
+		return
+	}
+
+	if _, err := ndp.nic.addAddressLocked(tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: addrWithPrefix,
+	}, FirstPrimaryEndpoint, permanent, slaac); err != nil {
+		panic(err)
+	}
+
+	// Setup the timers to deprecate and invalidate this newly generated
+	// address.
+
+	// TODO(b/143713887): Handle deprecating auto-generated addresses
+	//                    after the preferred lifetime.
+
+	var doNotInvalidate bool
+	var vTimer *time.Timer
+	if vl < header.NDPInfiniteLifetime {
+		vTimer = ndp.autoGenAddrInvalidationTimer(addr, vl, &doNotInvalidate)
+	}
+
+	ndp.autoGenAddresses[addr] = autoGenAddressState{
+		invalidationTimer: vTimer,
+		doNotInvalidate:   &doNotInvalidate,
+		validUntil:        time.Now().Add(vl),
+	}
+}
+
+// invalidateAutoGenAddress invalidates an auto-generated address.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateAutoGenAddress(addr tcpip.Address) {
+	if !ndp.cleanupAutoGenAddrResourcesAndNotify(addr) {
+		return
+	}
+
+	ndp.nic.removePermanentAddressLocked(addr)
+}
+
+// cleanupAutoGenAddrResourcesAndNotify cleans up an invalidated auto-generated
+// address's resources from ndp. If the stack has an NDP dispatcher, it will
+// be notified that addr has been invalidated.
+//
+// Returns true if ndp had resources for addr to cleanup.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bool {
+	state, ok := ndp.autoGenAddresses[addr]
+
+	if !ok {
+		return false
+	}
+
+	if state.invalidationTimer != nil {
+		state.invalidationTimer.Stop()
+		state.invalidationTimer = nil
+		*state.doNotInvalidate = true
+	}
+
+	state.doNotInvalidate = nil
+
+	delete(ndp.autoGenAddresses, addr)
+
+	if ndp.nic.stack.ndpDisp != nil {
+		ndp.nic.stack.ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{
+			Address:   addr,
+			PrefixLen: validPrefixLenForAutoGen,
+		})
+	}
+
+	return true
+}
+
+// autoGenAddrInvalidationTimer returns a new invalidation timer for an
+// auto-generated address that fires after vl.
+//
+// doNotInvalidate is used to inform the timer when it fires at the same time
+// that an auto-generated address's valid lifetime gets refreshed. See
+// autoGenAddrState.doNotInvalidate for more details.
+func (ndp *ndpState) autoGenAddrInvalidationTimer(addr tcpip.Address, vl time.Duration, doNotInvalidate *bool) *time.Timer {
+	return time.AfterFunc(vl, func() {
+		ndp.nic.mu.Lock()
+		defer ndp.nic.mu.Unlock()
+
+		if *doNotInvalidate {
+			*doNotInvalidate = false
+			return
+		}
+
+		ndp.invalidateAutoGenAddress(addr)
+	})
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 5b901f947..d8e7ce67e 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -38,7 +38,7 @@ const (
 	linkAddr1      = "\x02\x02\x03\x04\x05\x06"
 	linkAddr2      = "\x02\x02\x03\x04\x05\x07"
 	linkAddr3      = "\x02\x02\x03\x04\x05\x08"
-	defaultTimeout = 250 * time.Millisecond
+	defaultTimeout = 100 * time.Millisecond
 )
 
 var (
@@ -47,6 +47,31 @@ var (
 	llAddr3 = header.LinkLocalAddr(linkAddr3)
 )
 
+// prefixSubnetAddr returns a prefix (Address + Length), the prefix's equivalent
+// tcpip.Subnet, and an address where the lower half of the address is composed
+// of the EUI-64 of linkAddr if it is a valid unicast ethernet address.
+func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWithPrefix, tcpip.Subnet, tcpip.AddressWithPrefix) {
+	prefixBytes := []byte{1, 2, 3, 4, 5, 6, 7, 8 + offset, 0, 0, 0, 0, 0, 0, 0, 0}
+	prefix := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(prefixBytes),
+		PrefixLen: 64,
+	}
+
+	subnet := prefix.Subnet()
+
+	var addr tcpip.AddressWithPrefix
+	if header.IsValidUnicastEthernetAddress(linkAddr) {
+		addrBytes := []byte(subnet.ID())
+		header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+		addr = tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(addrBytes),
+			PrefixLen: 64,
+		}
+	}
+
+	return prefix, subnet, addr
+}
+
 // TestDADDisabled tests that an address successfully resolves immediately
 // when DAD is not enabled (the default for an empty stack.Options).
 func TestDADDisabled(t *testing.T) {
@@ -103,6 +128,29 @@ type ndpPrefixEvent struct {
 	discovered bool
 }
 
+type ndpAutoGenAddrEventType int
+
+const (
+	newAddr ndpAutoGenAddrEventType = iota
+	invalidatedAddr
+)
+
+type ndpAutoGenAddrEvent struct {
+	nicID     tcpip.NICID
+	addr      tcpip.AddressWithPrefix
+	eventType ndpAutoGenAddrEventType
+}
+
+type ndpRDNSS struct {
+	addrs    []tcpip.Address
+	lifetime time.Duration
+}
+
+type ndpRDNSSEvent struct {
+	nicID tcpip.NICID
+	rdnss ndpRDNSS
+}
+
 var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
 
 // ndpDispatcher implements NDPDispatcher so tests can know when various NDP
@@ -113,6 +161,8 @@ type ndpDispatcher struct {
 	rememberRouter bool
 	prefixC        chan ndpPrefixEvent
 	rememberPrefix bool
+	autoGenAddrC   chan ndpAutoGenAddrEvent
+	rdnssC         chan ndpRDNSSEvent
 	routeTable     []tcpip.Route
 }
 
@@ -211,7 +261,7 @@ func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpi
 		}
 	}
 
-	rt := make([]tcpip.Route, 0)
+	var rt []tcpip.Route
 	exclude := tcpip.Route{
 		Destination: prefix,
 		NIC:         nicID,
@@ -226,6 +276,40 @@ func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpi
 	return rt
 }
 
+func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) bool {
+	if n.autoGenAddrC != nil {
+		n.autoGenAddrC <- ndpAutoGenAddrEvent{
+			nicID,
+			addr,
+			newAddr,
+		}
+	}
+	return true
+}
+
+func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
+	if n.autoGenAddrC != nil {
+		n.autoGenAddrC <- ndpAutoGenAddrEvent{
+			nicID,
+			addr,
+			invalidatedAddr,
+		}
+	}
+}
+
+// Implements stack.NDPDispatcher.OnRecursiveDNSServerOption.
+func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) {
+	if n.rdnssC != nil {
+		n.rdnssC <- ndpRDNSSEvent{
+			nicID,
+			ndpRDNSS{
+				addrs,
+				lifetime,
+			},
+		}
+	}
+}
+
 // TestDADResolve tests that an address successfully resolves after performing
 // DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
 // Included in the subtests is a test to make sure that an invalid
@@ -247,6 +331,8 @@ func TestDADResolve(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				dadC: make(chan ndpDADEvent),
 			}
@@ -781,16 +867,33 @@ func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
 //
 // Note, raBufWithPI does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink bool, vl uint32) tcpip.PacketBuffer {
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) tcpip.PacketBuffer {
 	flags := uint8(0)
 	if onLink {
-		flags |= 128
+		// The OnLink flag is the 7th bit in the flags byte.
+		flags |= 1 << 7
+	}
+	if auto {
+		// The Address Auto-Configuration flag is the 6th bit in the
+		// flags byte.
+		flags |= 1 << 6
 	}
 
+	// A valid header.NDPPrefixInformation must be 30 bytes.
 	buf := [30]byte{}
+	// The first byte in a header.NDPPrefixInformation is the Prefix Length
+	// field.
 	buf[0] = uint8(prefix.PrefixLen)
+	// The 2nd byte within a header.NDPPrefixInformation is the Flags field.
 	buf[1] = flags
+	// The Valid Lifetime field starts after the 2nd byte within a
+	// header.NDPPrefixInformation.
 	binary.BigEndian.PutUint32(buf[2:], vl)
+	// The Preferred Lifetime field starts after the 6th byte within a
+	// header.NDPPrefixInformation.
+	binary.BigEndian.PutUint32(buf[6:], pl)
+	// The Prefix Address field starts after the 14th byte within a
+	// header.NDPPrefixInformation.
 	copy(buf[14:], prefix.Address)
 	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{
 		header.NDPPrefixInformation(buf[:]),
@@ -800,6 +903,8 @@ func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, on
 // TestNoRouterDiscovery tests that router discovery will not be performed if
 // configured not to.
 func TestNoRouterDiscovery(t *testing.T) {
+	t.Parallel()
+
 	// Being configured to discover routers means handle and
 	// discover are set to true and forwarding is set to false.
 	// This tests all possible combinations of the configurations,
@@ -812,6 +917,8 @@ func TestNoRouterDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				routerC: make(chan ndpRouterEvent, 10),
 			}
@@ -844,6 +951,8 @@ func TestNoRouterDiscovery(t *testing.T) {
 // TestRouterDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered router when the dispatcher asks it not to.
 func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		routerC: make(chan ndpRouterEvent, 10),
 	}
@@ -909,6 +1018,8 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
 }
 
 func TestRouterDiscovery(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 10),
 		rememberRouter: true,
@@ -1040,6 +1151,8 @@ func TestRouterDiscovery(t *testing.T) {
 // TestRouterDiscoveryMaxRouters tests that only
 // stack.MaxDiscoveredDefaultRouters discovered routers are remembered.
 func TestRouterDiscoveryMaxRouters(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		routerC:        make(chan ndpRouterEvent, 10),
 		rememberRouter: true,
@@ -1104,6 +1217,8 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
 // TestNoPrefixDiscovery tests that prefix discovery will not be performed if
 // configured not to.
 func TestNoPrefixDiscovery(t *testing.T) {
+	t.Parallel()
+
 	prefix := tcpip.AddressWithPrefix{
 		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
 		PrefixLen: 64,
@@ -1121,6 +1236,8 @@ func TestNoPrefixDiscovery(t *testing.T) {
 		forwarding := i&4 == 0
 
 		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			t.Parallel()
+
 			ndpDisp := ndpDispatcher{
 				prefixC: make(chan ndpPrefixEvent, 10),
 			}
@@ -1140,7 +1257,7 @@ func TestNoPrefixDiscovery(t *testing.T) {
 			}
 
 			// Rx an RA with prefix with non-zero lifetime.
-			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 10))
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 10, 0))
 
 			select {
 			case <-ndpDisp.prefixC:
@@ -1154,11 +1271,9 @@ func TestNoPrefixDiscovery(t *testing.T) {
 // TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not
 // remember a discovered on-link prefix when the dispatcher asks it not to.
 func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
-	prefix := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
-		PrefixLen: 64,
-	}
-	subnet := prefix.Subnet()
+	t.Parallel()
+
+	prefix, subnet, _ := prefixSubnetAddr(0, "")
 
 	ndpDisp := ndpDispatcher{
 		prefixC: make(chan ndpPrefixEvent, 10),
@@ -1189,7 +1304,7 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 
 	// Rx an RA with prefix with a short lifetime.
 	const lifetime = 1
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, lifetime))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetime, 0))
 	select {
 	case r := <-ndpDisp.prefixC:
 		if r.nicID != 1 {
@@ -1226,21 +1341,11 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
 }
 
 func TestPrefixDiscovery(t *testing.T) {
-	prefix1 := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
-		PrefixLen: 64,
-	}
-	prefix2 := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x00\x00\x00\x00\x00\x00\x00\x00"),
-		PrefixLen: 64,
-	}
-	prefix3 := tcpip.AddressWithPrefix{
-		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x0a\x00\x00\x00\x00\x00\x00\x00"),
-		PrefixLen: 72,
-	}
-	subnet1 := prefix1.Subnet()
-	subnet2 := prefix2.Subnet()
-	subnet3 := prefix3.Subnet()
+	t.Parallel()
+
+	prefix1, subnet1, _ := prefixSubnetAddr(0, "")
+	prefix2, subnet2, _ := prefixSubnetAddr(1, "")
+	prefix3, subnet3, _ := prefixSubnetAddr(2, "")
 
 	ndpDisp := ndpDispatcher{
 		prefixC:        make(chan ndpPrefixEvent, 10),
@@ -1281,7 +1386,7 @@ func TestPrefixDiscovery(t *testing.T) {
 
 	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
 	// with zero valid lifetime.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly discovered a prefix with 0 lifetime")
@@ -1290,7 +1395,7 @@ func TestPrefixDiscovery(t *testing.T) {
 
 	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
 	// with non-zero lifetime.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 100))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 100, 0))
 	waitForEvent(subnet1, true, defaultTimeout)
 
 	// Should have added a device route for subnet1 through the nic.
@@ -1299,7 +1404,7 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 
 	// Receive an RA with prefix2 in a PI.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, 100))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, 100, 0))
 	waitForEvent(subnet2, true, defaultTimeout)
 
 	// Should have added a device route for subnet2 through the nic.
@@ -1308,7 +1413,7 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 
 	// Receive an RA with prefix3 in a PI.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 100))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 100, 0))
 	waitForEvent(subnet3, true, defaultTimeout)
 
 	// Should have added a device route for subnet3 through the nic.
@@ -1317,7 +1422,7 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 
 	// Receive an RA with prefix1 in a PI with lifetime = 0.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
 	waitForEvent(subnet1, false, defaultTimeout)
 
 	// Should have removed the device route for subnet1 through the nic.
@@ -1327,7 +1432,7 @@ func TestPrefixDiscovery(t *testing.T) {
 
 	// Receive an RA with prefix2 in a PI with lesser lifetime.
 	lifetime := uint32(2)
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, lifetime))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, lifetime, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly received prefix event when updating lifetime")
@@ -1349,7 +1454,7 @@ func TestPrefixDiscovery(t *testing.T) {
 	}
 
 	// Receive RA to invalidate prefix3.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 0))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 0, 0))
 	waitForEvent(subnet3, false, defaultTimeout)
 
 	// Should not have any routes.
@@ -1364,10 +1469,10 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 	// invalidate the prefix.
 	const testInfiniteLifetimeSeconds = 2
 	const testInfiniteLifetime = testInfiniteLifetimeSeconds * time.Second
-	saved := header.NDPPrefixInformationInfiniteLifetime
-	header.NDPPrefixInformationInfiniteLifetime = testInfiniteLifetime
+	saved := header.NDPInfiniteLifetime
+	header.NDPInfiniteLifetime = testInfiniteLifetime
 	defer func() {
-		header.NDPPrefixInformationInfiniteLifetime = saved
+		header.NDPInfiniteLifetime = saved
 	}()
 
 	prefix := tcpip.AddressWithPrefix{
@@ -1415,7 +1520,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 
 	// Receive an RA with prefix in an NDP Prefix Information option (PI)
 	// with infinite valid lifetime which should not get invalidated.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
 	waitForEvent(true, defaultTimeout)
 	select {
 	case <-ndpDisp.prefixC:
@@ -1425,16 +1530,16 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 
 	// Receive an RA with finite lifetime.
 	// The prefix should get invalidated after 1s.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
 	waitForEvent(false, testInfiniteLifetime)
 
 	// Receive an RA with finite lifetime.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
 	waitForEvent(true, defaultTimeout)
 
 	// Receive an RA with prefix with an infinite lifetime.
 	// The prefix should not be invalidated.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
@@ -1443,7 +1548,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 
 	// Receive an RA with a prefix with a lifetime value greater than the
 	// set infinite lifetime value.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds+1))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds+1, 0))
 	select {
 	case <-ndpDisp.prefixC:
 		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
@@ -1452,13 +1557,15 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
 
 	// Receive an RA with 0 lifetime.
 	// The prefix should get invalidated.
-	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 0))
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 0, 0))
 	waitForEvent(false, defaultTimeout)
 }
 
 // TestPrefixDiscoveryMaxRouters tests that only
 // stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
 func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
+	t.Parallel()
+
 	ndpDisp := ndpDispatcher{
 		prefixC:        make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
 		rememberPrefix: true,
@@ -1537,3 +1644,606 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
 		t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
 	}
 }
+
+// Checks to see if list contains an IPv6 address, item.
+func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: item,
+	}
+
+	for _, i := range list {
+		if i == protocolAddress {
+			return true
+		}
+	}
+
+	return false
+}
+
+// TestNoAutoGenAddr tests that SLAAC is not performed when configured not to.
+func TestNoAutoGenAddr(t *testing.T) {
+	t.Parallel()
+
+	prefix, _, _ := prefixSubnetAddr(0, "")
+
+	// Being configured to auto-generate addresses means handle and
+	// autogen are set to true and forwarding is set to false.
+	// This tests all possible combinations of the configurations,
+	// except for the configuration where handle = true, autogen =
+	// true and forwarding = false (the required configuration to do
+	// SLAAC) - that will done in other tests.
+	for i := 0; i < 7; i++ {
+		handle := i&1 != 0
+		autogen := i&2 != 0
+		forwarding := i&4 == 0
+
+		t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+			}
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              handle,
+					AutoGenGlobalAddresses: autogen,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			s.SetForwarding(forwarding)
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Rx an RA with prefix with non-zero lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, false, true, 10, 0))
+
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly auto-generated an address when configured not to")
+			case <-time.After(defaultTimeout):
+			}
+		})
+	}
+}
+
+// TestAutoGenAddr tests that an address is properly generated and invalidated
+// when configured to do so.
+func TestAutoGenAddr(t *testing.T) {
+	const newMinVL = 2
+	newMinVLDuration := newMinVL * time.Second
+	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	waitForEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case r := <-ndpDisp.autoGenAddrC:
+			if r.nicID != 1 {
+				t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+			}
+			if r.addr != addr {
+				t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+			}
+			if r.eventType != eventType {
+				t.Fatalf("got r.eventType = %v, want = %v", r.eventType, eventType)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timeout waiting for addr auto gen event")
+		}
+	}
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with zero valid lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address with 0 lifetime")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with non-zero lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+	waitForEvent(addr1, newAddr, defaultTimeout)
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+
+	// Receive an RA with prefix2 in an NDP Prefix Information option (PI)
+	// with preferred lifetime > valid lifetime
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address with preferred lifetime > valid lifetime")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Receive an RA with prefix2 in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	waitForEvent(addr2, newAddr, defaultTimeout)
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+		t.Fatalf("Should have %s in the list of addresses", addr2)
+	}
+
+	// Refresh valid lifetime for addr of prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address when we already have an address for a prefix")
+	case <-time.After(defaultTimeout):
+	}
+
+	// Wait for addr of prefix1 to be invalidated.
+	waitForEvent(addr1, invalidatedAddr, newMinVLDuration+defaultTimeout)
+	if contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should not have %s in the list of addresses", addr1)
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+		t.Fatalf("Should have %s in the list of addresses", addr2)
+	}
+}
+
+// TestAutoGenAddrValidLifetimeUpdates tests that the valid lifetime of an
+// auto-generated address only gets updated when required to, as specified in
+// RFC 4862 section 5.5.3.e.
+func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
+	const infiniteVL = 4294967295
+	const newMinVL = 5
+	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	tests := []struct {
+		name string
+		ovl  uint32
+		nvl  uint32
+		evl  uint32
+	}{
+		// Should update the VL to the minimum VL for updating if the
+		// new VL is less than newMinVL but was originally greater than
+		// it.
+		{
+			"LargeVLToVLLessThanMinVLForUpdate",
+			9999,
+			1,
+			newMinVL,
+		},
+		{
+			"LargeVLTo0",
+			9999,
+			0,
+			newMinVL,
+		},
+		{
+			"InfiniteVLToVLLessThanMinVLForUpdate",
+			infiniteVL,
+			1,
+			newMinVL,
+		},
+		{
+			"InfiniteVLTo0",
+			infiniteVL,
+			0,
+			newMinVL,
+		},
+
+		// Should not update VL if original VL was less than newMinVL
+		// and the new VL is also less than newMinVL.
+		{
+			"ShouldNotUpdateWhenBothOldAndNewAreLessThanMinVLForUpdate",
+			newMinVL - 1,
+			newMinVL - 3,
+			newMinVL - 1,
+		},
+
+		// Should take the new VL if the new VL is greater than the
+		// remaining time or is greater than newMinVL.
+		{
+			"MorethanMinVLToLesserButStillMoreThanMinVLForUpdate",
+			newMinVL + 5,
+			newMinVL + 3,
+			newMinVL + 3,
+		},
+		{
+			"SmallVLToGreaterVLButStillLessThanMinVLForUpdate",
+			newMinVL - 3,
+			newMinVL - 1,
+			newMinVL - 1,
+		},
+		{
+			"SmallVLToGreaterVLThatIsMoreThaMinVLForUpdate",
+			newMinVL - 3,
+			newMinVL + 1,
+			newMinVL + 1,
+		},
+	}
+
+	const delta = 500 * time.Millisecond
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+			}
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              true,
+					AutoGenGlobalAddresses: true,
+				},
+				NDPDisp: &ndpDisp,
+			})
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Receive an RA with prefix with initial VL, test.ovl.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.ovl, 0))
+			select {
+			case r := <-ndpDisp.autoGenAddrC:
+				if r.nicID != 1 {
+					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+				}
+				if r.addr != addr {
+					t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+				}
+				if r.eventType != newAddr {
+					t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+				}
+			case <-time.After(defaultTimeout):
+				t.Fatal("timeout waiting for addr auto gen event")
+			}
+
+			// Receive an new RA with prefix with new VL, test.nvl.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.nvl, 0))
+
+			//
+			// Validate that the VL for the address got set to
+			// test.evl.
+			//
+
+			// Make sure we do not get any invalidation events
+			// until atleast 500ms (delta) before test.evl.
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatalf("unexpectedly received an auto gen addr event")
+			case <-time.After(time.Duration(test.evl)*time.Second - delta):
+			}
+
+			// Wait for another second (2x delta), but now we expect
+			// the invalidation event.
+			select {
+			case r := <-ndpDisp.autoGenAddrC:
+				if r.nicID != 1 {
+					t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+				}
+				if r.addr != addr {
+					t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+				}
+				if r.eventType != invalidatedAddr {
+					t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+				}
+			case <-time.After(2 * delta):
+				t.Fatal("timeout waiting for addr auto gen event")
+			}
+		})
+	}
+}
+
+// TestAutoGenAddrRemoval tests that when auto-generated addresses are removed
+// by the user, its resources will be cleaned up and an invalidation event will
+// be sent to the integrator.
+func TestAutoGenAddrRemoval(t *testing.T) {
+	t.Parallel()
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Receive an RA with prefix with its valid lifetime = lifetime.
+	const lifetime = 5
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetime, 0))
+	select {
+	case r := <-ndpDisp.autoGenAddrC:
+		if r.nicID != 1 {
+			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+		}
+		if r.addr != addr {
+			t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+		}
+		if r.eventType != newAddr {
+			t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+		}
+	case <-time.After(defaultTimeout):
+		t.Fatal("timeout waiting for addr auto gen event")
+	}
+
+	// Remove the address.
+	if err := s.RemoveAddress(1, addr.Address); err != nil {
+		t.Fatalf("RemoveAddress(_, %s) = %s", addr.Address, err)
+	}
+
+	// Should get the invalidation event immediately.
+	select {
+	case r := <-ndpDisp.autoGenAddrC:
+		if r.nicID != 1 {
+			t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+		}
+		if r.addr != addr {
+			t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+		}
+		if r.eventType != invalidatedAddr {
+			t.Fatalf("got r.eventType = %v, want = %v", r.eventType, newAddr)
+		}
+	case <-time.After(defaultTimeout):
+		t.Fatal("timeout waiting for addr auto gen event")
+	}
+
+	// Wait for the original valid lifetime to make sure the original timer
+	// got stopped/cleaned up.
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatalf("unexpectedly received an auto gen addr event")
+	case <-time.After(lifetime*time.Second + defaultTimeout):
+	}
+}
+
+// TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
+// is already assigned to the NIC, the static address remains.
+func TestAutoGenAddrStaticConflict(t *testing.T) {
+	t.Parallel()
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+	}
+	e := channel.New(10, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Add the address as a static address before SLAAC tries to add it.
+	if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil {
+		t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr.Address, err)
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+
+	// Receive a PI where the generated address will be the same as the one
+	// that we already have assigned statically.
+	const lifetime = 5
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetime, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically")
+	case <-time.After(defaultTimeout):
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+
+	// Should not get an invalidation event after the PI's invalidation
+	// time.
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly received an auto gen addr event")
+	case <-time.After(lifetime*time.Second + defaultTimeout):
+	}
+	if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+}
+
+// TestNDPRecursiveDNSServerDispatch tests that we properly dispatch an event
+// to the integrator when an RA is received with the NDP Recursive DNS Server
+// option with at least one valid address.
+func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name     string
+		opt      header.NDPRecursiveDNSServer
+		expected *ndpRDNSS
+	}{
+		{
+			"Unspecified",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			}),
+			nil,
+		},
+		{
+			"Multicast",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+			}),
+			nil,
+		},
+		{
+			"OptionTooSmall",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8,
+			}),
+			nil,
+		},
+		{
+			"0Addresses",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+			}),
+			nil,
+		},
+		{
+			"Valid1Address",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+			}),
+			&ndpRDNSS{
+				[]tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+				},
+				2 * time.Second,
+			},
+		},
+		{
+			"Valid2Addresses",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2,
+			}),
+			&ndpRDNSS{
+				[]tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02",
+				},
+				time.Second,
+			},
+		},
+		{
+			"Valid3Addresses",
+			header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 0,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 3,
+			}),
+			&ndpRDNSS{
+				[]tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x03",
+				},
+				0,
+			},
+		},
+	}
+
+	for _, test := range tests {
+		test := test
+
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				// We do not expect more than a single RDNSS
+				// event at any time for this test.
+				rdnssC: make(chan ndpRDNSSEvent, 1),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs: true,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, header.NDPOptionsSerializer{test.opt}))
+
+			if test.expected != nil {
+				select {
+				case e := <-ndpDisp.rdnssC:
+					if e.nicID != 1 {
+						t.Errorf("got rdnss nicID = %d, want = 1", e.nicID)
+					}
+					if diff := cmp.Diff(e.rdnss.addrs, test.expected.addrs); diff != "" {
+						t.Errorf("rdnss addrs mismatch (-want +got):\n%s", diff)
+					}
+					if e.rdnss.lifetime != test.expected.lifetime {
+						t.Errorf("got rdnss lifetime = %s, want = %s", e.rdnss.lifetime, test.expected.lifetime)
+					}
+				default:
+					t.Fatal("expected an RDNSS option event")
+				}
+			}
+
+			// Should have no more RDNSS options.
+			select {
+			case e := <-ndpDisp.rdnssC:
+				t.Fatalf("unexpectedly got a new RDNSS option event: %+v", e)
+			default:
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 3f8d7312c..e8401c673 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -115,10 +115,11 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
 			},
 		},
 		ndp: ndpState{
-			configs:        stack.ndpConfigs,
-			dad:            make(map[tcpip.Address]dadState),
-			defaultRouters: make(map[tcpip.Address]defaultRouterState),
-			onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
+			configs:          stack.ndpConfigs,
+			dad:              make(map[tcpip.Address]dadState),
+			defaultRouters:   make(map[tcpip.Address]defaultRouterState),
+			onLinkPrefixes:   make(map[tcpip.Subnet]onLinkPrefixState),
+			autoGenAddresses: make(map[tcpip.Address]autoGenAddressState),
 		},
 	}
 	nic.ndp.nic = nic
@@ -244,6 +245,20 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedN
 	return nil
 }
 
+// hasPermanentAddrLocked returns true if n has a permanent (including currently
+// tentative) address, addr.
+func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool {
+	ref, ok := n.endpoints[NetworkEndpointID{addr}]
+
+	if !ok {
+		return false
+	}
+
+	kind := ref.getKind()
+
+	return kind == permanent || kind == permanentTentative
+}
+
 func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint {
 	return n.getRefOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, n.promiscuous)
 }
@@ -335,7 +350,7 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 			Address:   address,
 			PrefixLen: netProto.DefaultPrefixLen(),
 		},
-	}, peb, temporary)
+	}, peb, temporary, static)
 
 	n.mu.Unlock()
 	return ref
@@ -384,10 +399,10 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
 		}
 	}
 
-	return n.addAddressLocked(protocolAddress, peb, permanent)
+	return n.addAddressLocked(protocolAddress, peb, permanent, static)
 }
 
-func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind) (*referencedNetworkEndpoint, *tcpip.Error) {
+func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType) (*referencedNetworkEndpoint, *tcpip.Error) {
 	// TODO(b/141022673): Validate IP address before adding them.
 
 	// Sanity check.
@@ -417,11 +432,12 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
 	}
 
 	ref := &referencedNetworkEndpoint{
-		refs:     1,
-		ep:       ep,
-		nic:      n,
-		protocol: protocolAddress.Protocol,
-		kind:     kind,
+		refs:       1,
+		ep:         ep,
+		nic:        n,
+		protocol:   protocolAddress.Protocol,
+		kind:       kind,
+		configType: configType,
 	}
 
 	// Set up cache if link address resolution exists for this protocol.
@@ -624,9 +640,18 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
 
 	isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr)
 
-	// If we are removing a tentative IPv6 unicast address, stop DAD.
-	if isIPv6Unicast && kind == permanentTentative {
-		n.ndp.stopDuplicateAddressDetection(addr)
+	if isIPv6Unicast {
+		// If we are removing a tentative IPv6 unicast address, stop
+		// DAD.
+		if kind == permanentTentative {
+			n.ndp.stopDuplicateAddressDetection(addr)
+		}
+
+		// If we are removing an address generated via SLAAC, cleanup
+		// its SLAAC resources and notify the integrator.
+		if r.configType == slaac {
+			n.ndp.cleanupAutoGenAddrResourcesAndNotify(addr)
+		}
 	}
 
 	r.setKind(permanentExpired)
@@ -989,7 +1014,7 @@ const (
 	// removing the permanent address from the NIC.
 	permanent
 
-	// An expired permanent endoint is a permanent endoint that had its address
+	// An expired permanent endpoint is a permanent endpoint that had its address
 	// removed from the NIC, and it is waiting to be removed once no more routes
 	// hold a reference to it. This is achieved by decreasing its reference count
 	// by 1. If its address is re-added before the endpoint is removed, its type
@@ -1035,6 +1060,19 @@ func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep
 	}
 }
 
+type networkEndpointConfigType int32
+
+const (
+	// A statically configured endpoint is an address that was added by
+	// some user-specified action (adding an explicit address, joining a
+	// multicast group).
+	static networkEndpointConfigType = iota
+
+	// A slaac configured endpoint is an IPv6 endpoint that was
+	// added by SLAAC as per RFC 4862 section 5.5.3.
+	slaac
+)
+
 type referencedNetworkEndpoint struct {
 	ep       NetworkEndpoint
 	nic      *NIC
@@ -1050,6 +1088,10 @@ type referencedNetworkEndpoint struct {
 
 	// networkEndpointKind must only be accessed using {get,set}Kind().
 	kind networkEndpointKind
+
+	// configType is the method that was used to configure this endpoint.
+	// This must never change after the endpoint is added to a NIC.
+	configType networkEndpointConfigType
 }
 
 func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 5746043cc..f62fd729f 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -576,6 +576,11 @@ type KeepaliveIntervalOption time.Duration
 // closed.
 type KeepaliveCountOption int
 
+// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a user
+// specified timeout for a given TCP connection.
+// See: RFC5482 for details.
+type TCPUserTimeoutOption time.Duration
+
 // CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
 // the current congestion control algorithm.
 type CongestionControlOption string
@@ -924,6 +929,14 @@ type TCPStats struct {
 	// ESTABLISHED state or the CLOSE-WAIT state.
 	EstablishedResets *StatCounter
 
+	// EstablishedClosed is the number of times established TCP connections
+	// made a transition to CLOSED state.
+	EstablishedClosed *StatCounter
+
+	// EstablishedTimedout is the number of times an established connection
+	// was reset because of keep-alive time out.
+	EstablishedTimedout *StatCounter
+
 	// ListenOverflowSynDrop is the number of times the listen queue overflowed
 	// and a SYN was dropped.
 	ListenOverflowSynDrop *StatCounter
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index dd1728f9c..3b353d56c 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -28,6 +28,7 @@ go_library(
         "forwarder.go",
         "protocol.go",
         "rcv.go",
+        "rcv_state.go",
         "reno.go",
         "sack.go",
         "sack_scoreboard.go",
@@ -52,6 +53,7 @@ go_library(
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
+        "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index f543a6105..5422ae80c 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -242,6 +242,13 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 	n.initGSO()
 
+	// Now inherit any socket options that should be inherited from the
+	// listening endpoint.
+	// In case of Forwarder listenEP will be nil and hence this check.
+	if l.listenEP != nil {
+		l.listenEP.propagateInheritableOptions(n)
+	}
+
 	// Register new endpoint so that packets are routed to it.
 	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
 		n.Close()
@@ -298,8 +305,6 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		return nil, err
 	}
 	ep.mu.Lock()
-	ep.stack.Stats().TCP.CurrentEstablished.Increment()
-	ep.state = StateEstablished
 	ep.isConnectNotified = true
 	ep.mu.Unlock()
 
@@ -352,6 +357,14 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
 	}
 }
 
+// propagateInheritableOptions propagates any options set on the listening
+// endpoint to the newly created endpoint.
+func (e *endpoint) propagateInheritableOptions(n *endpoint) {
+	e.mu.Lock()
+	n.userTimeout = e.userTimeout
+	e.mu.Unlock()
+}
+
 // handleSynSegment is called in its own goroutine once the listening endpoint
 // receives a SYN segment. It is responsible for completing the handshake and
 // queueing the new endpoint for acceptance.
@@ -546,6 +559,8 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		n.tsOffset = 0
 
 		// Switch state to connected.
+		// We do not use transitionToStateEstablishedLocked here as there is
+		// no handshake state available when doing a SYN cookie based accept.
 		n.stack.Stats().TCP.CurrentEstablished.Increment()
 		n.state = StateEstablished
 		n.isConnectNotified = true
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 16f8aea12..cdd69f360 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -218,6 +218,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// acceptable if the ack field acknowledges the SYN.
 	if s.flagIsSet(header.TCPFlagRst) {
 		if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
+			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
+			// was acceptable then signal the user "error: connection reset", drop
+			// the segment, enter CLOSED state, delete TCB, and return."
+			h.ep.mu.Lock()
+			h.ep.workerCleanup = true
+			h.ep.mu.Unlock()
+			// Although the RFC above calls out ECONNRESET, Linux actually returns
+			// ECONNREFUSED here so we do as well.
 			return tcpip.ErrConnectionRefused
 		}
 		return nil
@@ -252,6 +260,11 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// and the handshake is completed.
 	if s.flagIsSet(header.TCPFlagAck) {
 		h.state = handshakeCompleted
+
+		h.ep.mu.Lock()
+		h.ep.transitionToStateEstablishedLocked(h)
+		h.ep.mu.Unlock()
+
 		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
 		return nil
 	}
@@ -352,6 +365,10 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
 		}
 		h.state = handshakeCompleted
+		h.ep.mu.Lock()
+		h.ep.transitionToStateEstablishedLocked(h)
+		h.ep.mu.Unlock()
+
 		return nil
 	}
 
@@ -853,7 +870,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	}
 	e.state = StateError
 	e.HardError = err
-	if err != tcpip.ErrConnectionReset {
+	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
 		// The exact sequence number to be used for the RST is the same as the
 		// one used by Linux. We need to handle the case of window being shrunk
 		// which can cause sndNxt to be outside the acceptable window on the
@@ -880,6 +897,30 @@ func (e *endpoint) completeWorkerLocked() {
 	}
 }
 
+// transitionToStateEstablisedLocked transitions a given endpoint
+// to an established state using the handshake parameters provided.
+// It also initializes sender/receiver if required.
+func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
+	if e.snd == nil {
+		// Transfer handshake state to TCP connection. We disable
+		// receive window scaling if the peer doesn't support it
+		// (indicated by a negative send window scale).
+		e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
+	}
+	if e.rcv == nil {
+		rcvBufSize := seqnum.Size(e.receiveBufferSize())
+		e.rcvListMu.Lock()
+		e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
+		// Bootstrap the auto tuning algorithm. Starting at zero will
+		// result in a really large receive window after the first auto
+		// tuning adjustment.
+		e.rcvAutoParams.prevCopied = int(h.rcvWnd)
+		e.rcvListMu.Unlock()
+	}
+	h.ep.stack.Stats().TCP.CurrentEstablished.Increment()
+	e.state = StateEstablished
+}
+
 // transitionToStateCloseLocked ensures that the endpoint is
 // cleaned up from the transport demuxer, "before" moving to
 // StateClose. This will ensure that no packet will be
@@ -891,6 +932,7 @@ func (e *endpoint) transitionToStateCloseLocked() {
 	}
 	e.cleanupLocked()
 	e.state = StateClose
+	e.stack.Stats().TCP.EstablishedClosed.Increment()
 }
 
 // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
@@ -1053,14 +1095,27 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
 func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
+	e.mu.RLock()
+	userTimeout := e.userTimeout
+	e.mu.RUnlock()
+
 	e.keepalive.Lock()
 	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
 		e.keepalive.Unlock()
 		return nil
 	}
 
+	// If a userTimeout is set then abort the connection if it is
+	// exceeded.
+	if userTimeout != 0 && time.Since(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 {
+		e.keepalive.Unlock()
+		e.stack.Stats().TCP.EstablishedTimedout.Increment()
+		return tcpip.ErrTimeout
+	}
+
 	if e.keepalive.unacked >= e.keepalive.count {
 		e.keepalive.Unlock()
+		e.stack.Stats().TCP.EstablishedTimedout.Increment()
 		return tcpip.ErrTimeout
 	}
 
@@ -1077,7 +1132,6 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
 // whether it is enabled for this endpoint.
 func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	e.keepalive.Lock()
-	defer e.keepalive.Unlock()
 	if receivedData {
 		e.keepalive.unacked = 0
 	}
@@ -1085,6 +1139,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	// data to send.
 	if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
 		e.keepalive.timer.disable()
+		e.keepalive.Unlock()
 		return
 	}
 	if e.keepalive.unacked > 0 {
@@ -1092,6 +1147,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	} else {
 		e.keepalive.timer.enable(e.keepalive.idle)
 	}
+	e.keepalive.Unlock()
 }
 
 // disableKeepaliveTimer stops the keepalive timer.
@@ -1146,8 +1202,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			e.lastErrorMu.Unlock()
 
 			e.mu.Lock()
-			e.stack.Stats().TCP.EstablishedResets.Increment()
-			e.stack.Stats().TCP.CurrentEstablished.Decrement()
 			e.state = StateError
 			e.HardError = err
 
@@ -1156,25 +1210,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 			return err
 		}
-
-		// Transfer handshake state to TCP connection. We disable
-		// receive window scaling if the peer doesn't support it
-		// (indicated by a negative send window scale).
-		e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
-
-		rcvBufSize := seqnum.Size(e.receiveBufferSize())
-		e.rcvListMu.Lock()
-		e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
-		// boot strap the auto tuning algorithm. Starting at zero will
-		// result in a large step function on the first proper causing
-		// the window to just go to a really large value after the first
-		// RTT itself.
-		e.rcvAutoParams.prevCopied = initialRcvWnd
-		e.rcvListMu.Unlock()
-		e.stack.Stats().TCP.CurrentEstablished.Increment()
-		e.mu.Lock()
-		e.state = StateEstablished
-		e.mu.Unlock()
 	}
 
 	e.keepalive.timer.init(&e.keepalive.waker)
@@ -1225,6 +1260,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			w: &e.snd.resendWaker,
 			f: func() *tcpip.Error {
 				if !e.snd.retransmitTimerExpired() {
+					e.stack.Stats().TCP.EstablishedTimedout.Increment()
 					return tcpip.ErrTimeout
 				}
 				return nil
@@ -1375,7 +1411,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// Mark endpoint as closed.
 	e.mu.Lock()
 	if e.state != StateError {
-		e.stack.Stats().TCP.EstablishedResets.Increment()
 		e.stack.Stats().TCP.CurrentEstablished.Decrement()
 		e.transitionToStateCloseLocked()
 	}
@@ -1392,6 +1427,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		if s == nil {
 			break
 		}
+
 		e.tryDeliverSegmentFromClosedEndpoint(s)
 	}
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9d4a87e30..dd8b47cbe 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -30,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tmutex"
@@ -340,9 +341,11 @@ type endpoint struct {
 	// TCP should never broadcast but Linux nevertheless supports enabling/
 	// disabling SO_BROADCAST, albeit as a NOOP.
 	broadcast bool
+
 	// Values used to reserve a port or register a transport endpoint
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
+	boundPortFlags    ports.Flags
 
 	// effectiveNetProtos contains the network protocols actually in use. In
 	// most cases it will only contain "netProto", but in cases like IPv6
@@ -472,6 +475,12 @@ type endpoint struct {
 	// without hearing a response, the connection is closed.
 	keepalive keepalive
 
+	// userTimeout if non-zero specifies a user specified timeout for
+	// a connection w/ pending data to send. A connection that has pending
+	// unacked data will be forcibily aborted if the timeout is reached
+	// without any data being acked.
+	userTimeout time.Duration
+
 	// pendingAccepted is a synchronization primitive used to track number
 	// of connections that are queued up to be delivered to the accepted
 	// channel. We use this to ensure that all goroutines blocked on writing
@@ -737,9 +746,10 @@ func (e *endpoint) Close() {
 			e.isRegistered = false
 		}
 
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 		e.boundBindToDevice = 0
+		e.boundPortFlags = ports.Flags{}
 	}
 
 	// Mark endpoint as closed.
@@ -800,10 +810,11 @@ func (e *endpoint) cleanupLocked() {
 	}
 
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 	}
 	e.boundBindToDevice = 0
+	e.boundPortFlags = ports.Flags{}
 
 	e.route.Release()
 	e.stack.CompleteTransportEndpointCleanup(e)
@@ -1329,6 +1340,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 		return nil
 
+	case tcpip.TCPUserTimeoutOption:
+		e.mu.Lock()
+		e.userTimeout = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.BroadcastOption:
 		e.mu.Lock()
 		e.broadcast = v != 0
@@ -1587,6 +1604,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.keepalive.Unlock()
 		return nil
 
+	case *tcpip.TCPUserTimeoutOption:
+		e.mu.Lock()
+		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
+		e.mu.Unlock()
+		return nil
+
 	case *tcpip.OutOfBandInlineOption:
 		// We don't currently support disabling this option.
 		*o = 1
@@ -1775,7 +1798,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 			}
 			// reusePort is false below because connect cannot reuse a port even if
 			// reusePort was set.
-			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
+			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, ports.Flags{LoadBalanced: false}, e.bindToDevice) {
 				return false, nil
 			}
 
@@ -1802,7 +1825,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	// before Connect: in such a case we don't want to hold on to
 	// reservations anymore.
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 	}
 
@@ -2034,28 +2057,33 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 		}
 	}
 
-	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
+	flags := ports.Flags{
+		LoadBalanced: e.reusePort,
+	}
+	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, flags, e.bindToDevice)
 	if err != nil {
 		return err
 	}
 
 	e.boundBindToDevice = e.bindToDevice
+	e.boundPortFlags = flags
 	e.isPortReserved = true
 	e.effectiveNetProtos = netProtos
 	e.ID.LocalPort = port
 
 	// Any failures beyond this point must remove the port registration.
-	defer func(bindToDevice tcpip.NICID) {
+	defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) {
 		if err != nil {
-			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
+			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice)
 			e.isPortReserved = false
 			e.effectiveNetProtos = nil
 			e.ID.LocalPort = 0
 			e.ID.LocalAddress = ""
 			e.boundNICID = 0
 			e.boundBindToDevice = 0
+			e.boundPortFlags = ports.Flags{}
 		}
-	}(e.boundBindToDevice)
+	}(e.boundPortFlags, e.boundBindToDevice)
 
 	// If an address is specified, we must ensure that it's one of our
 	// local addresses.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 89b965c23..bc718064c 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -162,13 +162,26 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo
 func replyWithReset(s *segment) {
 	// Get the seqnum from the packet if the ack flag is set.
 	seq := seqnum.Value(0)
+	ack := seqnum.Value(0)
+	flags := byte(header.TCPFlagRst)
+	// As per RFC 793 page 35 (Reset Generation)
+	//   1.  If the connection does not exist (CLOSED) then a reset is sent
+	//   in response to any incoming segment except another reset.  In
+	//   particular, SYNs addressed to a non-existent connection are rejected
+	//   by this means.
+
+	//   If the incoming segment has an ACK field, the reset takes its
+	//   sequence number from the ACK field of the segment, otherwise the
+	//   reset has sequence number zero and the ACK field is set to the sum
+	//   of the sequence number and segment length of the incoming segment.
+	//   The connection remains in the CLOSED state.
 	if s.flagIsSet(header.TCPFlagAck) {
 		seq = s.ackNumber
+	} else {
+		flags |= header.TCPFlagAck
+		ack = s.sequenceNumber.Add(s.logicalLen())
 	}
-
-	ack := s.sequenceNumber.Add(s.logicalLen())
-
-	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
+	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
 }
 
 // SetOption implements TransportProtocol.SetOption.
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 5ee499c36..0a5534959 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -50,16 +50,20 @@ type receiver struct {
 	pendingRcvdSegments segmentHeap
 	pendingBufUsed      seqnum.Size
 	pendingBufSize      seqnum.Size
+
+	// Time when the last ack was received.
+	lastRcvdAckTime time.Time `state:".(unixTime)"`
 }
 
 func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
 	return &receiver{
-		ep:             ep,
-		rcvNxt:         irs + 1,
-		rcvAcc:         irs.Add(rcvWnd + 1),
-		rcvWnd:         rcvWnd,
-		rcvWndScale:    rcvWndScale,
-		pendingBufSize: pendingBufSize,
+		ep:              ep,
+		rcvNxt:          irs + 1,
+		rcvAcc:          irs.Add(rcvWnd + 1),
+		rcvWnd:          rcvWnd,
+		rcvWndScale:     rcvWndScale,
+		pendingBufSize:  pendingBufSize,
+		lastRcvdAckTime: time.Now(),
 	}
 }
 
@@ -360,6 +364,9 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 		return true, nil
 	}
 
+	// Store the time of the last ack.
+	r.lastRcvdAckTime = time.Now()
+
 	// Defer segment processing if it can't be consumed now.
 	if !r.consumeSegment(s, segSeq, segLen) {
 		if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
diff --git a/pkg/tcpip/transport/tcp/rcv_state.go b/pkg/tcpip/transport/tcp/rcv_state.go
new file mode 100644
index 000000000..2bf21a2e7
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv_state.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"time"
+)
+
+// saveLastRcvdAckTime is invoked by stateify.
+func (r *receiver) saveLastRcvdAckTime() unixTime {
+	return unixTime{r.lastRcvdAckTime.Unix(), r.lastRcvdAckTime.UnixNano()}
+}
+
+// loadLastRcvdAckTime is invoked by stateify.
+func (r *receiver) loadLastRcvdAckTime(unix unixTime) {
+	r.lastRcvdAckTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index d3f7c9125..8a947dc66 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -28,8 +28,11 @@ import (
 )
 
 const (
-	// minRTO is the minimum allowed value for the retransmit timeout.
-	minRTO = 200 * time.Millisecond
+	// MinRTO is the minimum allowed value for the retransmit timeout.
+	MinRTO = 200 * time.Millisecond
+
+	// MaxRTO is the maximum allowed value for the retransmit timeout.
+	MaxRTO = 120 * time.Second
 
 	// InitialCwnd is the initial congestion window.
 	InitialCwnd = 10
@@ -134,6 +137,10 @@ type sender struct {
 	// rttMeasureTime is the time when the rttMeasureSeqNum was sent.
 	rttMeasureTime time.Time `state:".(unixTime)"`
 
+	// firstRetransmittedSegXmitTime is the original transmit time of
+	// the first segment that was retransmitted due to RTO expiration.
+	firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"`
+
 	closed      bool
 	writeNext   *segment
 	writeList   segmentList
@@ -392,8 +399,8 @@ func (s *sender) updateRTO(rtt time.Duration) {
 
 	s.rto = s.rtt.srtt + 4*s.rtt.rttvar
 	s.rtt.Unlock()
-	if s.rto < minRTO {
-		s.rto = minRTO
+	if s.rto < MinRTO {
+		s.rto = MinRTO
 	}
 }
 
@@ -438,8 +445,30 @@ func (s *sender) retransmitTimerExpired() bool {
 	s.ep.stack.Stats().TCP.Timeouts.Increment()
 	s.ep.stats.SendErrors.Timeouts.Increment()
 
-	// Give up if we've waited more than a minute since the last resend.
-	if s.rto >= 60*time.Second {
+	// Give up if we've waited more than a minute since the last resend or
+	// if a user time out is set and we have exceeded the user specified
+	// timeout since the first retransmission.
+	s.ep.mu.RLock()
+	uto := s.ep.userTimeout
+	s.ep.mu.RUnlock()
+
+	if s.firstRetransmittedSegXmitTime.IsZero() {
+		// We store the original xmitTime of the segment that we are
+		// about to retransmit as the retransmission time. This is
+		// required as by the time the retransmitTimer has expired the
+		// segment has already been sent and unacked for the RTO at the
+		// time the segment was sent.
+		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
+	}
+
+	elapsed := time.Since(s.firstRetransmittedSegXmitTime)
+	remaining := MaxRTO
+	if uto != 0 {
+		// Cap to the user specified timeout if one is specified.
+		remaining = uto - elapsed
+	}
+
+	if remaining <= 0 || s.rto >= MaxRTO {
 		return false
 	}
 
@@ -447,6 +476,11 @@ func (s *sender) retransmitTimerExpired() bool {
 	// below.
 	s.rto *= 2
 
+	// Cap RTO to remaining time.
+	if s.rto > remaining {
+		s.rto = remaining
+	}
+
 	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
 	//
 	// Retransmit timeouts:
@@ -674,7 +708,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		default:
 			s.ep.state = StateFinWait1
 		}
-		s.ep.stack.Stats().TCP.CurrentEstablished.Decrement()
 		s.ep.mu.Unlock()
 	} else {
 		// We're sending a non-FIN segment.
@@ -1169,6 +1202,8 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 		// RFC 6298 Rule 5.3
 		if s.sndUna == s.sndNxt {
 			s.outstanding = 0
+			// Reset firstRetransmittedSegXmitTime to the zero value.
+			s.firstRetransmittedSegXmitTime = time.Time{}
 			s.resendTimer.disable()
 		}
 	}
diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go
index 12eff8afc..8b20c3455 100644
--- a/pkg/tcpip/transport/tcp/snd_state.go
+++ b/pkg/tcpip/transport/tcp/snd_state.go
@@ -48,3 +48,13 @@ func (s *sender) loadRttMeasureTime(unix unixTime) {
 func (s *sender) afterLoad() {
 	s.resendTimer.init(&s.resendWaker)
 }
+
+// saveFirstRetransmittedSegXmitTime is invoked by stateify.
+func (s *sender) saveFirstRetransmittedSegXmitTime() unixTime {
+	return unixTime{s.firstRetransmittedSegXmitTime.Unix(), s.firstRetransmittedSegXmitTime.UnixNano()}
+}
+
+// loadFirstRetransmittedSegXmitTime is invoked by stateify.
+func (s *sender) loadFirstRetransmittedSegXmitTime(unix unixTime) {
+	s.firstRetransmittedSegXmitTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index d1f0d6ce7..e8fe4dab5 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -75,6 +75,20 @@ func TestGiveUpConnect(t *testing.T) {
 	if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted {
 		t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %v, want = %v", err, tcpip.ErrAborted)
 	}
+
+	// Call Connect again to retreive the handshake failure status
+	// and stats updates.
+	if err := ep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAborted {
+		t.Fatalf("got ep.Connect(...) = %v, want = %v", err, tcpip.ErrAborted)
+	}
+
+	if got := c.Stack().Stats().TCP.FailedConnectionAttempts.Value(); got != 1 {
+		t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %v, want = 1", got)
+	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
 }
 
 func TestConnectIncrementActiveConnection(t *testing.T) {
@@ -309,8 +323,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+1),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
 }
 
 func TestTCPResetsReceivedIncrement(t *testing.T) {
@@ -446,18 +460,17 @@ func TestConnectResetAfterClose(t *testing.T) {
 			checker.TCP(
 				checker.DstPort(context.TestPort),
 				checker.SeqNum(uint32(c.IRS)+2),
-				checker.AckNum(790),
-				checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+				checker.AckNum(0),
+				checker.TCPFlags(header.TCPFlagRst),
 			),
 		)
 		break
 	}
 }
 
-// TestClosingWithEnqueuedSegments tests handling of
-// still enqueued segments when the endpoint transitions
-// to StateClose. The in-flight segments would be re-enqueued
-// to a any listening endpoint.
+// TestClosingWithEnqueuedSegments tests handling of still enqueued segments
+// when the endpoint transitions to StateClose. The in-flight segments would be
+// re-enqueued to a any listening endpoint.
 func TestClosingWithEnqueuedSegments(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -541,21 +554,29 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 	ep.(interface{ ResumeWork() }).ResumeWork()
 
 	// Wait for the protocolMainLoop to resume and update state.
-	time.Sleep(1 * time.Millisecond)
+	time.Sleep(10 * time.Millisecond)
 
 	// Expect the endpoint to be closed.
 	if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
 
+	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != 1 {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = 1", got)
+	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
+
 	// Check if the endpoint was moved to CLOSED and netstack a reset in
 	// response to the ACK packet that we sent after last-ACK.
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.SeqNum(uint32(c.IRS)+2),
-			checker.AckNum(793),
-			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			checker.AckNum(0),
+			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
 }
@@ -892,7 +913,7 @@ func TestSendRstOnListenerRxAckV4(t *testing.T) {
 
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.TCPFlags(header.TCPFlagRst),
 		checker.SeqNum(200)))
 }
 
@@ -920,7 +941,7 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
 
 	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
 		checker.DstPort(context.TestPort),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.TCPFlags(header.TCPFlagRst),
 		checker.SeqNum(200)))
 }
 
@@ -1119,6 +1140,71 @@ func TestConnectBindToDevice(t *testing.T) {
 	}
 }
 
+func TestRstOnSynSent(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create an endpoint, don't handshake because we want to interfere with the
+	// handshake process.
+	c.Create(-1)
+
+	// Start connection attempt.
+	waitEntry, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}
+	if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted {
+		t.Fatalf("got Connect(%+v) = %v, want %s", addr, err, tcpip.ErrConnectStarted)
+	}
+
+	// Receive SYN packet.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagSyn),
+		),
+	)
+
+	// Ensure that we've reached SynSent state
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+		t.Fatalf("got State() = %s, want %s", got, want)
+	}
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	// Send a packet with a proper ACK and a RST flag to cause the socket
+	// to Error and close out
+	iss := seqnum.Value(789)
+	rcvWnd := seqnum.Size(30000)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: tcpHdr.DestinationPort(),
+		DstPort: tcpHdr.SourcePort(),
+		Flags:   header.TCPFlagRst | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  rcvWnd,
+		TCPOpts: nil,
+	})
+
+	// Wait for receive to be notified.
+	select {
+	case <-ch:
+	case <-time.After(3 * time.Second):
+		t.Fatal("timed out waiting for packet to arrive")
+	}
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrConnectionRefused)
+	}
+
+	// Due to the RST the endpoint should be in an error state.
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+		t.Fatalf("got State() = %s, want %s", got, want)
+	}
+}
+
 func TestOutOfOrderReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -2694,6 +2780,13 @@ loop:
 	if tcp.EndpointState(c.EP.State()) != tcp.StateError {
 		t.Fatalf("got EP state is not StateError")
 	}
+
+	if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 {
+		t.Errorf("got stats.TCP.EstablishedResets.Value() = %v, want = 1", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
 }
 
 func TestSendOnResetConnection(t *testing.T) {
@@ -4262,8 +4355,9 @@ func TestKeepalive(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
+	const keepAliveInterval = 10 * time.Millisecond
 	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
-	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(10 * time.Millisecond))
+	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
 	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(5))
 	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
 
@@ -4353,19 +4447,43 @@ func TestKeepalive(t *testing.T) {
 		)
 	}
 
+	// Sleep for a litte over the KeepAlive interval to make sure
+	// the timer has time to fire after the last ACK and close the
+	// close the socket.
+	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+
 	// The connection should be terminated after 5 unacked keepalives.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.SeqNum(uint32(next)),
-			checker.AckNum(uint32(790)),
-			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
 
+	if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got)
+	}
+
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
 		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
 	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
 }
 
 func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
@@ -5992,6 +6110,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
 	}
 
+	want := c.Stack().Stats().TCP.EstablishedClosed.Value() + 1
+
 	wq := &waiter.Queue{}
 	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
 	if err != nil {
@@ -6118,8 +6238,15 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(ackHeaders.AckNum)),
-		checker.AckNum(uint32(ackHeaders.SeqNum)),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+
+	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = %v", got, want)
+	}
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+	}
 }
 
 func TestTCPCloseWithData(t *testing.T) {
@@ -6290,7 +6417,147 @@ func TestTCPCloseWithData(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(ackHeaders.AckNum)),
-		checker.AckNum(uint32(ackHeaders.SeqNum)),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+}
+
+func TestTCPUserTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
 
+	userTimeout := 50 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+
+	// Send some data and wait before ACKing it.
+	view := buffer.NewView(3)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+
+	next := uint32(c.IRS) + 1
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Wait for a little over the minimum retransmit timeout of 200ms for
+	// the retransmitTimer to fire and close the connection.
+	time.Sleep(tcp.MinRTO + 10*time.Millisecond)
+
+	// No packet should be received as the connection should be silently
+	// closed due to timeout.
+	c.CheckNoPacket("unexpected packet received after userTimeout has expired")
+
+	next += uint32(len(view))
+
+	// The connection should be terminated after userTimeout has expired.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(next)),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
+	}
+
+	if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
+	}
+}
+
+func TestKeepaliveWithUserTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
+
+	const keepAliveInterval = 10 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
+	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
+	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(10))
+	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
+
+	// Set userTimeout to be the duration for 3 keepalive probes.
+	userTimeout := 30 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+
+	// Check that the connection is still alive.
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
+	}
+
+	// Now receive 2 keepalives, but don't ACK them. The connection should
+	// be reset when the 3rd one should be sent due to userTimeout being
+	// 30ms and each keepalive probe should be sent 10ms apart as set above after
+	// the connection has been idle for 10ms.
+	for i := 0; i < 2; i++ {
+		b := c.GetPacket()
+		checker.IPv4(t, b,
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)),
+				checker.AckNum(uint32(790)),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	// Sleep for a litte over the KeepAlive interval to make sure
+	// the timer has time to fire after the last ACK and close the
+	// close the socket.
+	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+
+	// The connection should be terminated after 30ms.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(c.IRS + 1),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS+1)),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
+	}
+	if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
+	}
 }
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 6cb66c1af..b0a376eba 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -231,6 +231,7 @@ func (c *Context) CheckNoPacket(errMsg string) {
 // addresses. It will fail with an error if no packet is received for
 // 2 seconds.
 func (c *Context) GetPacket() []byte {
+	c.t.Helper()
 	select {
 	case p := <-c.linkEP.C:
 		if p.Proto != ipv4.ProtocolNumber {
@@ -259,6 +260,7 @@ func (c *Context) GetPacket() []byte {
 // and destination address. If no packet is available it will return
 // nil immediately.
 func (c *Context) GetPacketNonBlocking() []byte {
+	c.t.Helper()
 	select {
 	case p := <-c.linkEP.C:
 		if p.Proto != ipv4.ProtocolNumber {
@@ -483,6 +485,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
 // GetV6Packet reads a single packet from the link layer endpoint of the context
 // and asserts that it is an IPv6 Packet with the expected src/dest addresses.
 func (c *Context) GetV6Packet() []byte {
+	c.t.Helper()
 	select {
 	case p := <-c.linkEP.C:
 		if p.Proto != ipv6.ProtocolNumber {
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index 8d4c3808f..97e4d5825 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -34,6 +34,7 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
+        "//pkg/tcpip/ports",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
         "//pkg/waiter",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 4b161e404..1ac4705af 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -107,6 +108,7 @@ type endpoint struct {
 	// Values used to reserve a port or register a transport endpoint.
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
+	boundPortFlags    ports.Flags
 
 	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
 	// applied while sending packets. Defaults to 0 as on Linux.
@@ -180,8 +182,9 @@ func (e *endpoint) Close() {
 	switch e.state {
 	case StateBound, StateConnected:
 		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.boundBindToDevice = 0
+		e.boundPortFlags = ports.Flags{}
 	}
 
 	for _, mem := range e.multicastMemberships {
@@ -895,7 +898,8 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	} else {
 		if e.ID.LocalPort != 0 {
 			// Release the ephemeral port.
-			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
+			e.boundPortFlags = ports.Flags{}
 		}
 		e.state = StateInitial
 	}
@@ -1042,16 +1046,23 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
 	if e.ID.LocalPort == 0 {
-		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort, e.bindToDevice)
+		flags := ports.Flags{
+			LoadBalanced: e.reusePort,
+			// FIXME(b/129164367): Support SO_REUSEADDR.
+			MostRecent: false,
+		}
+		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, flags, e.bindToDevice)
 		if err != nil {
 			return id, e.bindToDevice, err
 		}
+		e.boundPortFlags = flags
 		id.LocalPort = port
 	}
 
 	err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice)
 	if err != nil {
-		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.bindToDevice)
+		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice)
+		e.boundPortFlags = ports.Flags{}
 	}
 	return id, e.bindToDevice, err
 }
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 847d2f91c..6226b63f8 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "compat.go",
         "compat_amd64.go",
+        "compat_arm64.go",
         "config.go",
         "controller.go",
         "debug.go",
@@ -110,7 +111,6 @@ go_test(
         "//pkg/control/server",
         "//pkg/log",
         "//pkg/p9",
-        "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 07e35ab10..352e710d2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -21,10 +21,8 @@ import (
 	"syscall"
 
 	"github.com/golang/protobuf/proto"
-	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/strace"
@@ -53,9 +51,9 @@ type compatEmitter struct {
 }
 
 func newCompatEmitter(logFD int) (*compatEmitter, error) {
-	nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64)
+	nameMap, ok := getSyscallNameMap()
 	if !ok {
-		return nil, fmt.Errorf("amd64 Linux syscall table not found")
+		return nil, fmt.Errorf("Linux syscall table not found")
 	}
 
 	c := &compatEmitter{
@@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
 }
 
 func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
-	regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64
+	regs := us.Registers
 
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	sysnr := regs.OrigRax
+	sysnr := syscallNum(regs)
 	tr := c.trackers[sysnr]
 	if tr == nil {
 		switch sysnr {
-		case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL:
+		case syscall.SYS_PRCTL:
 			// args: cmd, ...
 			tr = newArgsTracker(0)
 
@@ -112,10 +110,14 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
 			tr = newArgsTracker(2)
 
 		default:
-			tr = &onceTracker{}
+			tr = newArchArgsTracker(sysnr)
+			if tr == nil {
+				tr = &onceTracker{}
+			}
 		}
 		c.trackers[sysnr] = tr
 	}
+
 	if tr.shouldReport(regs) {
 		c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
 		tr.onReported(regs)
@@ -139,10 +141,10 @@ func (c *compatEmitter) Close() error {
 // the syscall and arguments.
 type syscallTracker interface {
 	// shouldReport returns true is the syscall should be reported.
-	shouldReport(regs *rpb.AMD64Registers) bool
+	shouldReport(regs *rpb.Registers) bool
 
 	// onReported marks the syscall as reported.
-	onReported(regs *rpb.AMD64Registers)
+	onReported(regs *rpb.Registers)
 }
 
 // onceTracker reports only a single time, used for most syscalls.
@@ -150,10 +152,45 @@ type onceTracker struct {
 	reported bool
 }
 
-func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool {
+func (o *onceTracker) shouldReport(_ *rpb.Registers) bool {
 	return !o.reported
 }
 
-func (o *onceTracker) onReported(_ *rpb.AMD64Registers) {
+func (o *onceTracker) onReported(_ *rpb.Registers) {
 	o.reported = true
 }
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+	// argsIdx is the syscall arguments to use as unique ID.
+	argsIdx  []int
+	reported map[string]struct{}
+	count    int
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
+}
+
+// key returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.Registers) string {
+	var rv string
+	for _, idx := range a.argsIdx {
+		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	}
+	return rv
+}
+
+func (a *argsTracker) shouldReport(regs *rpb.Registers) bool {
+	if a.count >= reportLimit {
+		return false
+	}
+	_, ok := a.reported[a.key(regs)]
+	return !ok
+}
+
+func (a *argsTracker) onReported(regs *rpb.Registers) {
+	a.count++
+	a.reported[a.key(regs)] = struct{}{}
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 43cd0db94..42b0ca8b0 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -16,62 +16,81 @@ package boot
 
 import (
 	"fmt"
+	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
 )
 
 // reportLimit is the max number of events that should be reported per tracker.
 const reportLimit = 100
 
-// argsTracker reports only once for each different combination of arguments.
-// It's used for generic syscalls like ioctl to report once per 'cmd'.
-type argsTracker struct {
-	// argsIdx is the syscall arguments to use as unique ID.
-	argsIdx  []int
-	reported map[string]struct{}
-	count    int
+// newRegs create a empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Amd64{
+			Amd64: &rpb.AMD64Registers{},
+		},
+	}
 }
 
-func newArgsTracker(argIdx ...int) *argsTracker {
-	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
-}
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
 
-// cmd returns the command based on the syscall argument index.
-func (a *argsTracker) key(regs *rpb.AMD64Registers) string {
-	var rv string
-	for _, idx := range a.argsIdx {
-		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	switch argIdx {
+	case 0:
+		return uint32(amd64Regs.Rdi)
+	case 1:
+		return uint32(amd64Regs.Rsi)
+	case 2:
+		return uint32(amd64Regs.Rdx)
+	case 3:
+		return uint32(amd64Regs.R10)
+	case 4:
+		return uint32(amd64Regs.R8)
+	case 5:
+		return uint32(amd64Regs.R9)
 	}
-	return rv
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
 
-func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 {
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
 	switch argIdx {
 	case 0:
-		return uint32(regs.Rdi)
+		amd64Regs.Rdi = argVal
 	case 1:
-		return uint32(regs.Rsi)
+		amd64Regs.Rsi = argVal
 	case 2:
-		return uint32(regs.Rdx)
+		amd64Regs.Rdx = argVal
 	case 3:
-		return uint32(regs.R10)
+		amd64Regs.R10 = argVal
 	case 4:
-		return uint32(regs.R8)
+		amd64Regs.R8 = argVal
 	case 5:
-		return uint32(regs.R9)
+		amd64Regs.R9 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 	}
-	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
 
-func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool {
-	if a.count >= reportLimit {
-		return false
-	}
-	_, ok := a.reported[a.key(regs)]
-	return !ok
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.AMD64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+	return amd64Regs.OrigRax
 }
 
-func (a *argsTracker) onReported(regs *rpb.AMD64Registers) {
-	a.count++
-	a.reported[a.key(regs)] = struct{}{}
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	switch sysnr {
+	case syscall.SYS_ARCH_PRCTL:
+		// args: cmd, ...
+		return newArgsTracker(0)
+	}
+	return nil
 }
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
new file mode 100644
index 000000000..f784cd237
--- /dev/null
+++ b/runsc/boot/compat_arm64.go
@@ -0,0 +1,91 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+const reportLimit = 100
+
+// newRegs create a empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Arm64{
+			Arm64: &rpb.ARM64Registers{},
+		},
+	}
+}
+
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		return uint32(arm64Regs.R0)
+	case 1:
+		return uint32(arm64Regs.R1)
+	case 2:
+		return uint32(arm64Regs.R2)
+	case 3:
+		return uint32(arm64Regs.R3)
+	case 4:
+		return uint32(arm64Regs.R4)
+	case 5:
+		return uint32(arm64Regs.R5)
+	}
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		arm64Regs.R0 = argVal
+	case 1:
+		arm64Regs.R1 = argVal
+	case 2:
+		arm64Regs.R2 = argVal
+	case 3:
+		arm64Regs.R3 = argVal
+	case 4:
+		arm64Regs.R4 = argVal
+	case 5:
+		arm64Regs.R5 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+}
+
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.ARM64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+	return arm64Regs.R8
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	// currently, no arch specific syscalls need to be handled here.
+	return nil
+}
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
index 388298d8d..839c5303b 100644
--- a/runsc/boot/compat_test.go
+++ b/runsc/boot/compat_test.go
@@ -16,8 +16,6 @@ package boot
 
 import (
 	"testing"
-
-	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 )
 
 func TestOnceTracker(t *testing.T) {
@@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) {
 
 func TestArgsTracker(t *testing.T) {
 	for _, tc := range []struct {
-		name string
-		idx  []int
-		rdi1 uint64
-		rdi2 uint64
-		rsi1 uint64
-		rsi2 uint64
-		want bool
+		name   string
+		idx    []int
+		arg1_1 uint64
+		arg1_2 uint64
+		arg2_1 uint64
+		arg2_2 uint64
+		want   bool
 	}{
-		{name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false},
-		{name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false},
-		{name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true},
-		{name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true},
-		{name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false},
-		{name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false},
-		{name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true},
+		{name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false},
+		{name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false},
+		{name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true},
+		{name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true},
+		{name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false},
+		{name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false},
+		{name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			c := newArgsTracker(tc.idx...)
-			regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1}
+			regs := newRegs()
+			setArgVal(0, tc.arg1_1, regs)
+			setArgVal(1, tc.arg2_1, regs)
 			if !c.shouldReport(regs) {
 				t.Error("first call to shouldReport, got: false, want: true")
 			}
 			c.onReported(regs)
 
-			regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2
+			setArgVal(0, tc.arg1_2, regs)
+			setArgVal(1, tc.arg2_2, regs)
 			if got := c.shouldReport(regs); tc.want != got {
 				t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want)
 			}
@@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) {
 func TestArgsTrackerLimit(t *testing.T) {
 	c := newArgsTracker(0, 1)
 	for i := 0; i < reportLimit; i++ {
-		regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)}
+		regs := newRegs()
+		setArgVal(0, 123, regs)
+		setArgVal(1, uint64(i), regs)
 		if !c.shouldReport(regs) {
 			t.Error("shouldReport before limit was reached, got: false, want: true")
 		}
@@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) {
 	}
 
 	// Should hit the count limit now.
-	regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456}
+	regs := newRegs()
+	setArgVal(0, 123, regs)
+	setArgVal(1, 123456, regs)
 	if c.shouldReport(regs) {
 		t.Error("shouldReport after limit was reached, got: true, want: false")
 	}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index f62be4c59..9c9e94864 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -152,7 +152,9 @@ func newController(fd int, l *Loader) (*controller, error) {
 	srv.Register(&debug{})
 	srv.Register(&control.Logging{})
 	if l.conf.ProfileEnable {
-		srv.Register(&control.Profile{})
+		srv.Register(&control.Profile{
+			Kernel: l.k,
+		})
 	}
 
 	return &controller{
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index bc9ffaf81..421ccd255 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -16,7 +16,6 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"path/filepath"
 	"sort"
 	"strconv"
@@ -52,7 +51,7 @@ const (
 	rootDevice = "9pfs-/"
 
 	// MountPrefix is the annotation prefix for mount hints.
-	MountPrefix = "gvisor.dev/spec/mount"
+	MountPrefix = "dev.gvisor.spec.mount."
 
 	// Filesystems that runsc supports.
 	bind     = "bind"
@@ -490,14 +489,15 @@ type podMountHints struct {
 func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
 	mnts := make(map[string]*mountHint)
 	for k, v := range spec.Annotations {
-		// Look for 'gvisor.dev/spec/mount' annotations and parse them.
+		// Look for 'dev.gvisor.spec.mount' annotations and parse them.
 		if strings.HasPrefix(k, MountPrefix) {
-			parts := strings.Split(k, "/")
-			if len(parts) != 5 {
+			// Remove the prefix and split the rest.
+			parts := strings.Split(k[len(MountPrefix):], ".")
+			if len(parts) != 2 {
 				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
 			}
-			name := parts[3]
-			if len(name) == 0 || path.Clean(name) != name {
+			name := parts[0]
+			if len(name) == 0 {
 				return nil, fmt.Errorf("invalid mount name: %s", name)
 			}
 			mnt := mnts[name]
@@ -505,7 +505,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
 				mnt = &mountHint{name: name}
 				mnts[name] = mnt
 			}
-			if err := mnt.setField(parts[4], v); err != nil {
+			if err := mnt.setField(parts[1], v); err != nil {
 				return nil, err
 			}
 		}
@@ -575,6 +575,11 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 func (c *containerMounter) processHints(conf *Config) error {
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfs {
+			continue
+		}
 		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
 		inode, err := c.mountSharedMaster(ctx, conf, hint)
 		if err != nil {
@@ -851,7 +856,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 		return fmt.Errorf("mount %q error: %v", m.Destination, err)
 	}
 
-	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
 	return nil
 }
 
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 0396a4cfb..912037075 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -15,7 +15,6 @@
 package boot
 
 import (
-	"path"
 	"reflect"
 	"strings"
 	"testing"
@@ -26,19 +25,19 @@ import (
 func TestPodMountHintsHappy(t *testing.T) {
 	spec := &specs.Spec{
 		Annotations: map[string]string{
-			path.Join(MountPrefix, "mount1", "source"): "foo",
-			path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-			path.Join(MountPrefix, "mount1", "share"):  "pod",
+			MountPrefix + "mount1.source": "foo",
+			MountPrefix + "mount1.type":   "tmpfs",
+			MountPrefix + "mount1.share":  "pod",
 
-			path.Join(MountPrefix, "mount2", "source"):  "bar",
-			path.Join(MountPrefix, "mount2", "type"):    "bind",
-			path.Join(MountPrefix, "mount2", "share"):   "container",
-			path.Join(MountPrefix, "mount2", "options"): "rw,private",
+			MountPrefix + "mount2.source":  "bar",
+			MountPrefix + "mount2.type":    "bind",
+			MountPrefix + "mount2.share":   "container",
+			MountPrefix + "mount2.options": "rw,private",
 		},
 	}
 	podHints, err := newPodMountHints(spec)
 	if err != nil {
-		t.Errorf("newPodMountHints failed: %v", err)
+		t.Fatalf("newPodMountHints failed: %v", err)
 	}
 
 	// Check that fields were set correctly.
@@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) {
 		{
 			name: "too short",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1"): "foo",
+				MountPrefix + "mount1": "foo",
 			},
 			error: "invalid mount annotation",
 		},
 		{
 			name: "no name",
 			annotations: map[string]string{
-				MountPrefix + "//source": "foo",
+				MountPrefix + ".source": "foo",
 			},
 			error: "invalid mount name",
 		},
 		{
 			name: "missing source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "type"):  "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"): "pod",
+				MountPrefix + "mount1.type":  "tmpfs",
+				MountPrefix + "mount1.share": "pod",
 			},
 			error: "source field",
 		},
 		{
 			name: "missing type",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "type field",
 		},
 		{
 			name: "missing share",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
 			},
 			error: "share field",
 		},
 		{
 			name: "invalid field name",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "invalid"): "foo",
+				MountPrefix + "mount1.invalid": "foo",
 			},
 			error: "invalid mount annotation",
 		},
 		{
 			name: "invalid source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "source cannot be empty",
 		},
 		{
 			name: "invalid type",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "invalid-type",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "invalid-type",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "invalid type",
 		},
 		{
 			name: "invalid share",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "invalid-share",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "invalid-share",
 			},
 			error: "invalid share",
 		},
 		{
 			name: "invalid options",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"):  "foo",
-				path.Join(MountPrefix, "mount1", "type"):    "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):   "pod",
-				path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+				MountPrefix + "mount1.source":  "foo",
+				MountPrefix + "mount1.type":    "tmpfs",
+				MountPrefix + "mount1.share":   "pod",
+				MountPrefix + "mount1.options": "invalid-option",
 			},
 			error: "unknown mount option",
 		},
 		{
 			name: "duplicate source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
 
-				path.Join(MountPrefix, "mount2", "source"): "foo",
-				path.Join(MountPrefix, "mount2", "type"):   "bind",
-				path.Join(MountPrefix, "mount2", "share"):  "container",
+				MountPrefix + "mount2.source": "foo",
+				MountPrefix + "mount2.type":   "bind",
+				MountPrefix + "mount2.share":  "container",
 			},
 			error: "have the same mount source",
 		},
@@ -202,36 +201,36 @@ func TestGetMountAccessType(t *testing.T) {
 		{
 			name: "container=exclusive",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): source,
-				path.Join(MountPrefix, "mount1", "type"):   "bind",
-				path.Join(MountPrefix, "mount1", "share"):  "container",
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
 			},
 			want: FileAccessExclusive,
 		},
 		{
 			name: "pod=shared",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): source,
-				path.Join(MountPrefix, "mount1", "type"):   "bind",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			want: FileAccessShared,
 		},
 		{
 			name: "shared=shared",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): source,
-				path.Join(MountPrefix, "mount1", "type"):   "bind",
-				path.Join(MountPrefix, "mount1", "share"):  "shared",
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "shared",
 			},
 			want: FileAccessShared,
 		},
 		{
 			name: "default=shared",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): source + "mismatch",
-				path.Join(MountPrefix, "mount1", "type"):   "bind",
-				path.Join(MountPrefix, "mount1", "share"):  "container",
+				MountPrefix + "mount1.source": source + "mismatch",
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
 			},
 			want: FileAccessShared,
 		},
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index df6052c88..bc1d0c1bb 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -93,10 +93,6 @@ type Loader struct {
 	// spec is the base configuration for the root container.
 	spec *specs.Spec
 
-	// startSignalForwarding enables forwarding of signals to the sandboxed
-	// container. It should be called after the init process is loaded.
-	startSignalForwarding func() func()
-
 	// stopSignalForwarding disables forwarding of signals to the sandboxed
 	// container. It should be called when a sandbox is destroyed.
 	stopSignalForwarding func()
@@ -336,29 +332,6 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
 	}
 
-	// Handle signals by forwarding them to the root container process
-	// (except for panic signal, which should cause a panic).
-	l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) {
-		// Panic signal should cause a panic.
-		if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) {
-			panic("Signal-induced panic")
-		}
-
-		// Otherwise forward to root container.
-		deliveryMode := DeliverToProcess
-		if args.Console {
-			// Since we are running with a console, we should
-			// forward the signal to the foreground process group
-			// so that job control signals like ^C can be handled
-			// properly.
-			deliveryMode = DeliverToForegroundProcessGroup
-		}
-		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
-		if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil {
-			log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err)
-		}
-	})
-
 	// Create the control server using the provided FD.
 	//
 	// This must be done *after* we have initialized the kernel since the
@@ -566,8 +539,27 @@ func (l *Loader) run() error {
 		ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
 	}
 
-	// Start signal forwarding only after an init process is created.
-	l.stopSignalForwarding = l.startSignalForwarding()
+	// Handle signals by forwarding them to the root container process
+	// (except for panic signal, which should cause a panic).
+	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
+		// Panic signal should cause a panic.
+		if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+			panic("Signal-induced panic")
+		}
+
+		// Otherwise forward to root container.
+		deliveryMode := DeliverToProcess
+		if l.console {
+			// Since we are running with a console, we should forward the signal to
+			// the foreground process group so that job control signals like ^C can
+			// be handled properly.
+			deliveryMode = DeliverToForegroundProcessGroup
+		}
+		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
+			log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
+		}
+	})
 
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index f98c5fd36..dd4926bb9 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -80,7 +80,8 @@ type CreateLinksAndRoutesArgs struct {
 	LoopbackLinks []LoopbackLink
 	FDBasedLinks  []FDBasedLink
 
-	DefaultGateway DefaultRoute
+	Defaultv4Gateway DefaultRoute
+	Defaultv6Gateway DefaultRoute
 }
 
 // Empty returns true if route hasn't been set.
@@ -122,10 +123,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		nicID++
 		nicids[link.Name] = nicID
 
-		ep := loopback.New()
+		linkEP := loopback.New()
 
 		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
-		if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil {
 			return err
 		}
 
@@ -157,7 +158,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		mac := tcpip.LinkAddress(link.LinkAddress)
-		ep, err := fdbased.New(&fdbased.Options{
+		linkEP, err := fdbased.New(&fdbased.Options{
 			FDs:                FDs,
 			MTU:                uint32(link.MTU),
 			EthernetHeader:     true,
@@ -172,7 +173,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
-		if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
 			return err
 		}
 
@@ -186,12 +187,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 	}
 
-	if !args.DefaultGateway.Route.Empty() {
-		nicID, ok := nicids[args.DefaultGateway.Name]
+	if !args.Defaultv4Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv4Gateway.Name]
 		if !ok {
-			return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
 		}
-		route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+		route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
+	}
+
+	if !args.Defaultv6Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv6Gateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
+		}
+		route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
 		if err != nil {
 			return err
 		}
@@ -208,11 +221,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error {
 	if loopback {
 		if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil {
-			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err)
+			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, ep, err)
 		}
 	} else {
 		if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
-			return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err)
+			return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err)
 		}
 	}
 
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 7313e473f..f37415810 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,16 +32,17 @@ import (
 
 // Debug implements subcommands.Command for the "debug" command.
 type Debug struct {
-	pid          int
-	stacks       bool
-	signal       int
-	profileHeap  string
-	profileCPU   string
-	profileDelay int
-	trace        string
-	strace       string
-	logLevel     string
-	logPackets   string
+	pid         int
+	stacks      bool
+	signal      int
+	profileHeap string
+	profileCPU  string
+	trace       string
+	strace      string
+	logLevel    string
+	logPackets  string
+	duration    time.Duration
+	ps          bool
 }
 
 // Name implements subcommands.Command.
@@ -65,12 +66,13 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
-	f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile")
+	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
 	f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
 	f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
 	f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`)
 	f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).")
 	f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.")
+	f.BoolVar(&d.ps, "ps", false, "lists processes")
 }
 
 // Execute implements subcommands.Command.Execute.
@@ -163,7 +165,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		if err := c.Sandbox.StartCPUProfile(f); err != nil {
 			return Errorf(err.Error())
 		}
-		log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU)
+		log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU)
 	}
 	if d.trace != "" {
 		delay = true
@@ -181,8 +183,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		if err := c.Sandbox.StartTrace(f); err != nil {
 			return Errorf(err.Error())
 		}
-		log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace)
-
+		log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace)
 	}
 
 	if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 {
@@ -241,9 +242,20 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 		log.Infof("Logging options changed")
 	}
+	if d.ps {
+		pList, err := c.Processes()
+		if err != nil {
+			Fatalf("getting processes for container: %v", err)
+		}
+		o, err := control.ProcessListToJSON(pList)
+		if err != nil {
+			Fatalf("generating JSON: %v", err)
+		}
+		log.Infof(o)
+	}
 
 	if delay {
-		time.Sleep(time.Duration(d.profileDelay) * time.Second)
+		time.Sleep(d.duration)
 	}
 
 	return subcommands.ExitSuccess
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 7d67c3a75..5ed131a7f 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/kr/pty"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/testutil"
@@ -219,9 +220,9 @@ func TestJobControlSignalExec(t *testing.T) {
 	// Make sure all the processes are running.
 	expectedPL := []*control.Process{
 		// Root container process.
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		// Bash from exec process.
-		{PID: 2, Cmd: "bash"},
+		{PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
@@ -231,7 +232,7 @@ func TestJobControlSignalExec(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))
 
 	// Wait for it to start. Sleep's PPID is bash's PID.
-	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"})
+	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
 	}
@@ -361,7 +362,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 
 	// Wait for bash to start.
 	expectedPL := []*control.Process{
-		{PID: 1, Cmd: "bash"},
+		{PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatal(err)
@@ -371,7 +372,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))
 
 	// Wait for sleep to start.
-	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"})
+	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatal(err)
 	}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 07eacaac0..2ced028f6 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
@@ -52,13 +53,14 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
 			err = fmt.Errorf("error getting process data from container: %v", err)
 			return &backoff.PermanentError{Err: err}
 		}
-		if !procListsEqual(got, want) {
-			return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
+		if r, err := procListsEqual(got, want); !r {
+			return fmt.Errorf("container got process list: %s, want: %s: error: %v",
+				procListToString(got), procListToString(want), err)
 		}
 		return nil
 	}
 	// Gives plenty of time as tests can run slow under --race.
-	return testutil.Poll(cb, 30*time.Second)
+	return testutil.Poll(cb, 10*time.Second)
 }
 
 func waitForProcessCount(cont *Container, want int) error {
@@ -91,22 +93,34 @@ func blockUntilWaitable(pid int) error {
 
 // procListsEqual is used to check whether 2 Process lists are equal for all
 // implemented fields.
-func procListsEqual(got, want []*control.Process) bool {
+func procListsEqual(got, want []*control.Process) (bool, error) {
 	if len(got) != len(want) {
-		return false
+		return false, nil
 	}
 	for i := range got {
 		pd1 := got[i]
 		pd2 := want[i]
-		// Zero out unimplemented and timing dependant fields.
+		// Zero out timing dependant fields.
 		pd1.Time = ""
 		pd1.STime = ""
 		pd1.C = 0
-		if *pd1 != *pd2 {
-			return false
+		// Ignore TTY field too, since it's not relevant in the cases
+		// where we use this method. Tests that care about the TTY
+		// field should check for it themselves.
+		pd1.TTY = ""
+		pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
+		if err != nil {
+			return false, err
+		}
+		pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
+		if err != nil {
+			return false, err
+		}
+		if pd1Json != pd2Json {
+			return false, nil
 		}
 	}
-	return true
+	return true, nil
 }
 
 // getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the
@@ -116,7 +130,11 @@ func getAndCheckProcLists(cont *Container, want []*control.Process) error {
 	if err != nil {
 		return fmt.Errorf("error getting process data from container: %v", err)
 	}
-	if procListsEqual(got, want) {
+	equal, err := procListsEqual(got, want)
+	if err != nil {
+		return err
+	}
+	if equal {
 		return nil
 	}
 	return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
@@ -288,11 +306,12 @@ func TestLifecycle(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 		}
 		// Create the container.
@@ -590,18 +609,20 @@ func TestExec(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 
@@ -1062,18 +1083,20 @@ func TestPauseResume(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "bash",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "bash",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 
@@ -1126,11 +1149,12 @@ func TestPauseResume(t *testing.T) {
 
 		expectedPL2 := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 		}
 
@@ -1241,18 +1265,20 @@ func TestCapabilities(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "exe",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "exe",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 		if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
@@ -2112,6 +2138,95 @@ func TestOverlayfsStaleRead(t *testing.T) {
 	}
 }
 
+// TestTTYField checks TTY field returned by container.Processes().
+func TestTTYField(t *testing.T) {
+	stop := testutil.StartReaper()
+	defer stop()
+
+	testApp, err := testutil.FindFile("runsc/container/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	testCases := []struct {
+		name         string
+		useTTY       bool
+		wantTTYField string
+	}{
+		{
+			name:         "no tty",
+			useTTY:       false,
+			wantTTYField: "?",
+		},
+		{
+			name:         "tty used",
+			useTTY:       true,
+			wantTTYField: "pts/0",
+		},
+	}
+
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			conf := testutil.TestConfig()
+
+			// We will run /bin/sleep, possibly with an open TTY.
+			cmd := []string{"/bin/sleep", "10000"}
+			if test.useTTY {
+				// Run inside the "pty-runner".
+				cmd = append([]string{testApp, "pty-runner"}, cmd...)
+			}
+
+			spec := testutil.NewSpecWithArgs(cmd...)
+			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer os.RemoveAll(rootDir)
+			defer os.RemoveAll(bundleDir)
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.UniqueContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Wait for sleep to be running, and check the TTY
+			// field.
+			var gotTTYField string
+			cb := func() error {
+				ps, err := c.Processes()
+				if err != nil {
+					err = fmt.Errorf("error getting process data from container: %v", err)
+					return &backoff.PermanentError{Err: err}
+				}
+				for _, p := range ps {
+					if strings.Contains(p.Cmd, "sleep") {
+						gotTTYField = p.TTY
+						return nil
+					}
+				}
+				return fmt.Errorf("sleep not running")
+			}
+			if err := testutil.Poll(cb, 30*time.Second); err != nil {
+				t.Fatalf("error waiting for sleep process: %v", err)
+			}
+
+			if gotTTYField != test.wantTTYField {
+				t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField)
+			}
+		})
+	}
+}
+
 // executeSync synchronously executes a new process.
 func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
 	pid, err := cont.Execute(args)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index a5a62378c..4ad09ceab 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -123,11 +123,11 @@ func execMany(execs []execDesc) error {
 
 func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 	for _, spec := range pod {
-		spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
-		spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
-		spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+		spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source
+		spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type
+		spec.Annotations[boot.MountPrefix+name+".share"] = "pod"
 		if len(mount.Options) > 0 {
-			spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+			spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",")
 		}
 	}
 }
@@ -156,13 +156,13 @@ func TestMultiContainerSanity(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
 		}
 		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -202,13 +202,13 @@ func TestMultiPIDNS(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
 		}
 		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -264,7 +264,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -274,7 +274,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 		}
 
 		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -306,7 +306,7 @@ func TestMultiContainerWait(t *testing.T) {
 
 	// Check via ps that multiple processes are running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -351,7 +351,7 @@ func TestMultiContainerWait(t *testing.T) {
 	// After Wait returns, ensure that the root container is running and
 	// the child has finished.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -383,7 +383,7 @@ func TestExecWait(t *testing.T) {
 
 	// Check via ps that process is running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -418,7 +418,7 @@ func TestExecWait(t *testing.T) {
 
 	// Wait for the exec'd process to exit.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -505,7 +505,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 		// Check via ps that container 1 process is running.
 		expectedPL := []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
@@ -519,7 +519,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 		// Make sure process 1 is still running.
 		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -633,9 +633,10 @@ func TestMultiContainerDestroy(t *testing.T) {
 		if err != nil {
 			t.Fatalf("error getting process data from sandbox: %v", err)
 		}
-		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep"}}
-		if !procListsEqual(pss, expectedPL) {
-			t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL))
+		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+		if r, err := procListsEqual(pss, expectedPL); !r {
+			t.Errorf("container got process list: %s, want: %s: error: %v",
+				procListToString(pss), procListToString(expectedPL), err)
 		}
 
 		// Check that cont.Destroy is safe to call multiple times.
@@ -669,7 +670,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Check root's container process list doesn't include other containers.
 	expectedPL0 := []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL0); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -677,8 +678,8 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Same for the other container.
 	expectedPL1 := []*control.Process{
-		{PID: 2, Cmd: "sh"},
-		{PID: 3, PPID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
+		{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -692,7 +693,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 	if _, err := containers[1].Execute(args); err != nil {
 		t.Fatalf("error exec'ing: %v", err)
 	}
-	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"})
+	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
 	}
@@ -1513,7 +1514,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Ensure container is running
 	c := containers[2]
 	expectedPL := []*control.Process{
-		{PID: 3, Cmd: "sleep"},
+		{PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1541,7 +1542,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 			continue // container[2] has been killed.
 		}
 		pl := []*control.Process{
-			{PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+			{PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
 		}
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
@@ -1561,7 +1562,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
 	// and RPC errors out.
 	impossiblePL := []*control.Process{
-		{PID: 100, Cmd: "non-existent-process"},
+		{PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
 	}
 	if err := waitForProcessList(c, impossiblePL); err == nil {
 		t.Fatalf("Sandbox was not killed after gofer death")
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index 9bf9e6e9d..bfd338bb6 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -15,5 +15,6 @@ go_binary(
         "//pkg/unet",
         "//runsc/testutil",
         "@com_github_google_subcommands//:go_default_library",
+        "@com_github_kr_pty//:go_default_library",
     ],
 )
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 913d781c6..a1c8a741a 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -19,6 +19,7 @@ package main
 import (
 	"context"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"log"
 	"net"
@@ -31,6 +32,7 @@ import (
 
 	"flag"
 	"github.com/google/subcommands"
+	"github.com/kr/pty"
 	"gvisor.dev/gvisor/runsc/testutil"
 )
 
@@ -41,6 +43,7 @@ func main() {
 	subcommands.Register(new(fdReceiver), "")
 	subcommands.Register(new(fdSender), "")
 	subcommands.Register(new(forkBomb), "")
+	subcommands.Register(new(ptyRunner), "")
 	subcommands.Register(new(reaper), "")
 	subcommands.Register(new(syscall), "")
 	subcommands.Register(new(taskTree), "")
@@ -352,3 +355,40 @@ func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...inter
 
 	return subcommands.ExitSuccess
 }
+
+type ptyRunner struct{}
+
+// Name implements subcommands.Command.
+func (*ptyRunner) Name() string {
+	return "pty-runner"
+}
+
+// Synopsis implements subcommands.Command.
+func (*ptyRunner) Synopsis() string {
+	return "runs the given command with an open pty terminal"
+}
+
+// Usage implements subcommands.Command.
+func (*ptyRunner) Usage() string {
+	return "pty-runner [command]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*ptyRunner) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.
+func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
+	c := exec.Command(fs.Args()[0], fs.Args()[1:]...)
+	f, err := pty.Start(c)
+	if err != nil {
+		fmt.Printf("pty.Start failed: %v", err)
+		return subcommands.ExitFailure
+	}
+	defer f.Close()
+
+	// Copy stdout from the command to keep this process alive until the
+	// subprocess exits.
+	io.Copy(os.Stdout, f)
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 57f6ae8de..9b6346ca2 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -380,6 +380,16 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) {
 	return port, nil
 }
 
+// FindIP returns the IP address of the container as a string.
+func (d *Docker) FindIP() (string, error) {
+	const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
+	out, err := do("inspect", "-f", format, d.Name)
+	if err != nil {
+		return "", fmt.Errorf("error retrieving IP: %v", err)
+	}
+	return strings.TrimSpace(out), nil
+}
+
 // SandboxPid returns the PID to the sandbox process.
 func (d *Docker) SandboxPid() (int, error) {
 	out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index c9add64ec..b59e1a70e 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -199,6 +199,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // The reason that the file is not opened initially as read-write is for better
 // performance with 'overlay2' storage driver. overlay2 eagerly copies the
 // entire file up when it's opened in write mode, and would perform badly when
+// multiple files are only being opened for read (esp. startup).
 type localFile struct {
 	p9.DefaultWalkGetAttr
 
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 27459e6d1..8001949d5 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/platform",
+        "//pkg/tcpip/header",
         "//pkg/tcpip/stack",
         "//pkg/urpc",
         "//runsc/boot",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index d42de0176..be8b72b3e 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -28,6 +28,7 @@ import (
 	"github.com/vishvananda/netlink"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot"
@@ -183,36 +184,39 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			continue
 		}
 
-		// Keep only IPv4 addresses.
-		var ip4addrs []*net.IPNet
+		var ipAddrs []*net.IPNet
 		for _, ifaddr := range allAddrs {
 			ipNet, ok := ifaddr.(*net.IPNet)
 			if !ok {
 				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
 			}
-			if ipNet.IP.To4() == nil {
-				log.Warningf("IPv6 is not supported, skipping: %v", ipNet)
-				continue
-			}
-			ip4addrs = append(ip4addrs, ipNet)
+			ipAddrs = append(ipAddrs, ipNet)
 		}
-		if len(ip4addrs) == 0 {
-			log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name)
+		if len(ipAddrs) == 0 {
+			log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
 			continue
 		}
 
 		// Scrape the routes before removing the address, since that
 		// will remove the routes as well.
-		routes, def, err := routesForIface(iface)
+		routes, defv4, defv6, err := routesForIface(iface)
 		if err != nil {
 			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
 		}
-		if def != nil {
-			if !args.DefaultGateway.Route.Empty() {
-				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway)
+		if defv4 != nil {
+			if !args.Defaultv4Gateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
 			}
-			args.DefaultGateway.Route = *def
-			args.DefaultGateway.Name = iface.Name
+			args.Defaultv4Gateway.Route = *defv4
+			args.Defaultv4Gateway.Name = iface.Name
+		}
+
+		if defv6 != nil {
+			if !args.Defaultv6Gateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
+			}
+			args.Defaultv6Gateway.Route = *defv6
+			args.Defaultv6Gateway.Name = iface.Name
 		}
 
 		link := boot.FDBasedLink{
@@ -247,6 +251,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 			}
 			args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
 		}
+
 		if link.GSOMaxSize == 0 && softwareGSO {
 			// Hardware GSO is disabled. Let's enable software GSO.
 			link.GSOMaxSize = stack.SoftwareGSOMaxSize
@@ -255,7 +260,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 
 		// Collect the addresses for the interface, enable forwarding,
 		// and remove them from the host.
-		for _, addr := range ip4addrs {
+		for _, addr := range ipAddrs {
 			link.Addresses = append(link.Addresses, addr.IP)
 
 			// Steal IP address from NIC.
@@ -351,46 +356,56 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink,
 }
 
 // routesForIface iterates over all routes for the given interface and converts
-// them to boot.Routes.
-func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
+// them to boot.Routes. It also returns the a default v4/v6 route if found.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
 	link, err := netlink.LinkByIndex(iface.Index)
 	if err != nil {
-		return nil, nil, err
+		return nil, nil, nil, err
 	}
 	rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
 	if err != nil {
-		return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
+		return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
 	}
 
-	var def *boot.Route
+	var defv4, defv6 *boot.Route
 	var routes []boot.Route
 	for _, r := range rs {
 		// Is it a default route?
 		if r.Dst == nil {
 			if r.Gw == nil {
-				return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
-			}
-			if r.Gw.To4() == nil {
-				log.Warningf("IPv6 is not supported, skipping default route: %v", r)
-				continue
-			}
-			if def != nil {
-				return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r)
+				return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
 			}
 			// Create a catch all route to the gateway.
-			def = &boot.Route{
-				Destination: net.IPNet{
-					IP:   net.IPv4zero,
-					Mask: net.IPMask(net.IPv4zero),
-				},
-				Gateway: r.Gw,
+			switch len(r.Gw) {
+			case header.IPv4AddressSize:
+				if defv4 != nil {
+					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
+				}
+				defv4 = &boot.Route{
+					Destination: net.IPNet{
+						IP:   net.IPv4zero,
+						Mask: net.IPMask(net.IPv4zero),
+					},
+					Gateway: r.Gw,
+				}
+			case header.IPv6AddressSize:
+				if defv6 != nil {
+					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
+				}
+
+				defv6 = &boot.Route{
+					Destination: net.IPNet{
+						IP:   net.IPv6zero,
+						Mask: net.IPMask(net.IPv6zero),
+					},
+					Gateway: r.Gw,
+				}
+			default:
+				return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
 			}
 			continue
 		}
-		if r.Dst.IP.To4() == nil {
-			log.Warningf("IPv6 is not supported, skipping route: %v", r)
-			continue
-		}
+
 		dst := *r.Dst
 		dst.IP = dst.IP.Mask(dst.Mask)
 		routes = append(routes, boot.Route{
@@ -398,7 +413,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
 			Gateway:     r.Gw,
 		})
 	}
-	return routes, def, nil
+	return routes, defv4, defv6, nil
 }
 
 // removeAddress removes IP address from network device. It's equivalent to:
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index ee9327fc8..805233184 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -1004,16 +1004,22 @@ func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
 // DestroyContainer destroys the given container. If it is the root container,
 // then the entire sandbox is destroyed.
 func (s *Sandbox) DestroyContainer(cid string) error {
+	if err := s.destroyContainer(cid); err != nil {
+		// If the sandbox isn't running, the container has already been destroyed,
+		// ignore the error in this case.
+		if s.IsRunning() {
+			return err
+		}
+	}
+	return nil
+}
+
+func (s *Sandbox) destroyContainer(cid string) error {
 	if s.IsRootContainer(cid) {
 		log.Debugf("Destroying root container %q by destroying sandbox", cid)
 		return s.destroy()
 	}
 
-	if !s.IsRunning() {
-		// Sandbox isn't running anymore, container is already destroyed.
-		return nil
-	}
-
 	log.Debugf("Destroying container %q in sandbox %q", cid, s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
diff --git a/scripts/dev.sh b/scripts/dev.sh
index c67003018..6238b4d0b 100755
--- a/scripts/dev.sh
+++ b/scripts/dev.sh
@@ -54,9 +54,10 @@ declare OUTPUT="$(build //runsc)"
 if [[ ${REFRESH} -eq 0 ]]; then
   install_runsc "${RUNTIME}"   --net-raw
   install_runsc "${RUNTIME}-d" --net-raw --debug --strace --log-packets
+  install_runsc "${RUNTIME}-p" --net-raw --profile
 
   echo
-  echo "Runtimes ${RUNTIME} and ${RUNTIME}-d (debug enabled) setup."
+  echo "Runtimes ${RUNTIME}, ${RUNTIME}-d (debug enabled), and ${RUNTIME}-p installed."
   echo "Use --runtime="${RUNTIME}" with your Docker command."
   echo "  docker run --rm --runtime="${RUNTIME}" hello-world"
   echo
diff --git a/scripts/go.sh b/scripts/go.sh
index 0dbfb7747..626ed8fa4 100755
--- a/scripts/go.sh
+++ b/scripts/go.sh
@@ -25,6 +25,8 @@ tools/go_branch.sh
 # Checkout the new branch.
 git checkout go && git clean -f
 
+go version
+
 # Build everything.
 go build ./...
 
diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
new file mode 100755
index 000000000..c47cbd675
--- /dev/null
+++ b/scripts/iptables_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+install_runsc_for_test iptables
+
+# Build the docker image for the test.
+run //test/iptables/runner --norun
+
+# TODO(gvisor.dev/issue/170): Also test this on runsc once iptables are better
+# supported
+test //test/iptables:iptables_test "--test_arg=--runtime=runc" \
+  "--test_arg=--image=bazel/test/iptables/runner:runner"
diff --git a/scripts/simple_tests.sh b/scripts/simple_tests.sh
index ef25afc2e..3a15050c2 100755
--- a/scripts/simple_tests.sh
+++ b/scripts/simple_tests.sh
@@ -17,4 +17,4 @@
 source $(dirname $0)/common.sh
 
 # Run all simple tests (locally).
-test //pkg/... //runsc/... //tools/... //benchmarks/...
+test //pkg/... //runsc/... //tools/... //benchmarks/... //benchmarks/runner:runner_test
diff --git a/test/iptables/BUILD b/test/iptables/BUILD
new file mode 100644
index 000000000..fa833c3b2
--- /dev/null
+++ b/test/iptables/BUILD
@@ -0,0 +1,31 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "iptables",
+    srcs = [
+        "filter_input.go",
+        "iptables.go",
+        "iptables_util.go",
+    ],
+    importpath = "gvisor.dev/gvisor/test/iptables",
+    visibility = ["//test/iptables:__subpackages__"],
+)
+
+go_test(
+    name = "iptables_test",
+    srcs = [
+        "iptables_test.go",
+    ],
+    embed = [":iptables"],
+    tags = [
+        "local",
+        "manual",
+    ],
+    deps = [
+        "//pkg/log",
+        "//runsc/dockerutil",
+        "//runsc/testutil",
+    ],
+)
diff --git a/test/iptables/README.md b/test/iptables/README.md
new file mode 100644
index 000000000..b37cb2a96
--- /dev/null
+++ b/test/iptables/README.md
@@ -0,0 +1,44 @@
+# iptables Tests
+
+iptables tests are run via `scripts/iptables\_test.sh`.
+
+## Test Structure
+
+Each test implements `TestCase`, providing (1) a function to run inside the
+container and (2) a function to run locally. Those processes are given each
+others' IP addresses. The test succeeds when both functions succeed.
+
+The function inside the container (`ContainerAction`) typically sets some
+iptables rules and then tries to send or receive packets. The local function
+(`LocalAction`) will typically just send or receive packets.
+
+### Adding Tests
+
+1) Add your test to the `iptables` package.
+
+2) Register the test in an `init` function via `RegisterTestCase` (see
+`filter_input.go` as an example).
+
+3) Add it to `iptables_test.go` (see the other tests in that file).
+
+Your test is now runnable with bazel!
+
+## Run individual tests
+
+Build the testing Docker container:
+
+```bash
+$ bazel run //test/iptables/runner -- --norun
+```
+
+Run an individual test via:
+
+```bash
+$ bazel test //test/iptables:iptables_test --test_filter=<TESTNAME>
+```
+
+To run an individual test with `runc`:
+
+```bash
+$ bazel test //test/iptables:iptables_test --test_filter=<TESTNAME> --test_arg=--runtime=runc
+```
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
new file mode 100644
index 000000000..923f44e68
--- /dev/null
+++ b/test/iptables/filter_input.go
@@ -0,0 +1,124 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"net"
+	"time"
+)
+
+const (
+	dropPort         = 2401
+	acceptPort       = 2402
+	sendloopDuration = 2 * time.Second
+	network          = "udp4"
+)
+
+func init() {
+	RegisterTestCase(FilterInputDropUDP{})
+	RegisterTestCase(FilterInputDropUDPPort{})
+	RegisterTestCase(FilterInputDropDifferentUDPPort{})
+}
+
+// FilterInputDropUDP tests that we can drop UDP traffic.
+type FilterInputDropUDP struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropUDP) Name() string {
+	return "FilterInputDropUDP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropUDP) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropUDP) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// FilterInputDropUDPPort tests that we can drop UDP traffic by port.
+type FilterInputDropUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropUDPPort) Name() string {
+	return "FilterInputDropUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropUDPPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on dropPort.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+	} else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+		return fmt.Errorf("error reading: %v", err)
+	}
+
+	// At this point we know that reading timed out and never received a
+	// packet.
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropUDPPort) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// FilterInputDropDifferentUDPPort tests that dropping traffic for a single UDP port
+// doesn't drop packets on other ports.
+type FilterInputDropDifferentUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropDifferentUDPPort) Name() string {
+	return "FilterInputDropDifferentUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+		return err
+	}
+
+	// Listen for UDP packets on another port.
+	if err := listenUDP(acceptPort, sendloopDuration); err != nil {
+		return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", acceptPort, err)
+	}
+
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables.go b/test/iptables/iptables.go
new file mode 100644
index 000000000..2e565d988
--- /dev/null
+++ b/test/iptables/iptables.go
@@ -0,0 +1,53 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package iptables contains a set of iptables tests implemented as TestCases
+package iptables
+
+import (
+	"fmt"
+	"net"
+)
+
+// IPExchangePort is the port the container listens on to receive the IP
+// address of the local process.
+const IPExchangePort = 2349
+
+// A TestCase contains one action to run in the container and one to run
+// locally. The actions run concurrently and each must succeed for the test
+// pass.
+type TestCase interface {
+	// Name returns the name of the test.
+	Name() string
+
+	// ContainerAction runs inside the container. It receives the IP of the
+	// local process.
+	ContainerAction(ip net.IP) error
+
+	// LocalAction runs locally. It receives the IP of the container.
+	LocalAction(ip net.IP) error
+}
+
+// Tests maps test names to TestCase.
+//
+// New TestCases are added by calling RegisterTestCase in an init function.
+var Tests = map[string]TestCase{}
+
+// RegisterTestCase registers tc so it can be run.
+func RegisterTestCase(tc TestCase) {
+	if _, ok := Tests[tc.Name()]; ok {
+		panic(fmt.Sprintf("TestCase %s already registered.", tc.Name()))
+	}
+	Tests[tc.Name()] = tc
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
new file mode 100644
index 000000000..bfbf1bb87
--- /dev/null
+++ b/test/iptables/iptables_test.go
@@ -0,0 +1,179 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"net"
+	"os"
+	"path"
+	"testing"
+	"time"
+
+	"flag"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/dockerutil"
+	"gvisor.dev/gvisor/runsc/testutil"
+)
+
+const timeout time.Duration = 10 * time.Second
+
+var image = flag.String("image", "bazel/test/iptables/runner:runner", "image to run tests in")
+
+type result struct {
+	output string
+	err    error
+}
+
+// singleTest runs a TestCase. Each test follows a pattern:
+// - Create a container.
+// - Get the container's IP.
+// - Send the container our IP.
+// - Start a new goroutine running the local action of the test.
+// - Wait for both the container and local actions to finish.
+//
+// Container output is logged to $TEST_UNDECLARED_OUTPUTS_DIR if it exists, or
+// to stderr.
+func singleTest(test TestCase) error {
+	if _, ok := Tests[test.Name()]; !ok {
+		return fmt.Errorf("no test found with name %q. Has it been registered?", test.Name())
+	}
+
+	// Create and start the container.
+	cont := dockerutil.MakeDocker("gvisor-iptables")
+	defer cont.CleanUp()
+	resultChan := make(chan *result)
+	go func() {
+		output, err := cont.RunFg("--cap-add=NET_ADMIN", *image, "-name", test.Name())
+		logContainer(output, err)
+		resultChan <- &result{output, err}
+	}()
+
+	// Get the container IP.
+	ip, err := getIP(cont)
+	if err != nil {
+		return fmt.Errorf("failed to get container IP: %v", err)
+	}
+
+	// Give the container our IP.
+	if err := sendIP(ip); err != nil {
+		return fmt.Errorf("failed to send IP to container: %v", err)
+	}
+
+	// Run our side of the test.
+	errChan := make(chan error)
+	go func() {
+		errChan <- test.LocalAction(ip)
+	}()
+
+	// Wait for both the container and local tests to finish.
+	var res *result
+	to := time.After(timeout)
+	for localDone := false; res == nil || !localDone; {
+		select {
+		case res = <-resultChan:
+			log.Infof("Container finished.")
+		case err, localDone = <-errChan:
+			log.Infof("Local finished.")
+			if err != nil {
+				return fmt.Errorf("local test failed: %v", err)
+			}
+		case <-to:
+			return fmt.Errorf("timed out after %f seconds", timeout.Seconds())
+		}
+	}
+
+	return res.err
+}
+
+func getIP(cont dockerutil.Docker) (net.IP, error) {
+	// The container might not have started yet, so retry a few times.
+	var ipStr string
+	to := time.After(timeout)
+	for ipStr == "" {
+		ipStr, _ = cont.FindIP()
+		select {
+		case <-to:
+			return net.IP{}, fmt.Errorf("timed out getting IP after %f seconds", timeout.Seconds())
+		default:
+			time.Sleep(250 * time.Millisecond)
+		}
+	}
+	ip := net.ParseIP(ipStr)
+	if ip == nil {
+		return net.IP{}, fmt.Errorf("invalid IP: %q", ipStr)
+	}
+	log.Infof("Container has IP of %s", ipStr)
+	return ip, nil
+}
+
+func sendIP(ip net.IP) error {
+	contAddr := net.TCPAddr{
+		IP:   ip,
+		Port: IPExchangePort,
+	}
+	var conn *net.TCPConn
+	// The container may not be listening when we first connect, so retry
+	// upon error.
+	cb := func() error {
+		c, err := net.DialTCP("tcp4", nil, &contAddr)
+		conn = c
+		return err
+	}
+	if err := testutil.Poll(cb, timeout); err != nil {
+		return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err)
+	}
+	if _, err := conn.Write([]byte{0}); err != nil {
+		return fmt.Errorf("error writing to container: %v", err)
+	}
+	return nil
+}
+
+func logContainer(output string, err error) {
+	msg := fmt.Sprintf("Container error: %v\nContainer output:\n%v", err, output)
+	if artifactsDir := os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); artifactsDir != "" {
+		fpath := path.Join(artifactsDir, "container.log")
+		if file, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE, 0644); err != nil {
+			log.Warningf("Failed to open log file %q: %v", fpath, err)
+		} else {
+			defer file.Close()
+			if _, err := file.Write([]byte(msg)); err == nil {
+				return
+			}
+			log.Warningf("Failed to write to log file %s: %v", fpath, err)
+		}
+	}
+
+	// We couldn't write to the output directory -- just log to stderr.
+	log.Infof(msg)
+}
+
+func TestFilterInputDropUDP(t *testing.T) {
+	if err := singleTest(FilterInputDropUDP{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputDropUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestFilterInputDropDifferentUDPPort(t *testing.T) {
+	if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
new file mode 100644
index 000000000..3a4d11f1a
--- /dev/null
+++ b/test/iptables/iptables_util.go
@@ -0,0 +1,82 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+	"fmt"
+	"net"
+	"os/exec"
+	"time"
+)
+
+const iptablesBinary = "iptables"
+
+// filterTable calls `iptables -t filter` with the given args.
+func filterTable(args ...string) error {
+	args = append([]string{"-t", "filter"}, args...)
+	cmd := exec.Command(iptablesBinary, args...)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("error running iptables with args %v\nerror: %v\noutput: %s", args, err, string(out))
+	}
+	return nil
+}
+
+// listenUDP listens on a UDP port and returns the value of net.Conn.Read() for
+// the first read on that port.
+func listenUDP(port int, timeout time.Duration) error {
+	localAddr := net.UDPAddr{
+		Port: port,
+	}
+	conn, err := net.ListenUDP(network, &localAddr)
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+	conn.SetDeadline(time.Now().Add(timeout))
+	_, err = conn.Read([]byte{0})
+	return err
+}
+
+// sendUDPLoop sends 1 byte UDP packets repeatedly to the IP and port specified
+// over a duration.
+func sendUDPLoop(ip net.IP, port int, duration time.Duration) error {
+	// Send packets for a few seconds.
+	remote := net.UDPAddr{
+		IP:   ip,
+		Port: port,
+	}
+	conn, err := net.DialUDP(network, nil, &remote)
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	to := time.After(duration)
+	for timedOut := false; !timedOut; {
+		// This may return an error (connection refused) if the remote
+		// hasn't started listening yet or they're dropping our
+		// packets. So we ignore Write errors and depend on the remote
+		// to report a failure if it doesn't get a packet it needs.
+		conn.Write([]byte{0})
+		select {
+		case <-to:
+			timedOut = true
+		default:
+			time.Sleep(200 * time.Millisecond)
+		}
+	}
+
+	return nil
+}
diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
new file mode 100644
index 000000000..1c59e26b9
--- /dev/null
+++ b/test/iptables/runner/BUILD
@@ -0,0 +1,16 @@
+load("@io_bazel_rules_docker//container:container.bzl", "container_image")
+load("@io_bazel_rules_docker//go:image.bzl", "go_image")
+
+package(licenses = ["notice"])
+
+container_image(
+    name = "iptables-base",
+    base = "@iptables-test//image",
+)
+
+go_image(
+    name = "runner",
+    srcs = ["main.go"],
+    base = ":iptables-base",
+    deps = ["//test/iptables"],
+)
diff --git a/test/iptables/runner/Dockerfile b/test/iptables/runner/Dockerfile
new file mode 100644
index 000000000..b77db44a1
--- /dev/null
+++ b/test/iptables/runner/Dockerfile
@@ -0,0 +1,4 @@
+# This Dockerfile builds the image hosted at
+# gcr.io/gvisor-presubmit/iptables-test.
+FROM ubuntu
+RUN apt update && apt install -y iptables
diff --git a/test/iptables/runner/main.go b/test/iptables/runner/main.go
new file mode 100644
index 000000000..3c794114e
--- /dev/null
+++ b/test/iptables/runner/main.go
@@ -0,0 +1,70 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package main runs iptables tests from within a docker container.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"net"
+
+	"gvisor.dev/gvisor/test/iptables"
+)
+
+var name = flag.String("name", "", "name of the test to run")
+
+func main() {
+	flag.Parse()
+
+	// Find out which test we're running.
+	test, ok := iptables.Tests[*name]
+	if !ok {
+		log.Fatalf("No test found named %q", *name)
+	}
+	log.Printf("Running test %q", *name)
+
+	// Get the IP of the local process.
+	ip, err := getIP()
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Run the test.
+	if err := test.ContainerAction(ip); err != nil {
+		log.Fatalf("Failed running test %q: %v", *name, err)
+	}
+}
+
+// getIP listens for a connection from the local process and returns the source
+// IP of that connection.
+func getIP() (net.IP, error) {
+	localAddr := net.TCPAddr{
+		Port: iptables.IPExchangePort,
+	}
+	listener, err := net.ListenTCP("tcp4", &localAddr)
+	if err != nil {
+		return net.IP{}, fmt.Errorf("failed listening for IP: %v", err)
+	}
+	defer listener.Close()
+	conn, err := listener.AcceptTCP()
+	if err != nil {
+		return net.IP{}, fmt.Errorf("failed accepting IP: %v", err)
+	}
+	defer conn.Close()
+	log.Printf("Connected to %v", conn.RemoteAddr())
+
+	return conn.RemoteAddr().(*net.TCPAddr).IP, nil
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 722d14b53..a3a85917d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -376,6 +376,8 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:rlimits_test")
 
+syscall_test(test = "//test/syscalls/linux:rseq_test")
+
 syscall_test(test = "//test/syscalls/linux:rtsignal_test")
 
 syscall_test(test = "//test/syscalls/linux:sched_test")
@@ -669,6 +671,7 @@ syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
     size = "medium",
+    add_hostinet = True,
     shard_count = 10,
     test = "//test/syscalls/linux:udp_socket_test",
 )
@@ -714,6 +717,11 @@ syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
 
+syscall_test(
+    add_overlay = True,
+    test = "//test/syscalls/linux:xattr_test",
+)
+
 go_binary(
     name = "syscall_test_runner",
     testonly = 1,
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index dcf5b73ed..aaf77c65b 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -9,6 +9,7 @@ def syscall_test(
         use_tmpfs = False,
         add_overlay = False,
         add_uds_tree = False,
+        add_hostinet = False,
         tags = None):
     _syscall_test(
         test = test,
@@ -65,6 +66,18 @@ def syscall_test(
             file_access = "shared",
         )
 
+    if add_hostinet:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = use_tmpfs,
+            network = "host",
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+        )
+
 def _syscall_test(
         test,
         shard_count,
@@ -72,6 +85,7 @@ def _syscall_test(
         platform,
         use_tmpfs,
         tags,
+        network = "none",
         file_access = "exclusive",
         overlay = False,
         add_uds_tree = False):
@@ -85,6 +99,8 @@ def _syscall_test(
         name += "_shared"
     if overlay:
         name += "_overlay"
+    if network != "none":
+        name += "_" + network + "net"
 
     if tags == None:
         tags = []
@@ -107,6 +123,7 @@ def _syscall_test(
         # Arguments are passed directly to syscall_test_runner binary.
         "--test-name=" + test_name,
         "--platform=" + platform,
+        "--network=" + network,
         "--use-tmpfs=" + str(use_tmpfs),
         "--file-access=" + file_access,
         "--overlay=" + str(overlay),
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 9cca78a93..e6568128e 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -6,6 +6,17 @@ package(
     licenses = ["notice"],
 )
 
+exports_files(
+    [
+        "socket.cc",
+        "socket_ip_loopback_blocking.cc",
+        "socket_ipv4_udp_unbound_loopback.cc",
+        "tcp_socket.cc",
+        "udp_socket.cc",
+    ],
+    visibility = ["//:sandbox"],
+)
+
 cc_binary(
     name = "sigaltstack_check",
     testonly = 1,
@@ -743,6 +754,7 @@ cc_binary(
         "//test/util:eventfd_util",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
+        "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:timer_util",
@@ -1842,6 +1854,22 @@ cc_binary(
 )
 
 cc_binary(
+    name = "rseq_test",
+    testonly = 1,
+    srcs = ["rseq.cc"],
+    data = ["//test/syscalls/linux/rseq"],
+    linkstatic = 1,
+    deps = [
+        "//test/syscalls/linux/rseq:lib",
+        "//test/util:logging",
+        "//test/util:multiprocess_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_binary(
     name = "rtsignal_test",
     testonly = 1,
     srcs = ["rtsignal.cc"],
@@ -3245,8 +3273,6 @@ cc_binary(
     testonly = 1,
     srcs = ["tcp_socket.cc"],
     linkstatic = 1,
-    # FIXME(b/135470853)
-    tags = ["flaky"],
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
@@ -3697,3 +3723,24 @@ cc_binary(
         "@com_google_googletest//:gtest",
     ],
 )
+
+cc_binary(
+    name = "xattr_test",
+    testonly = 1,
+    srcs = [
+        "file_base.h",
+        "xattr.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:posix_error",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 8a45be12a..4f3aa81d6 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -14,6 +14,7 @@
 
 #include <fcntl.h>
 #include <signal.h>
+#include <sys/types.h>
 #include <syscall.h>
 #include <unistd.h>
 
@@ -32,6 +33,7 @@
 #include "test/util/eventfd_util.h"
 #include "test/util/multiprocess_util.h"
 #include "test/util/posix_error.h"
+#include "test/util/save_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/timer_util.h"
@@ -910,8 +912,166 @@ TEST(FcntlTest, GetOwn) {
   FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
 
-  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), 0);
+  MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnEx) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &owner),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST(FcntlTest, SetOwnExInvalidType) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = __pid_type(-1);
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(FcntlTest, SetOwnExInvalidTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_TID;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PID;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PGRP;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_TID;
+  EXPECT_THAT(owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PID;
+  EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PGRP;
+  EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  // NOTE(igudger): I don't understand why, but this is flaky on Linux.
+  // GetOwnExPgrp (below) does not have this issue.
+  SKIP_IF(!IsRunningOnGvisor());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), -owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnExTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_TID;
+  EXPECT_THAT(set_owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_PID;
+  EXPECT_THAT(set_owner.pid = getpid(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_PGRP;
+  EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
               SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc
index c4f8bff08..b0a07a064 100644
--- a/test/syscalls/linux/ioctl.cc
+++ b/test/syscalls/linux/ioctl.cc
@@ -215,7 +215,8 @@ TEST_F(IoctlTest, FIOASYNCSelfTarget2) {
   auto mask_cleanup =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
 
-  pid_t pid = getpid();
+  pid_t pid = -1;
+  EXPECT_THAT(pid = getpid(), SyscallSucceeds());
   EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
 
   int set = 1;
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 072230d85..9cb4566db 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -26,25 +26,6 @@
 namespace gvisor {
 namespace testing {
 
-// Possible values of the "st" field in a /proc/net/{tcp,udp} entry. Source:
-// Linux kernel, include/net/tcp_states.h.
-enum {
-  TCP_ESTABLISHED = 1,
-  TCP_SYN_SENT,
-  TCP_SYN_RECV,
-  TCP_FIN_WAIT1,
-  TCP_FIN_WAIT2,
-  TCP_TIME_WAIT,
-  TCP_CLOSE,
-  TCP_CLOSE_WAIT,
-  TCP_LAST_ACK,
-  TCP_LISTEN,
-  TCP_CLOSING,
-  TCP_NEW_SYN_RECV,
-
-  TCP_MAX_STATES
-};
-
 // Extracts the IP address from an inet sockaddr in network byte order.
 uint32_t IPFromInetSockaddr(const struct sockaddr* addr);
 
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 512de5ee0..8cf08991b 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -37,6 +37,7 @@
 #include <map>
 #include <memory>
 #include <ostream>
+#include <regex>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -51,6 +52,7 @@
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/synchronization/notification.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/util/capability_util.h"
@@ -1988,6 +1990,44 @@ TEST(Proc, GetdentsEnoent) {
               SyscallFailsWithErrno(ENOENT));
 }
 
+void CheckSyscwFromIOFile(const std::string& path, const std::string& regex) {
+  std::string output;
+  ASSERT_NO_ERRNO(GetContents(path, &output));
+  ASSERT_THAT(output, ContainsRegex(absl::StrCat("syscw:\\s+", regex, "\n")));
+}
+
+// Checks that there is variable accounting of IO between threads/tasks.
+TEST(Proc, PidTidIOAccounting) {
+  absl::Notification notification;
+
+  // Run a thread with a bunch of writes. Check that io account records exactly
+  // the number of write calls. File open/close is there to prevent buffering.
+  ScopedThread writer([&notification] {
+    const int num_writes = 100;
+    for (int i = 0; i < num_writes; i++) {
+      auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+      ASSERT_NO_ERRNO(SetContents(path.path(), "a"));
+    }
+    notification.Notify();
+    const std::string& writer_dir =
+        absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+    CheckSyscwFromIOFile(writer_dir, std::to_string(num_writes));
+  });
+
+  // Run a thread and do no writes. Check that no writes are recorded.
+  ScopedThread noop([&notification] {
+    notification.WaitForNotification();
+    const std::string& noop_dir =
+        absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+    CheckSyscwFromIOFile(noop_dir, "0");
+  });
+
+  writer.Join();
+  noop.Join();
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index 2659f6a98..5b6e3e3cd 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <netinet/tcp.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
index f06f1a24b..786b4b4af 100644
--- a/test/syscalls/linux/proc_net_udp.cc
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <netinet/tcp.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
new file mode 100644
index 000000000..106c045e3
--- /dev/null
+++ b/test/syscalls/linux/rseq.cc
@@ -0,0 +1,198 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Syscall test for rseq (restartable sequences).
+//
+// We must be very careful about how these tests are written. Each thread may
+// only have one struct rseq registration, which may be done automatically at
+// thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
+// not do so).
+//
+// Testing of rseq is thus done primarily in a child process with no
+// registration. This means exec'ing a nostdlib binary, as rseq registration can
+// only be cleared by execve (or knowing the old rseq address), and glibc (based
+// on the current unmerged patches) register rseq before calling main()).
+
+int RSeq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+  return syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Returns true if this kernel supports the rseq syscall.
+PosixErrorOr<bool> RSeqSupported() {
+  // We have to be careful here, there are three possible cases:
+  //
+  // 1. rseq is not supported -> ENOSYS
+  // 2. rseq is supported and not registered -> success, but we should
+  //    unregister.
+  // 3. rseq is supported and registered -> EINVAL (most likely).
+
+  // The only validation done on new registrations is that rseq is aligned and
+  // writable.
+  rseq rseq = {};
+  int ret = RSeq(&rseq, sizeof(rseq), 0, 0);
+  if (ret == 0) {
+    // Successfully registered, rseq is supported. Unregister.
+    ret = RSeq(&rseq, sizeof(rseq), kRseqFlagUnregister, 0);
+    if (ret != 0) {
+      return PosixError(errno);
+    }
+    return true;
+  }
+
+  switch (errno) {
+    case ENOSYS:
+      // Not supported.
+      return false;
+    case EINVAL:
+      // Supported, but already registered. EINVAL returned because we provided
+      // a different address.
+      return true;
+    default:
+      // Unknown error.
+      return PosixError(errno);
+  }
+}
+
+constexpr char kRseqBinary[] = "test/syscalls/linux/rseq/rseq";
+
+void RunChildTest(std::string test_case, int want_status) {
+  std::string path = RunfilePath(kRseqBinary);
+
+  pid_t child_pid = -1;
+  int execve_errno = 0;
+  auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+      ForkAndExec(path, {path, test_case}, {}, &child_pid, &execve_errno));
+
+  ASSERT_GT(child_pid, 0);
+  ASSERT_EQ(execve_errno, 0);
+
+  int status = 0;
+  ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+  ASSERT_EQ(status, want_status);
+}
+
+// Test that rseq must be aligned.
+TEST(RseqTest, Unaligned) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnaligned, 0);
+}
+
+// Sanity test that registration works.
+TEST(RseqTest, Register) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestRegister, 0);
+}
+
+// Registration can't be done twice.
+TEST(RseqTest, DoubleRegister) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestDoubleRegister, 0);
+}
+
+// Registration can be done again after unregister.
+TEST(RseqTest, RegisterUnregister) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestRegisterUnregister, 0);
+}
+
+// The pointer to rseq must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentPtr) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnregisterDifferentPtr, 0);
+}
+
+// The signature must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentSignature) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnregisterDifferentSignature, 0);
+}
+
+// The CPU ID is initialized.
+TEST(RseqTest, CPU) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestCPU, 0);
+}
+
+// Critical section is eventually aborted.
+TEST(RseqTest, Abort) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbort, 0);
+}
+
+// Abort may be before the critical section.
+TEST(RseqTest, AbortBefore) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortBefore, 0);
+}
+
+// Signature must match.
+TEST(RseqTest, AbortSignature) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortSignature, SIGSEGV);
+}
+
+// Abort must not be in the critical section.
+TEST(RseqTest, AbortPreCommit) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortPreCommit, SIGSEGV);
+}
+
+// rseq.rseq_cs is cleared on abort.
+TEST(RseqTest, AbortClearsCS) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortClearsCS, 0);
+}
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+TEST(RseqTest, InvalidAbortClearsCS) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestInvalidAbortClearsCS, 0);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
new file mode 100644
index 000000000..5cfe4e56f
--- /dev/null
+++ b/test/syscalls/linux/rseq/BUILD
@@ -0,0 +1,59 @@
+# This package contains a standalone rseq test binary. This binary must not
+# depend on libc, which might use rseq itself.
+
+load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier")
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+package(licenses = ["notice"])
+
+genrule(
+    name = "rseq_binary",
+    srcs = [
+        "critical.h",
+        "critical.S",
+        "rseq.cc",
+        "syscalls.h",
+        "start.S",
+        "test.h",
+        "types.h",
+        "uapi.h",
+    ],
+    outs = ["rseq"],
+    cmd = " ".join([
+        "$(CC)",
+        "$(CC_FLAGS) ",
+        "-I.",
+        "-Wall",
+        "-Werror",
+        "-O2",
+        "-std=c++17",
+        "-static",
+        "-nostdlib",
+        "-ffreestanding",
+        "-o",
+        "$(location rseq)",
+        "$(location critical.S)",
+        "$(location rseq.cc)",
+        "$(location start.S)",
+    ]),
+    toolchains = [
+        ":no_pie_cc_flags",
+        "@bazel_tools//tools/cpp:current_cc_toolchain",
+    ],
+    visibility = ["//:sandbox"],
+)
+
+cc_flags_supplier(
+    name = "no_pie_cc_flags",
+    features = ["-pie"],
+)
+
+cc_library(
+    name = "lib",
+    testonly = 1,
+    hdrs = [
+        "test.h",
+        "uapi.h",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/test/syscalls/linux/rseq/critical.S b/test/syscalls/linux/rseq/critical.S
new file mode 100644
index 000000000..8c0687e6d
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.S
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+  .text
+  .globl  rseq_loop
+  .type   rseq_loop, @function
+
+rseq_loop:
+  jmp begin
+
+  // Abort block before the critical section.
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+  .globl  rseq_loop_early_abort
+rseq_loop_early_abort:
+  ret
+
+begin:
+  // r->rseq_cs = cs
+  movq %rsi, 8(%rdi)
+
+  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+  // section. Thus it must be set in or immediately before the critical section
+  // to ensure it is not cleared before the section begins.
+  .globl  rseq_loop_start
+rseq_loop_start:
+  jmp rseq_loop_start
+
+  // "Pre-commit": extra instructions inside the critical section.  These are
+  // used as the abort point in TestAbortPreCommit, which is not valid.
+  .globl  rseq_loop_pre_commit
+rseq_loop_pre_commit:
+  // Extra abort signature + nop for TestAbortPostCommit.
+  .byte 0x90, 0x90, 0x90, 0x90
+  nop
+
+  // "Post-commit": never reached in this case.
+  .globl  rseq_loop_post_commit
+rseq_loop_post_commit:
+
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+
+  .globl  rseq_loop_abort
+rseq_loop_abort:
+  ret
+
+  .size  rseq_loop,.-rseq_loop
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical.h b/test/syscalls/linux/rseq/critical.h
new file mode 100644
index 000000000..ac987a25e
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.h
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+constexpr uint32_t kRseqSignature = 0x90909090;
+
+extern "C" {
+
+extern void rseq_loop(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_loop_early_abort;
+extern void* rseq_loop_start;
+extern void* rseq_loop_pre_commit;
+extern void* rseq_loop_post_commit;
+extern void* rseq_loop_abort;
+
+extern int rseq_getpid(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_getpid_start;
+extern void* rseq_getpid_post_commit;
+extern void* rseq_getpid_abort;
+
+}  // extern "C"
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc
new file mode 100644
index 000000000..f036db26d
--- /dev/null
+++ b/test/syscalls/linux/rseq/rseq.cc
@@ -0,0 +1,366 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/rseq/critical.h"
+#include "test/syscalls/linux/rseq/syscalls.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+namespace gvisor {
+namespace testing {
+
+extern "C" int main(int argc, char** argv, char** envp);
+
+// Standalone initialization before calling main().
+extern "C" void __init(uintptr_t* sp) {
+  int argc = sp[0];
+  char** argv = reinterpret_cast<char**>(&sp[1]);
+  char** envp = &argv[argc + 1];
+
+  // Call main() and exit.
+  sys_exit_group(main(argc, argv, envp));
+
+  // sys_exit_group does not return
+}
+
+int strcmp(const char* s1, const char* s2) {
+  const unsigned char* p1 = reinterpret_cast<const unsigned char*>(s1);
+  const unsigned char* p2 = reinterpret_cast<const unsigned char*>(s2);
+
+  while (*p1 == *p2) {
+    if (!*p1) {
+      return 0;
+    }
+    ++p1;
+    ++p2;
+  }
+  return static_cast<int>(*p1) - static_cast<int>(*p2);
+}
+
+int sys_rseq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+  return raw_syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Test that rseq must be aligned.
+int TestUnaligned() {
+  constexpr uintptr_t kRequiredAlignment = alignof(rseq);
+
+  char buf[2 * kRequiredAlignment] = {};
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(&buf[0]);
+  if ((ptr & (kRequiredAlignment - 1)) == 0) {
+    // buf is already aligned. Misalign it.
+    ptr++;
+  }
+
+  int ret = sys_rseq(reinterpret_cast<rseq*>(ptr), sizeof(rseq), 0, 0);
+  if (sys_errno(ret) != EINVAL) {
+    return 1;
+  }
+  return 0;
+}
+
+// Sanity test that registration works.
+int TestRegister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+  return 0;
+};
+
+// Registration can't be done twice.
+int TestDoubleRegister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != EBUSY) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// Registration can be done again after unregister.
+int TestRegisterUnregister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, 0);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The pointer to rseq must match on register/unregister.
+int TestUnregisterDifferentPtr() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq r2 = {};
+  if (int ret = sys_rseq(&r2, sizeof(r2), kRseqFlagUnregister, 0);
+      sys_errno(ret) != EINVAL) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The signature must match on register/unregister.
+int TestUnregisterDifferentSignature() {
+  constexpr int kSignature = 0;
+
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kSignature); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, kSignature + 1);
+      sys_errno(ret) != EPERM) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The CPU ID is initialized.
+int TestCPU() {
+  struct rseq r = {};
+  r.cpu_id = kRseqCPUIDUninitialized;
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (__atomic_load_n(&r.cpu_id, __ATOMIC_RELAXED) < 0) {
+    return 1;
+  }
+  if (__atomic_load_n(&r.cpu_id_start, __ATOMIC_RELAXED) < 0) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// Critical section is eventually aborted.
+int TestAbort() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  return 0;
+};
+
+// Abort may be before the critical section.
+int TestAbortBefore() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_early_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  return 0;
+};
+
+// Signature must match.
+int TestAbortSignature() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. This should SIGSEGV on abort.
+  rseq_loop(&r, &cs);
+
+  return 1;
+};
+
+// Abort must not be in the critical section.
+int TestAbortPreCommit() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_pre_commit);
+
+  // Loops until abort. This should SIGSEGV on abort.
+  rseq_loop(&r, &cs);
+
+  return 1;
+};
+
+// rseq.rseq_cs is cleared on abort.
+int TestAbortClearsCS() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  if (__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+int TestInvalidAbortClearsCS() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  __atomic_store_n(&r.rseq_cs, &cs, __ATOMIC_RELAXED);
+
+  // When the next abort condition occurs, the kernel will clear cs once it
+  // determines we aren't in the critical section.
+  while (1) {
+    if (!__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+
+  return 0;
+};
+
+// Exit codes:
+//  0 - Pass
+//  1 - Fail
+//  2 - Missing argument
+//  3 - Unknown test case
+extern "C" int main(int argc, char** argv, char** envp) {
+  if (argc != 2) {
+    // Usage: rseq <test case>
+    return 2;
+  }
+
+  if (strcmp(argv[1], kRseqTestUnaligned) == 0) {
+    return TestUnaligned();
+  }
+  if (strcmp(argv[1], kRseqTestRegister) == 0) {
+    return TestRegister();
+  }
+  if (strcmp(argv[1], kRseqTestDoubleRegister) == 0) {
+    return TestDoubleRegister();
+  }
+  if (strcmp(argv[1], kRseqTestRegisterUnregister) == 0) {
+    return TestRegisterUnregister();
+  }
+  if (strcmp(argv[1], kRseqTestUnregisterDifferentPtr) == 0) {
+    return TestUnregisterDifferentPtr();
+  }
+  if (strcmp(argv[1], kRseqTestUnregisterDifferentSignature) == 0) {
+    return TestUnregisterDifferentSignature();
+  }
+  if (strcmp(argv[1], kRseqTestCPU) == 0) {
+    return TestCPU();
+  }
+  if (strcmp(argv[1], kRseqTestAbort) == 0) {
+    return TestAbort();
+  }
+  if (strcmp(argv[1], kRseqTestAbortBefore) == 0) {
+    return TestAbortBefore();
+  }
+  if (strcmp(argv[1], kRseqTestAbortSignature) == 0) {
+    return TestAbortSignature();
+  }
+  if (strcmp(argv[1], kRseqTestAbortPreCommit) == 0) {
+    return TestAbortPreCommit();
+  }
+  if (strcmp(argv[1], kRseqTestAbortClearsCS) == 0) {
+    return TestAbortClearsCS();
+  }
+  if (strcmp(argv[1], kRseqTestInvalidAbortClearsCS) == 0) {
+    return TestInvalidAbortClearsCS();
+  }
+
+  return 3;
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/rseq/start.S b/test/syscalls/linux/rseq/start.S
new file mode 100644
index 000000000..b9611b276
--- /dev/null
+++ b/test/syscalls/linux/rseq/start.S
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+  .text
+  .align 4
+  .type  _start,@function
+  .globl  _start
+
+_start:
+  movq  %rsp,%rdi
+  call  __init
+  hlt
+
+  .size  _start,.-_start
+  .section  .note.GNU-stack,"",@progbits
+
+  .text
+  .globl  raw_syscall
+  .type   raw_syscall, @function
+
+raw_syscall:
+  mov  %rdi,%rax      // syscall #
+  mov  %rsi,%rdi      // arg0
+  mov  %rdx,%rsi      // arg1
+  mov  %rcx,%rdx      // arg2
+  mov  %r8,%r10       // arg3 (goes in r10 instead of rcx for system calls)
+  mov  %r9,%r8        // arg4
+  mov  0x8(%rsp),%r9  // arg5
+  syscall
+  ret
+
+  .size  raw_syscall,.-raw_syscall
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h
new file mode 100644
index 000000000..e5299c188
--- /dev/null
+++ b/test/syscalls/linux/rseq/syscalls.h
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+
+#ifdef __x86_64__
+// Syscall numbers.
+constexpr int kGetpid = 39;
+constexpr int kExitGroup = 231;
+#else
+#error "Unknown architecture"
+#endif
+
+namespace gvisor {
+namespace testing {
+
+// Standalone system call interfaces.
+// Note that these are all "raw" system call interfaces which encode
+// errors by setting the return value to a small negative number.
+// Use sys_errno() to check system call return values for errors.
+
+// Maximum Linux error number.
+constexpr int kMaxErrno = 4095;
+
+// Errno values.
+#define EPERM 1
+#define EFAULT 14
+#define EBUSY 16
+#define EINVAL 22
+
+// Get the error number from a raw system call return value.
+// Returns a positive error number or 0 if there was no error.
+static inline int sys_errno(uintptr_t rval) {
+  if (rval >= static_cast<uintptr_t>(-kMaxErrno)) {
+    return -static_cast<int>(rval);
+  }
+  return 0;
+}
+
+extern "C" uintptr_t raw_syscall(int number, ...);
+
+static inline void sys_exit_group(int status) {
+  raw_syscall(kExitGroup, status);
+}
+static inline int sys_getpid() {
+  return static_cast<int>(raw_syscall(kGetpid));
+}
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
diff --git a/test/syscalls/linux/rseq/test.h b/test/syscalls/linux/rseq/test.h
new file mode 100644
index 000000000..3b7bb74b1
--- /dev/null
+++ b/test/syscalls/linux/rseq/test.h
@@ -0,0 +1,43 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+
+namespace gvisor {
+namespace testing {
+
+// Test cases supported by rseq binary.
+
+inline constexpr char kRseqTestUnaligned[] = "unaligned";
+inline constexpr char kRseqTestRegister[] = "register";
+inline constexpr char kRseqTestDoubleRegister[] = "double-register";
+inline constexpr char kRseqTestRegisterUnregister[] = "register-unregister";
+inline constexpr char kRseqTestUnregisterDifferentPtr[] =
+    "unregister-different-ptr";
+inline constexpr char kRseqTestUnregisterDifferentSignature[] =
+    "unregister-different-signature";
+inline constexpr char kRseqTestCPU[] = "cpu";
+inline constexpr char kRseqTestAbort[] = "abort";
+inline constexpr char kRseqTestAbortBefore[] = "abort-before";
+inline constexpr char kRseqTestAbortSignature[] = "abort-signature";
+inline constexpr char kRseqTestAbortPreCommit[] = "abort-precommit";
+inline constexpr char kRseqTestAbortClearsCS[] = "abort-clears-cs";
+inline constexpr char kRseqTestInvalidAbortClearsCS[] =
+    "invalid-abort-clears-cs";
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
diff --git a/test/syscalls/linux/rseq/types.h b/test/syscalls/linux/rseq/types.h
new file mode 100644
index 000000000..b6afe9817
--- /dev/null
+++ b/test/syscalls/linux/rseq/types.h
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+
+using size_t = __SIZE_TYPE__;
+using uintptr_t = __UINTPTR_TYPE__;
+
+using uint8_t = __UINT8_TYPE__;
+using uint16_t = __UINT16_TYPE__;
+using uint32_t = __UINT32_TYPE__;
+using uint64_t = __UINT64_TYPE__;
+
+using int8_t = __INT8_TYPE__;
+using int16_t = __INT16_TYPE__;
+using int32_t = __INT32_TYPE__;
+using int64_t = __INT64_TYPE__;
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
new file mode 100644
index 000000000..e3ff0579a
--- /dev/null
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -0,0 +1,54 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+
+// User-kernel ABI for restartable sequences.
+
+// Standard types.
+//
+// N.B. This header will be included in targets that do have the standard
+// library, so we can't shadow the standard type names.
+using __u32 = __UINT32_TYPE__;
+using __u64 = __UINT64_TYPE__;
+
+#ifdef __x86_64__
+// Syscall numbers.
+constexpr int kRseqSyscall = 334;
+#else
+#error "Unknown architecture"
+#endif  // __x86_64__
+
+struct rseq_cs {
+  __u32 version;
+  __u32 flags;
+  __u64 start_ip;
+  __u64 post_commit_offset;
+  __u64 abort_ip;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+// N.B. alignment is enforced by the kernel.
+struct rseq {
+  __u32 cpu_id_start;
+  __u32 cpu_id;
+  struct rseq_cs* rseq_cs;
+  __u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+constexpr int kRseqFlagUnregister = 1 << 0;
+
+constexpr int kRseqCPUIDUninitialized = -1;
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index e4641c62e..34b1058a9 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -97,12 +97,22 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
     sockets_to_close_.erase(socket_id);
   }
 
-  // Bind a socket with the reuse option and bind_to_device options.  Checks
+  // SetDevice changes the bind_to_device option. It does not bind or re-bind.
+  void SetDevice(int socket_id, int device_id) {
+    auto socket_fd = sockets_to_close_[socket_id]->get();
+    string device_name;
+    ASSERT_NO_FATAL_FAILURE(GetDevice(device_id, &device_name));
+    EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE,
+                           device_name.c_str(), device_name.size() + 1),
+                SyscallSucceedsWithValue(0));
+  }
+
+  // Bind a socket with the reuse options and bind_to_device options. Checks
   // that all steps succeed and that the bind command's error matches want.
   // Sets the socket_id to uniquely identify the socket bound if it is not
   // nullptr.
-  void BindSocket(bool reuse, int device_id = 0, int want = 0,
-                  int *socket_id = nullptr) {
+  void BindSocket(bool reuse_port, bool reuse_addr, int device_id = 0,
+                  int want = 0, int *socket_id = nullptr) {
     next_socket_id_++;
     sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
     auto socket_fd = sockets_to_close_[next_socket_id_]->get();
@@ -110,13 +120,20 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
       *socket_id = next_socket_id_;
     }
 
-    // If reuse is indicated, do that.
-    if (reuse) {
+    // If reuse_port is indicated, do that.
+    if (reuse_port) {
       EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
                              sizeof(kSockOptOn)),
                   SyscallSucceedsWithValue(0));
     }
 
+    // If reuse_addr is indicated, do that.
+    if (reuse_addr) {
+      EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                             sizeof(kSockOptOn)),
+                  SyscallSucceedsWithValue(0));
+    }
+
     // If the device is non-zero, bind to that device.
     if (device_id != 0) {
       string device_name;
@@ -182,129 +199,308 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
 };
 
 TEST_P(BindToDeviceSequenceTest, BindTwiceWithDeviceFails) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 3));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 3, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindToDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 1));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 2));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 1));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 2));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindToDeviceAndThenWithoutDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithoutDevice) {
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ false));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 456, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 789, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 789, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithReuse) {
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
   ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true, /* bind_to_device */ 0));
+      BindSocket(/* reusePort */ true, /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false,
+      /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindingWithReuseAndDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 456));
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 999, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse_port */ true, /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 999, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, MixingReuseAndNotReuseByBindingToDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 456, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 999, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 789, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 999, 0));
 }
 
 TEST_P(BindToDeviceSequenceTest, CannotBindTo0AfterMixingReuseAndNotReuse) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 456));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindAndRelease) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
   int to_release;
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, 0, &to_release));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 345, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, 0, &to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 345, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
   // Release the bind to device 0 and try again.
   ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 345));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 345));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindTwiceWithReuseOnce) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
   ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+      BindSocket(/* reusePort */ false, /* reuse_addr */ true));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+}
+
+TEST_P(BindToDeviceSequenceTest,
+       CannotBindTo0AfterMixingReuseAddrAndNotReuseAddr) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReusePortThenReuseAddrReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrThenReuseAddr) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+// This behavior seems like a bug?
+TEST_P(BindToDeviceSequenceTest,
+       BindReuseAddrThenReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
+}
+
+// Repro test for gvisor.dev/issue/1217. Not replicated in ports_test.go as this
+// test is different from the others and wouldn't fit well there.
+TEST_P(BindToDeviceSequenceTest, BindAndReleaseDifferentDevice) {
+  int to_release;
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, 0, &to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, EADDRINUSE));
+  // Change the device. Since the socket was already bound, this should have no
+  // effect.
+  SetDevice(to_release, 2);
+  // Release the bind to device 3 and try again.
+  ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
 }
 
 INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest,
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 96a1731cf..761c3a9fe 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -206,7 +206,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   }
   // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked
   // before function end.
-  // ds.reset()
+  // ds.reset();
 }
 
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
@@ -603,6 +603,60 @@ TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
               SyscallSucceeds());
 }
 
+TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the userTimeout on the listening socket.
+  constexpr int kUserTimeout = 10;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kUserTimeout, sizeof(kUserTimeout)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+  // Verify that the accepted socket inherited the user timeout set on
+  // listening socket.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(accepted.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kUserTimeout);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
@@ -635,7 +689,9 @@ INSTANTIATE_TEST_SUITE_P(
 
 using SocketInetReusePortTest = ::testing::TestWithParam<TestParam>;
 
-TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
+// TODO(gvisor.dev/issue/940): Remove _NoRandomSave when portHint/stack.Seed is
+// saved/restored.
+TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -643,6 +699,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
   sockaddr_storage listen_addr = listener.addr;
   sockaddr_storage conn_addr = connector.addr;
   constexpr int kThreadCount = 3;
+  constexpr int kConnectAttempts = 4096;
 
   // Create the listening socket.
   FileDescriptor listener_fds[kThreadCount];
@@ -657,7 +714,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
     ASSERT_THAT(
         bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
         SyscallSucceeds());
-    ASSERT_THAT(listen(fd, 40), SyscallSucceeds());
+    ASSERT_THAT(listen(fd, kConnectAttempts / 3), SyscallSucceeds());
 
     // On the first bind we need to determine which port was bound.
     if (i != 0) {
@@ -676,7 +733,6 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
   }
 
-  constexpr int kConnectAttempts = 10000;
   std::atomic<int> connects_received = ATOMIC_VAR_INIT(0);
   std::unique_ptr<ScopedThread> listen_thread[kThreadCount];
   int accept_counts[kThreadCount] = {};
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index c74273436..57ce8e169 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -812,5 +812,68 @@ TEST_P(TCPSocketPairTest, TestTCPCloseWithData) {
   ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
 }
 
+TEST_P(TCPSocketPairTest, TCPUserTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kZero = 0;
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kZero, sizeof(kZero)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutBelowZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kNeg = -10;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kNeg, sizeof(kNeg)),
+              SyscallFailsWithErrno(EINVAL));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kAbove = 10;
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kAbove, sizeof(kAbove)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kAbove);
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 8a28202a8..4cf1f76f1 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -65,6 +65,21 @@ TEST_P(UnixSocketPairTest, BindToBadName) {
       SyscallFailsWithErrno(ENOENT));
 }
 
+TEST_P(UnixSocketPairTest, BindToBadFamily) {
+  auto pair =
+      ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+  constexpr char kBadName[] = "/some/path/that/does/not/exist";
+  sockaddr_un sockaddr;
+  sockaddr.sun_family = AF_INET;
+  memcpy(sockaddr.sun_path, kBadName, sizeof(kBadName));
+
+  EXPECT_THAT(
+      bind(pair->first_fd(), reinterpret_cast<struct sockaddr*>(&sockaddr),
+           sizeof(sockaddr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
   char sent_data[10];
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 99863b0ed..6b99c021d 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -967,6 +967,78 @@ TEST_P(SimpleTcpSocketTest, BlockingConnectRefused) {
   EXPECT_THAT(close(s.release()), SyscallSucceeds());
 }
 
+// Test that connecting to a non-listening port and thus receiving a RST is
+// handled appropriately by the socket - the port that the socket was bound to
+// is released and the expected error is returned.
+TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) {
+  // Create a socket that is known to not be listening. As is it bound but not
+  // listening, when another socket connects to the port, it will refuse..
+  FileDescriptor bound_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage bound_addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t bound_addrlen = sizeof(bound_addr);
+
+  ASSERT_THAT(
+      bind(bound_s.get(), reinterpret_cast<struct sockaddr*>(&bound_addr),
+           bound_addrlen),
+      SyscallSucceeds());
+
+  // Get the addresses the socket is bound to because the port is chosen by the
+  // stack.
+  ASSERT_THAT(getsockname(bound_s.get(),
+                          reinterpret_cast<struct sockaddr*>(&bound_addr),
+                          &bound_addrlen),
+              SyscallSucceeds());
+
+  // Create, initialize, and bind the socket that is used to test connecting to
+  // the non-listening port.
+  FileDescriptor client_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // Initialize client address to the loopback one.
+  sockaddr_storage client_addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t client_addrlen = sizeof(client_addr);
+
+  ASSERT_THAT(
+      bind(client_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+           client_addrlen),
+      SyscallSucceeds());
+
+  ASSERT_THAT(getsockname(client_s.get(),
+                          reinterpret_cast<struct sockaddr*>(&client_addr),
+                          &client_addrlen),
+              SyscallSucceeds());
+
+  // Now the test: connect to the bound but not listening socket with the
+  // client socket. The bound socket should return a RST and cause the client
+  // socket to return an error and clean itself up immediately.
+  // The error being ECONNREFUSED diverges with RFC 793, page 37, but does what
+  // Linux does.
+  ASSERT_THAT(connect(client_s.get(),
+                      reinterpret_cast<const struct sockaddr*>(&bound_addr),
+                      bound_addrlen),
+              SyscallFailsWithErrno(ECONNREFUSED));
+
+  FileDescriptor new_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Test binding to the address from the client socket. This should be okay
+  // if it was dropped correctly.
+  ASSERT_THAT(
+      bind(new_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+           client_addrlen),
+      SyscallSucceeds());
+
+  // Attempt #2, with the new socket and reused addr our connect should fail in
+  // the same way as before, not with an EADDRINUSE.
+  ASSERT_THAT(connect(client_s.get(),
+                      reinterpret_cast<const struct sockaddr*>(&bound_addr),
+                      bound_addrlen),
+              SyscallFailsWithErrno(ECONNREFUSED));
+}
+
 // Test that we get an ECONNREFUSED with a nonblocking socket.
 TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) {
   FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
@@ -1175,6 +1247,31 @@ TEST_P(SimpleTcpSocketTest, SetMaxSegFailsForInvalidMSSValues) {
   }
 }
 
+TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  {
+    constexpr int kTCPUserTimeout = -1;
+    EXPECT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                           &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+                SyscallFailsWithErrno(EINVAL));
+  }
+
+  // kTCPUserTimeout is in milliseconds.
+  constexpr int kTCPUserTimeout = 100;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+              SyscallSucceedsWithValue(0));
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPUserTimeout);
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 63b92d6a7..dc35c2f50 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -656,6 +656,9 @@ TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
 }
 
 TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
+  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+  SKIP_IF(IsRunningWithHostinet());
+
   // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
@@ -673,6 +676,9 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
 }
 
 TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
+  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+  SKIP_IF(IsRunningWithHostinet());
+
   // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
@@ -878,6 +884,10 @@ TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
 }
 
 TEST_P(UdpSocketTest, ReadShutdown) {
+  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+  // MSG_DONTWAIT blocks indefinitely.
+  SKIP_IF(IsRunningWithHostinet());
+
   char received[512];
   EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
               SyscallFailsWithErrno(EWOULDBLOCK));
@@ -900,6 +910,10 @@ TEST_P(UdpSocketTest, ReadShutdown) {
 }
 
 TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
+  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+  // MSG_DONTWAIT blocks indefinitely.
+  SKIP_IF(IsRunningWithHostinet());
+
   char received[512];
   EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
               SyscallFailsWithErrno(EWOULDBLOCK));
@@ -1189,6 +1203,10 @@ TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
 }
 
 TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
+  // TODO(gvisor.dev/issue/1202): SO_TIMESTAMP socket option not supported by
+  // hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   int v = -1;
   socklen_t optlen = sizeof(v);
   ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
@@ -1198,6 +1216,10 @@ TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
 }
 
 TEST_P(UdpSocketTest, SoTimestamp) {
+  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+  // supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1241,6 +1263,9 @@ TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
 }
 
 TEST_P(UdpSocketTest, TimestampIoctl) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1259,7 +1284,10 @@ TEST_P(UdpSocketTest, TimestampIoctl) {
   ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
 }
 
-TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
+TEST_P(UdpSocketTest, TimestampIoctlNothingRead) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1270,6 +1298,10 @@ TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
 // Test that the timestamp accessed via SIOCGSTAMP is still accessible after
 // SO_TIMESTAMP is enabled and used to retrieve a timestamp.
 TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
+  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+  // supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1304,7 +1336,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
   msg.msg_controllen = sizeof(cmsgbuf);
   ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
   struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  cmsg = CMSG_FIRSTHDR(&msg);
   ASSERT_NE(cmsg, nullptr);
 
   // The ioctl should return the exact same values as before.
@@ -1314,5 +1345,154 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
   ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
 }
 
+// Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on
+// outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SetAndReceiveTOS) {
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  int recv_level = SOL_IP;
+  int recv_type = IP_RECVTOS;
+  if (GetParam() != AddressFamily::kIpv4) {
+    recv_level = SOL_IPV6;
+    recv_type = IPV6_RECVTCLASS;
+  }
+  ASSERT_THAT(
+      setsockopt(s_, recv_level, recv_type, &kSockOptOn, sizeof(kSockOptOn)),
+      SyscallSucceeds());
+
+  // Set socket TOS.
+  int sent_level = recv_level;
+  int sent_type = IP_TOS;
+  if (sent_level == SOL_IPV6) {
+    sent_type = IPV6_TCLASS;
+  }
+  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
+  ASSERT_THAT(
+      setsockopt(t_, sent_level, sent_type, &sent_tos, sizeof(sent_tos)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  struct msghdr sent_msg = {};
+  struct iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = &sent_data[0];
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  // Receive message.
+  struct msghdr received_msg = {};
+  struct iovec received_iov = {};
+  char received_data[kDataLength];
+  received_iov.iov_base = &received_data[0];
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  size_t cmsg_data_len = sizeof(int8_t);
+  if (sent_type == IPV6_TCLASS) {
+    cmsg_data_len = sizeof(int);
+  }
+  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  received_msg.msg_control = &received_cmsgbuf[0];
+  received_msg.msg_controllen = received_cmsgbuf.size();
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, sent_level);
+  EXPECT_EQ(cmsg->cmsg_type, sent_type);
+  int8_t received_tos = 0;
+  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+  EXPECT_EQ(received_tos, sent_tos);
+}
+
+// Test that sendmsg with IP_TOS and IPV6_TCLASS control messages will set the
+// TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SendAndReceiveTOS) {
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  int recv_level = SOL_IP;
+  int recv_type = IP_RECVTOS;
+  if (GetParam() != AddressFamily::kIpv4) {
+    recv_level = SOL_IPV6;
+    recv_type = IPV6_RECVTCLASS;
+  }
+  int recv_opt = kSockOptOn;
+  ASSERT_THAT(
+      setsockopt(s_, recv_level, recv_type, &recv_opt, sizeof(recv_opt)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  int sent_level = recv_level;
+  int sent_type = IP_TOS;
+  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
+
+  struct msghdr sent_msg = {};
+  struct iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = &sent_data[0];
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  size_t cmsg_data_len = sizeof(int8_t);
+  if (sent_level == SOL_IPV6) {
+    sent_type = IPV6_TCLASS;
+    cmsg_data_len = sizeof(int);
+  }
+  std::vector<char> sent_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  sent_msg.msg_control = &sent_cmsgbuf[0];
+  sent_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+
+  // Manually add control message.
+  struct cmsghdr* sent_cmsg = CMSG_FIRSTHDR(&sent_msg);
+  sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
+  sent_cmsg->cmsg_level = sent_level;
+  sent_cmsg->cmsg_type = sent_type;
+  *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  // Receive message.
+  struct msghdr received_msg = {};
+  struct iovec received_iov = {};
+  char received_data[kDataLength];
+  received_iov.iov_base = &received_data[0];
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  received_msg.msg_control = &received_cmsgbuf[0];
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, sent_level);
+  EXPECT_EQ(cmsg->cmsg_type, sent_type);
+  int8_t received_tos = 0;
+  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+  EXPECT_EQ(received_tos, sent_tos);
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
new file mode 100644
index 000000000..75740238c
--- /dev/null
+++ b/test/syscalls/linux/xattr.cc
@@ -0,0 +1,488 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/capability_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class XattrTest : public FileTest {};
+
+TEST_F(XattrTest, XattrNullName) {
+  const char* path = test_file_name_.c_str();
+
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, XattrEmptyName) {
+  const char* path = test_file_name_.c_str();
+
+  EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrLargeName) {
+  const char* path = test_file_name_.c_str();
+  std::string name = "user.";
+  name += std::string(XATTR_NAME_MAX - name.length(), 'a');
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+                SyscallSucceeds());
+    EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+                SyscallSucceedsWithValue(0));
+  }
+
+  name += "a";
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrInvalidPrefix) {
+  const char* path = test_file_name_.c_str();
+  std::string name(XATTR_NAME_MAX, 'a');
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_F(XattrTest, XattrReadOnly) {
+  // Drop capabilities that allow us to override file and directory permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+                SyscallSucceeds());
+  }
+
+  ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
+
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+              SyscallFailsWithErrno(EACCES));
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    char buf = '-';
+    EXPECT_THAT(getxattr(path, name, &buf, size),
+                SyscallSucceedsWithValue(size));
+    EXPECT_EQ(buf, val);
+  }
+}
+
+TEST_F(XattrTest, XattrWriteOnly) {
+  // Drop capabilities that allow us to override file and directory permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+  ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR));
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+                SyscallSucceeds());
+  }
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+}
+
+TEST_F(XattrTest, XattrTrustedWithNonadmin) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+  SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  const char* path = test_file_name_.c_str();
+  const char name[] = "trusted.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, XattrOnDirectory) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, XattrOnSymlink) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+  char name[] = "user.abc";
+
+  char char_device[] = "/dev/zero";
+  EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(char_device, name, NULL, 0),
+              SyscallFailsWithErrno(ENODATA));
+
+  // Use tmpfs, where creation of named pipes is supported.
+  const std::string fifo = NewTempAbsPathInDir("/dev/shm");
+  const char* path = fifo.c_str();
+  EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  size_t size = 1;
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, XATTR_SIZE_MAX),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, SetxattrSizeTooLarge) {
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+
+  // Note that each particular fs implementation may stipulate a lower size
+  // limit, in which case we actually may fail (e.g. error with ENOSPC) for
+  // some sizes under XATTR_SIZE_MAX.
+  size_t size = XATTR_SIZE_MAX + 1;
+  std::vector<char> val(size);
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallFailsWithErrno(E2BIG));
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(getxattr(path, name, nullptr, 0),
+                SyscallFailsWithErrno(ENODATA));
+  }
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
+              SyscallFailsWithErrno(EFAULT));
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(getxattr(path, name, nullptr, 0),
+                SyscallFailsWithErrno(ENODATA));
+  }
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val(XATTR_SIZE_MAX + 1);
+  std::fill(val.begin(), val.end(), 'a');
+  size_t size = 1;
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(1));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(2));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, SetxattrCreateFlag) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+              SyscallFailsWithErrno(EEXIST));
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrReplaceFlag) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+              SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+              SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrInvalidFlags) {
+  const char* path = test_file_name_.c_str();
+  int invalid_flags = 0xff;
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(XattrTest, Getxattr) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  size_t size = val.size();
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, 1), SyscallFailsWithErrno(ERANGE));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds());
+
+  std::vector<char> buf(XATTR_SIZE_MAX);
+  std::fill(buf.begin(), buf.end(), '-');
+  std::vector<char> expected_buf = buf;
+  expected_buf[0] = 'a';
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(1));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, 0),
+              SyscallSucceedsWithValue(sizeof(val)));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeTooLarge) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf(XATTR_SIZE_MAX + 1);
+  std::fill(buf.begin(), buf.end(), '-');
+  std::vector<char> expected_buf = buf;
+  expected_buf[0] = 'a';
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(sizeof(val)));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrNullValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, size),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  // Set value with zero size.
+  EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+  // Get value with nonzero size.
+  EXPECT_THAT(getxattr(path, name, nullptr, size), SyscallSucceedsWithValue(0));
+
+  // Set value with nonzero size.
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+  // Get value with zero size.
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(size));
+}
+
+TEST_F(XattrTest, GetxattrNonexistentName) {
+  // TODO(b/127675828): Support getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  std::string name = "user.nonexistent";
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(ENODATA));
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index accf46347..b9fd885ff 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -46,6 +46,7 @@ var (
 	debug      = flag.Bool("debug", false, "enable debug logs")
 	strace     = flag.Bool("strace", false, "enable strace logs")
 	platform   = flag.String("platform", "ptrace", "platform to run on")
+	network    = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
 	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
 	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
 	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
@@ -137,7 +138,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 
 	args := []string{
 		"-root", rootDir,
-		"-network=none",
+		"-network", *network,
 		"-log-format=text",
 		"-TESTONLY-unsafe-nonroot=true",
 		"-net-raw=true",
@@ -335,10 +336,11 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 		})
 	}
 
-	// Set environment variable that indicates we are
-	// running in gVisor and with the given platform.
+	// Set environment variables that indicate we are
+	// running in gVisor with the given platform and network.
 	platformVar := "TEST_ON_GVISOR"
-	env := append(os.Environ(), platformVar+"="+*platform)
+	networkVar := "GVISOR_NETWORK"
+	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
 
 	// Remove env variables that cause the gunit binary to write output
 	// files, since they will stomp on eachother, and on the output files
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 88b1e7911..042cec94a 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -105,6 +105,15 @@ PosixErrorOr<struct stat> Stat(absl::string_view path) {
   return stat_buf;
 }
 
+PosixErrorOr<struct stat> Lstat(absl::string_view path) {
+  struct stat stat_buf;
+  int res = lstat(std::string(path).c_str(), &stat_buf);
+  if (res < 0) {
+    return PosixError(errno, absl::StrCat("lstat ", path));
+  }
+  return stat_buf;
+}
+
 PosixErrorOr<struct stat> Fstat(int fd) {
   struct stat stat_buf;
   int res = fstat(fd, &stat_buf);
@@ -127,7 +136,7 @@ PosixErrorOr<bool> Exists(absl::string_view path) {
 }
 
 PosixErrorOr<bool> IsDirectory(absl::string_view path) {
-  ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Stat(path));
+  ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Lstat(path));
   if (S_ISDIR(stat_buf.st_mode)) {
     return true;
   }
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 9cb050735..848504c88 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -41,6 +41,7 @@ namespace gvisor {
 namespace testing {
 
 #define TEST_ON_GVISOR "TEST_ON_GVISOR"
+#define GVISOR_NETWORK "GVISOR_NETWORK"
 
 bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; }
 
@@ -60,6 +61,11 @@ Platform GvisorPlatform() {
   abort();
 }
 
+bool IsRunningWithHostinet() {
+  char* env = getenv(GVISOR_NETWORK);
+  return env && strcmp(env, "host") == 0;
+}
+
 // Inline cpuid instruction.  Preserve %ebx/%rbx register. In PIC compilations
 // %ebx contains the address of the global offset table. %rbx is occasionally
 // used to address stack variables in presence of dynamic allocas.
diff --git a/test/util/test_util.h b/test/util/test_util.h
index ee6c2bf4d..b3235c7e3 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -220,6 +220,7 @@ enum class Platform {
 };
 bool IsRunningOnGvisor();
 Platform GvisorPlatform();
+bool IsRunningWithHostinet();
 
 #ifdef __linux__
 void SetupGvisorDeathTest();