author    Kevin Krakauer <krakauer@google.com>  2019-06-12 15:21:22 -0700
committer Kevin Krakauer <krakauer@google.com>  2019-06-12 15:21:22 -0700
commit    0bbbcafd68154e7c7b46692b84a39fb6bb5f1568 (patch)
tree      d8fba01ad76900715665b0418a786de2d77e2a05
parent    06a83df533244dc2b3b8adfc1bf0608d3753c1d9 (diff)
parent    70578806e8d3e01fae2249b3e602cd5b05d378a0 (diff)
Merge branch 'master' into iptables-1-pkg
Change-Id: I7457a11de4725e1bf3811420c505d225b1cb6943
-rw-r--r-- .bazelrc | 2
-rw-r--r-- BUILD | 7
-rw-r--r-- README.md | 7
-rw-r--r-- WORKSPACE | 67
-rw-r--r-- cloudbuild/go.Dockerfile | 2
-rw-r--r-- cloudbuild/go.yaml | 22
-rw-r--r-- go.mod | 21
-rw-r--r-- kokoro/common.cfg | 2
l--------- [-rwxr-xr-x] kokoro/run_build.sh | 43
l--------- [-rwxr-xr-x] kokoro/run_tests.sh | 259
-rw-r--r-- pkg/abi/linux/capability.go | 84
-rw-r--r-- pkg/abi/linux/mm.go | 9
-rw-r--r-- pkg/abi/linux/prctl.go | 7
-rw-r--r-- pkg/abi/linux/socket.go | 34
-rw-r--r-- pkg/memutil/BUILD | 11
-rw-r--r-- pkg/memutil/memutil_unsafe.go (renamed from pkg/sentry/memutil/memutil_unsafe.go) | 3
-rw-r--r-- pkg/procid/BUILD (renamed from pkg/sentry/platform/procid/BUILD) | 4
-rw-r--r-- pkg/procid/procid.go (renamed from pkg/sentry/platform/procid/procid.go) | 0
-rw-r--r-- pkg/procid/procid_amd64.s (renamed from pkg/sentry/platform/procid/procid_amd64.s) | 0
-rw-r--r-- pkg/procid/procid_arm64.s (renamed from pkg/sentry/platform/procid/procid_arm64.s) | 0
-rw-r--r-- pkg/procid/procid_net_test.go (renamed from pkg/sentry/platform/procid/procid_net_test.go) | 0
-rw-r--r-- pkg/procid/procid_test.go (renamed from pkg/sentry/platform/procid/procid_test.go) | 0
-rw-r--r-- pkg/sentry/context/contexttest/BUILD | 2
-rw-r--r-- pkg/sentry/context/contexttest/contexttest.go | 2
-rw-r--r-- pkg/sentry/fs/dirent.go | 2
-rw-r--r-- pkg/sentry/fs/file.go | 24
-rw-r--r-- pkg/sentry/fs/gofer/socket.go | 16
-rw-r--r-- pkg/sentry/fs/host/socket.go | 73
-rw-r--r-- pkg/sentry/fs/host/socket_test.go | 156
-rw-r--r-- pkg/sentry/fs/inode.go | 4
-rw-r--r-- pkg/sentry/fs/inode_overlay.go | 12
-rw-r--r-- pkg/sentry/fs/proc/BUILD | 1
-rw-r--r-- pkg/sentry/fs/proc/inode.go | 40
-rw-r--r-- pkg/sentry/fs/proc/net.go | 34
-rw-r--r-- pkg/sentry/fs/proc/task.go | 17
-rw-r--r-- pkg/sentry/fs/timerfd/timerfd.go | 2
-rw-r--r-- pkg/sentry/fs/tmpfs/fs.go | 25
-rw-r--r-- pkg/sentry/hostmm/BUILD | 18
-rw-r--r-- pkg/sentry/hostmm/cgroup.go | 111
-rw-r--r-- pkg/sentry/hostmm/hostmm.go | 130
-rw-r--r-- pkg/sentry/kernel/BUILD | 13
-rw-r--r-- pkg/sentry/kernel/epoll/epoll.go | 2
-rw-r--r-- pkg/sentry/kernel/eventfd/eventfd.go | 2
-rw-r--r-- pkg/sentry/kernel/kernel.go | 55
-rw-r--r-- pkg/sentry/kernel/pipe/node.go | 12
-rw-r--r-- pkg/sentry/kernel/pipe/node_test.go | 8
-rw-r--r-- pkg/sentry/kernel/pipe/pipe.go | 39
-rw-r--r-- pkg/sentry/kernel/ptrace.go | 17
-rw-r--r-- pkg/sentry/kernel/syscalls.go | 60
-rw-r--r-- pkg/sentry/kernel/table_test.go | 8
-rw-r--r-- pkg/sentry/kernel/task.go | 7
-rw-r--r-- pkg/sentry/kernel/task_exec.go | 7
-rw-r--r-- pkg/sentry/kernel/task_identity.go | 24
-rw-r--r-- pkg/sentry/kernel/task_sched.go | 4
-rw-r--r-- pkg/sentry/memutil/BUILD | 14
-rw-r--r-- pkg/sentry/mm/lifecycle.go | 6
-rw-r--r-- pkg/sentry/mm/metadata.go | 30
-rw-r--r-- pkg/sentry/mm/mm.go | 12
-rw-r--r-- pkg/sentry/mm/syscalls.go | 63
-rw-r--r-- pkg/sentry/mm/vma.go | 3
-rw-r--r-- pkg/sentry/pgalloc/BUILD | 3
-rw-r--r-- pkg/sentry/pgalloc/pgalloc.go | 63
-rw-r--r-- pkg/sentry/platform/kvm/BUILD | 2
-rw-r--r-- pkg/sentry/platform/kvm/machine.go | 2
-rw-r--r-- pkg/sentry/platform/ptrace/BUILD | 2
-rw-r--r-- pkg/sentry/platform/ptrace/subprocess.go | 21
-rw-r--r-- pkg/sentry/platform/ptrace/subprocess_amd64.go | 37
-rw-r--r-- pkg/sentry/platform/ptrace/subprocess_linux.go | 4
-rw-r--r-- pkg/sentry/socket/control/control.go | 12
-rw-r--r-- pkg/sentry/socket/epsocket/BUILD | 1
-rw-r--r-- pkg/sentry/socket/epsocket/epsocket.go | 96
-rw-r--r-- pkg/sentry/socket/epsocket/provider.go | 9
-rw-r--r-- pkg/sentry/socket/hostinet/BUILD | 1
-rw-r--r-- pkg/sentry/socket/hostinet/socket.go | 61
-rw-r--r-- pkg/sentry/socket/netlink/provider.go | 9
-rw-r--r-- pkg/sentry/socket/netlink/socket.go | 17
-rw-r--r-- pkg/sentry/socket/rpcinet/BUILD | 1
-rw-r--r-- pkg/sentry/socket/rpcinet/socket.go | 31
-rw-r--r-- pkg/sentry/socket/socket.go | 21
-rw-r--r-- pkg/sentry/socket/unix/transport/BUILD | 1
-rw-r--r-- pkg/sentry/socket/unix/transport/connectioned.go | 29
-rw-r--r-- pkg/sentry/socket/unix/transport/connectionless.go | 20
-rw-r--r-- pkg/sentry/socket/unix/transport/unix.go | 30
-rw-r--r-- pkg/sentry/socket/unix/unix.go | 80
-rw-r--r-- pkg/sentry/strace/socket.go | 14
-rw-r--r-- pkg/sentry/syscalls/linux/BUILD | 1
-rw-r--r-- pkg/sentry/syscalls/linux/error.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/linux64.go | 756
-rw-r--r-- pkg/sentry/syscalls/linux/sys_mempolicy.go | 312
-rw-r--r-- pkg/sentry/syscalls/linux/sys_mmap.go | 145
-rw-r--r-- pkg/sentry/syscalls/linux/sys_prctl.go | 33
-rw-r--r-- pkg/sentry/syscalls/linux/sys_socket.go | 4
-rw-r--r-- pkg/sentry/syscalls/syscalls.go | 88
-rw-r--r-- pkg/sentry/time/BUILD | 2
-rw-r--r-- pkg/sentry/usage/BUILD | 2
-rw-r--r-- pkg/sentry/usage/memory.go | 2
-rw-r--r-- pkg/sentry/usermem/usermem.go | 36
-rw-r--r-- pkg/sentry/usermem/usermem_test.go | 15
-rw-r--r-- pkg/tcpip/link/fdbased/endpoint.go | 168
-rw-r--r-- pkg/tcpip/link/fdbased/endpoint_test.go | 2
-rw-r--r-- pkg/tcpip/link/sniffer/sniffer.go | 10
-rw-r--r-- pkg/tcpip/network/ipv4/ipv4.go | 4
-rw-r--r-- pkg/tcpip/network/ipv6/ipv6.go | 4
-rw-r--r-- pkg/tcpip/sample/tun_tcp_connect/main.go | 2
-rw-r--r-- pkg/tcpip/sample/tun_tcp_echo/main.go | 2
-rw-r--r-- pkg/tcpip/stack/route.go | 10
-rw-r--r-- pkg/tcpip/stack/transport_test.go | 4
-rw-r--r-- pkg/tcpip/tcpip.go | 12
-rw-r--r-- pkg/tcpip/transport/icmp/BUILD | 1
-rw-r--r-- pkg/tcpip/transport/icmp/endpoint.go | 6
-rw-r--r-- pkg/tcpip/transport/raw/endpoint.go | 5
-rw-r--r-- pkg/tcpip/transport/tcp/BUILD | 2
-rw-r--r-- pkg/tcpip/transport/tcp/accept.go | 50
-rw-r--r-- pkg/tcpip/transport/tcp/connect.go | 52
-rw-r--r-- pkg/tcpip/transport/tcp/cubic.go | 3
-rw-r--r-- pkg/tcpip/transport/tcp/cubic_state.go | 29
-rw-r--r-- pkg/tcpip/transport/tcp/endpoint.go | 223
-rw-r--r-- pkg/tcpip/transport/tcp/endpoint_state.go | 44
-rw-r--r-- pkg/tcpip/transport/tcp/protocol.go | 26
-rw-r--r-- pkg/tcpip/transport/tcp/rcv.go | 37
-rw-r--r-- pkg/tcpip/transport/tcp/segment_queue.go | 6
-rw-r--r-- pkg/tcpip/transport/tcp/snd.go | 16
-rw-r--r-- pkg/tcpip/transport/tcp/tcp_noracedetector_test.go | 519
-rw-r--r-- pkg/tcpip/transport/tcp/tcp_test.go | 963
-rw-r--r-- pkg/tcpip/transport/tcp/testing/context/context.go | 85
-rw-r--r-- pkg/tcpip/transport/udp/endpoint.go | 6
-rw-r--r-- pkg/urpc/urpc.go | 2
-rw-r--r-- runsc/BUILD | 10
-rw-r--r-- runsc/boot/BUILD | 4
-rw-r--r-- runsc/boot/config.go | 13
-rw-r--r-- runsc/boot/controller.go | 27
-rw-r--r-- runsc/boot/fds.go | 3
-rw-r--r-- runsc/boot/filter/config.go | 4
-rw-r--r-- runsc/boot/fs.go | 986
-rw-r--r-- runsc/boot/fs_test.go | 193
-rw-r--r-- runsc/boot/loader.go | 131
-rw-r--r-- runsc/boot/loader_test.go | 9
-rw-r--r-- runsc/boot/network.go | 41
-rw-r--r-- runsc/boot/pprof.go (renamed from pkg/sentry/memutil/memutil.go) | 8
-rw-r--r-- runsc/cmd/BUILD | 4
-rw-r--r-- runsc/cmd/boot.go | 22
-rw-r--r-- runsc/cmd/capability_test.go | 2
-rw-r--r-- runsc/cmd/cmd.go | 19
-rw-r--r-- runsc/cmd/create.go | 9
-rw-r--r-- runsc/cmd/do.go | 39
-rw-r--r-- runsc/cmd/error.go | 72
-rw-r--r-- runsc/cmd/exec.go | 62
-rw-r--r-- runsc/cmd/help.go | 126
-rw-r--r-- runsc/cmd/restore.go | 10
-rw-r--r-- runsc/cmd/run.go | 8
-rw-r--r-- runsc/cmd/start.go | 1
-rw-r--r-- runsc/cmd/syscalls.go | 347
-rw-r--r-- runsc/cmd/wait.go | 4
-rw-r--r-- runsc/container/console_test.go | 2
-rw-r--r-- runsc/container/container.go | 8
-rw-r--r-- runsc/container/container_test.go | 5
-rw-r--r-- runsc/container/multi_container_test.go | 307
-rw-r--r-- runsc/main.go | 145
-rw-r--r-- runsc/sandbox/network.go | 121
-rw-r--r-- runsc/sandbox/sandbox.go | 99
-rw-r--r-- runsc/specutils/BUILD | 5
-rw-r--r-- runsc/specutils/fs.go | 40
-rw-r--r-- runsc/specutils/namespace.go | 52
-rw-r--r-- runsc/test/integration/BUILD | 1
-rw-r--r-- runsc/test/integration/exec_test.go | 23
-rw-r--r-- runsc/test/integration/regression_test.go | 45
-rw-r--r-- runsc/test/testutil/BUILD | 1
-rw-r--r-- runsc/test/testutil/testutil.go | 51
-rw-r--r-- test/BUILD | 6
-rw-r--r-- test/syscalls/BUILD | 212
-rw-r--r-- test/syscalls/build_defs.bzl | 19
-rw-r--r-- test/syscalls/linux/BUILD | 58
-rw-r--r-- test/syscalls/linux/accept_bind.cc | 14
-rw-r--r-- test/syscalls/linux/mempolicy.cc | 37
-rw-r--r-- test/syscalls/linux/pipe.cc | 4
-rw-r--r-- test/syscalls/linux/prctl.cc | 34
-rw-r--r-- test/syscalls/linux/proc.cc | 57
-rw-r--r-- test/syscalls/linux/proc_net_unix.cc | 178
-rw-r--r-- test/syscalls/linux/sendfile_socket.cc | 170
-rw-r--r-- test/syscalls/linux/socket_abstract.cc | 9
-rw-r--r-- test/syscalls/linux/socket_filesystem.cc | 9
-rw-r--r-- test/syscalls/linux/socket_inet_loopback.cc | 61
-rw-r--r-- test/syscalls/linux/socket_ip_loopback_blocking.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_generic.cc | 104
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_generic_loopback.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_udp_generic.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_udp_loopback.cc | 6
-rw-r--r-- test/syscalls/linux/socket_ip_udp_loopback_blocking.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc | 129
-rw-r--r-- test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix.cc | 1508
-rw-r--r-- test/syscalls/linux/socket_unix_abstract_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_blocking_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_cmsg.cc | 1473
-rw-r--r-- test/syscalls/linux/socket_unix_cmsg.h (renamed from test/syscalls/linux/socket_unix_abstract.cc) | 21
-rw-r--r-- test/syscalls/linux/socket_unix_dgram_local.cc | 6
-rw-r--r-- test/syscalls/linux/socket_unix_dgram_non_blocking.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_filesystem.cc | 37
-rw-r--r-- test/syscalls/linux/socket_unix_filesystem_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_non_stream_blocking_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_pair.cc | 5
-rw-r--r-- test/syscalls/linux/socket_unix_pair_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_seqpacket_local.cc | 6
-rw-r--r-- test/syscalls/linux/socket_unix_stream_blocking_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_stream_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_stream_nonblock_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_unbound_dgram.cc | 24
-rw-r--r-- test/syscalls/linux/tcp_socket.cc | 127
-rw-r--r-- test/syscalls/syscall_test_runner.go | 14
-rwxr-xr-x tools/go_branch.sh | 76
-rwxr-xr-x tools/run_build.sh | 46
-rwxr-xr-x tools/run_tests.sh | 288
216 files changed, 9012 insertions(+), 4720 deletions(-)
diff --git a/.bazelrc b/.bazelrc
index b76976995..f6b21086d 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -1,4 +1,4 @@
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/BUILD b/BUILD
index 391791ca9..6d5e800ca 100644
--- a/BUILD
+++ b/BUILD
@@ -1,6 +1,7 @@
package(licenses = ["notice"]) # Apache 2.0
load("@io_bazel_rules_go//go:def.bzl", "go_path")
+load("@bazel_gazelle//:def.bzl", "gazelle")
# The sandbox filegroup is used for sandbox-internal dependencies.
package_group(
@@ -22,3 +23,9 @@ go_path(
"//runsc",
],
)
+
+# gazelle is a set of build tools.
+#
+# To update the WORKSPACE from go.mod, use:
+# bazel run //:gazelle -- update-repos -from_file=go.mod
+gazelle(name = "gazelle")
diff --git a/README.md b/README.md
index f0252025c..17a15ad43 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
![gVisor](g3doc/logo.png)
+[![Status](https://storage.googleapis.com/gvisor-build-badges/build.svg)](https://storage.googleapis.com/gvisor-build-badges/build.html)
+[![gVisor chat](https://badges.gitter.im/gvisor/community.png)](https://gitter.im/gvisor/community)
+
## What is gVisor?
**gVisor** is a user-space kernel, written in Go, that implements a substantial
@@ -36,8 +39,6 @@ be found at [gvisor.dev][gvisor-dev].
## Installing from source
-[![Status](https://storage.googleapis.com/gvisor-build-badges/build.svg)](https://storage.googleapis.com/gvisor-build-badges/build.html)
-
gVisor currently requires x86\_64 Linux to build, though support for other
architectures may become available in the future.
@@ -83,7 +84,7 @@ sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin
The test suite can be run with Bazel:
```
-bazel test ...
+bazel test //...
```
or in a Docker container:
diff --git a/WORKSPACE b/WORKSPACE
index 5da06317f..5155dc527 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -6,6 +6,7 @@ http_archive(
sha256 = "a82a352bffae6bee4e95f68a8d80a70e87f42c4741e6a448bec11998fcc82329",
url = "https://github.com/bazelbuild/rules_go/releases/download/0.18.5/rules_go-0.18.5.tar.gz",
)
+
http_archive(
name = "bazel_gazelle",
sha256 = "3c681998538231a2d24d0c07ed5a7658cb72bfb5fd4bf9911157c0e9ac6a2687",
@@ -37,86 +38,116 @@ http_archive(
# External repositories, in sorted order.
go_repository(
name = "com_github_cenkalti_backoff",
- commit = "66e726b43552c0bab0539b28e640b89fd6862115",
+ commit = "2146c9339422",
importpath = "github.com/cenkalti/backoff",
)
go_repository(
name = "com_github_gofrs_flock",
- commit = "886344bea0798d02ff3fae16a922be5f6b26cee0",
+ commit = "886344bea079",
importpath = "github.com/gofrs/flock",
)
go_repository(
name = "com_github_golang_mock",
- commit = "600781dde9cca80734169b9e969d9054ccc57937",
importpath = "github.com/golang/mock",
+ tag = "v1.3.1",
)
go_repository(
name = "com_github_google_go-cmp",
- commit = "3af367b6b30c263d47e8895973edcca9a49cf029",
importpath = "github.com/google/go-cmp",
+ tag = "v0.2.0",
)
go_repository(
name = "com_github_google_subcommands",
- commit = "ce3d4cfc062faac7115d44e5befec8b5a08c3faa",
+ commit = "636abe8753b8",
importpath = "github.com/google/subcommands",
)
go_repository(
name = "com_github_google_uuid",
- commit = "dec09d789f3dba190787f8b4454c7d3c936fed9e",
+ commit = "dec09d789f3d",
importpath = "github.com/google/uuid",
)
go_repository(
name = "com_github_kr_pty",
- commit = "282ce0e5322c82529687d609ee670fac7c7d917c",
importpath = "github.com/kr/pty",
+ tag = "v1.1.1",
)
go_repository(
name = "com_github_opencontainers_runtime-spec",
- commit = "b2d941ef6a780da2d9982c1fb28d77ad97f54fc7",
+ commit = "b2d941ef6a78",
importpath = "github.com/opencontainers/runtime-spec",
)
go_repository(
name = "com_github_syndtr_gocapability",
- commit = "d98352740cb2c55f81556b63d4a1ec64c5a319c2",
+ commit = "d98352740cb2",
importpath = "github.com/syndtr/gocapability",
)
go_repository(
name = "com_github_vishvananda_netlink",
- commit = "adb577d4a45e341da53c4d9196ad4222c9a23e69",
+ commit = "adb577d4a45e",
importpath = "github.com/vishvananda/netlink",
)
go_repository(
name = "com_github_vishvananda_netns",
- commit = "be1fbeda19366dea804f00efff2dd73a1642fdcc",
+ commit = "be1fbeda1936",
importpath = "github.com/vishvananda/netns",
)
go_repository(
+ name = "org_golang_x_crypto",
+ commit = "c2843e01d9a2",
+ importpath = "golang.org/x/crypto",
+)
+
+go_repository(
name = "org_golang_x_net",
- commit = "b3c676e531a6dc479fa1b35ac961c13f5e2b4d2e",
+ commit = "d8887717615a",
importpath = "golang.org/x/net",
)
go_repository(
+ name = "org_golang_x_text",
+ importpath = "golang.org/x/text",
+ tag = "v0.3.0",
+)
+
+go_repository(
+ name = "org_golang_x_tools",
+ commit = "36563e24a262",
+ importpath = "golang.org/x/tools",
+)
+
+go_repository(
+ name = "org_golang_x_sync",
+ commit = "112230192c58",
+ importpath = "golang.org/x/sync",
+)
+
+go_repository(
name = "org_golang_x_sys",
- commit = "0dd5e194bbf5eb84a39666eb4c98a4d007e4203a",
+ commit = "d0b11bdaac8a",
importpath = "golang.org/x/sys",
)
go_repository(
name = "com_github_google_btree",
- commit = "4030bb1f1f0c35b30ca7009e9ebd06849dd45306",
importpath = "github.com/google/btree",
+ tag = "v1.0.0",
+)
+
+go_repository(
+ name = "com_github_golang_protobuf",
+ importpath = "github.com/golang/protobuf",
+ tag = "v1.3.1",
)
# System Call test dependencies.
@@ -142,10 +173,10 @@ http_archive(
http_archive(
name = "com_google_googletest",
- sha256 = "574e884a41f0a9b76f849a5cdd89c393651e7537e5daa725cf12511232cbd74b",
- strip_prefix = "googletest-61cdca569b1f7e4629f8b949f0a9606c28281a6b",
+ sha256 = "db657310d3c5ca2d3f674e3a4b79718d1d39da70604568ee0568ba8e39065ef4",
+ strip_prefix = "googletest-31200def0dec8a624c861f919e86e4444e6e6ee7",
urls = [
- "https://mirror.bazel.build/github.com/google/googletest/archive/61cdca569b1f7e4629f8b949f0a9606c28281a6b.tar.gz",
- "https://github.com/google/googletest/archive/61cdca569b1f7e4629f8b949f0a9606c28281a6b.tar.gz",
+ "https://mirror.bazel.build/github.com/google/googletest/archive/31200def0dec8a624c861f919e86e4444e6e6ee7.tar.gz",
+ "https://github.com/google/googletest/archive/31200def0dec8a624c861f919e86e4444e6e6ee7.tar.gz",
],
)
diff --git a/cloudbuild/go.Dockerfile b/cloudbuild/go.Dockerfile
new file mode 100644
index 000000000..226442fd2
--- /dev/null
+++ b/cloudbuild/go.Dockerfile
@@ -0,0 +1,2 @@
+FROM ubuntu
+RUN apt-get -q update && apt-get install -qqy git rsync
diff --git a/cloudbuild/go.yaml b/cloudbuild/go.yaml
new file mode 100644
index 000000000..a38ef71fc
--- /dev/null
+++ b/cloudbuild/go.yaml
@@ -0,0 +1,22 @@
+steps:
+- name: 'gcr.io/cloud-builders/git'
+ args: ['fetch', '--all', '--unshallow']
+- name: 'gcr.io/cloud-builders/bazel'
+ args: ['build', ':gopath']
+- name: 'gcr.io/cloud-builders/docker'
+ args: ['build', '-t', 'gcr.io/$PROJECT_ID/go-branch', '-f', 'cloudbuild/go.Dockerfile', '.']
+- name: 'gcr.io/$PROJECT_ID/go-branch'
+ args: ['tools/go_branch.sh']
+- name: 'gcr.io/cloud-builders/git'
+ args: ['checkout', 'go']
+- name: 'gcr.io/cloud-builders/git'
+ args: ['clean', '-f']
+- name: 'golang'
+ args: ['go', 'build', './...']
+- name: 'gcr.io/cloud-builders/git'
+ entrypoint: 'bash'
+ args:
+ - '-c'
+ - 'if [[ "$BRANCH_NAME" == "master" ]]; then git push "${_ORIGIN}" go:go; fi'
+substitutions:
+ _ORIGIN: origin
diff --git a/go.mod b/go.mod
new file mode 100644
index 000000000..e58b84cfb
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,21 @@
+module gvisor.googlesource.com/gvisor
+
+go 1.12
+
+require (
+ github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+ github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+ github.com/golang/mock v1.3.1
+ github.com/golang/protobuf v1.3.1
+ github.com/google/btree v1.0.0
+ github.com/google/go-cmp v0.2.0
+ github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+ github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
+ github.com/kr/pty v1.1.1
+ github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+ github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+ github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+ github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
+ golang.org/x/net v0.0.0-20190311183353-d8887717615a
+ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+)
diff --git a/kokoro/common.cfg b/kokoro/common.cfg
index f6776ae84..cad873fe1 100644
--- a/kokoro/common.cfg
+++ b/kokoro/common.cfg
@@ -11,7 +11,7 @@ before_action {
# Configure bazel to access RBE.
bazel_setting {
# Our GCP project name
- project_id: "copybara-shentu"
+ project_id: "gvisor-rbe"
# Use RBE for execution as well as caching.
local_execution: false
diff --git a/kokoro/run_build.sh b/kokoro/run_build.sh
index 63fffda48..9deafe9bb 100755..120000
--- a/kokoro/run_build.sh
+++ b/kokoro/run_build.sh
@@ -1,42 +1 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Fail on any error.
-set -e
-# Display commands to stderr.
-set -x
-
-# Install the latest version of Bazel.
-use_bazel.sh latest
-
-# Log the bazel path and version.
-which bazel
-bazel version
-
-cd git/repo
-
-# Build runsc.
-bazel build //runsc
-
-# Move the runsc binary into "latest" directory, and also a directory with the
-# current date.
-latest_dir="${KOKORO_ARTIFACTS_DIR}"/latest
-today_dir="${KOKORO_ARTIFACTS_DIR}"/"$(date -Idate)"
-mkdir -p "${latest_dir}" "${today_dir}"
-cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc "${latest_dir}"
-sha512sum "${latest_dir}"/runsc | awk '{print $1 " runsc"}' > "${latest_dir}"/runsc.sha512
-cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc "${today_dir}"
-sha512sum "${today_dir}"/runsc | awk '{print $1 " runsc"}' > "${today_dir}"/runsc.sha512
+../tools/run_build.sh
\ No newline at end of file
diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh
index 6a7c1fdb6..931cd2622 100755..120000
--- a/kokoro/run_tests.sh
+++ b/kokoro/run_tests.sh
@@ -1,258 +1 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Fail on any error. Treat unset variables as error. Print commands as executed.
-set -eux
-
-
-###################
-# GLOBAL ENV VARS #
-###################
-
-readonly WORKSPACE_DIR="${PWD}/git/repo"
-
-# Used to configure RBE.
-readonly CLOUD_PROJECT_ID="copybara-shentu"
-readonly RBE_PROJECT_ID="projects/${CLOUD_PROJECT_ID}/instances/default_instance"
-
-# Random runtime name to avoid collisions.
-readonly RUNTIME="runsc_test_$((RANDOM))"
-
-# Packages that will be built and tested.
-readonly BUILD_PACKAGES=("//...")
-readonly TEST_PACKAGES=("//pkg/..." "//runsc/..." "//tools/...")
-
-#######################
-# BAZEL CONFIGURATION #
-#######################
-
-# Install the latest version of Bazel, and log the location and version.
-use_bazel.sh latest
-which bazel
-bazel version
-
-# Load the kvm module
-sudo -n -E modprobe kvm
-
-# General Bazel build/test flags.
-BAZEL_BUILD_FLAGS=(
- "--show_timestamps"
- "--test_output=errors"
- "--keep_going"
- "--verbose_failures=true"
-)
-
-# Bazel build/test for RBE, a super-set of BAZEL_BUILD_FLAGS.
-BAZEL_BUILD_RBE_FLAGS=(
- "${BAZEL_BUILD_FLAGS[@]}"
- "--config=remote"
- "--project_id=${CLOUD_PROJECT_ID}"
- "--remote_instance_name=${RBE_PROJECT_ID}"
- "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
-)
-
-####################
-# Helper Functions #
-####################
-
-build_everything() {
- FLAVOR="${1}"
-
- cd ${WORKSPACE_DIR}
- bazel build \
- -c "${FLAVOR}" "${BAZEL_BUILD_RBE_FLAGS[@]}" \
- "${BUILD_PACKAGES[@]}"
-}
-
-# Run simple tests runs the tests that require no special setup or
-# configuration.
-run_simple_tests() {
- cd ${WORKSPACE_DIR}
- bazel test \
- "${BAZEL_BUILD_FLAGS[@]}" \
- "${TEST_PACKAGES[@]}"
-}
-
-install_runtime() {
- cd ${WORKSPACE_DIR}
- sudo -n ${WORKSPACE_DIR}/runsc/test/install.sh --runtime ${RUNTIME}
-}
-
-# Install dependencies for the crictl tests.
-install_crictl_test_deps() {
- # Install containerd.
- sudo -n -E apt-get update
- sudo -n -E apt-get install -y btrfs-tools libseccomp-dev
- # go get will exit with a status of 1 despite succeeding, so ignore errors.
- go get -d github.com/containerd/containerd || true
- cd ${GOPATH}/src/github.com/containerd/containerd
- git checkout v1.2.2
- make
- sudo -n -E make install
-
- # Install crictl.
- # go get will exit with a status of 1 despite succeeding, so ignore errors.
- go get -d github.com/kubernetes-sigs/cri-tools || true
- cd ${GOPATH}/src/github.com/kubernetes-sigs/cri-tools
- git checkout tags/v1.11.0
- make
- sudo -n -E make install
-
- # Install gvisor-containerd-shim.
- local latest=/tmp/gvisor-containerd-shim-latest
- local shim_path=/tmp/gvisor-containerd-shim
- wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/latest -O ${latest}
- wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
- chmod +x ${shim_path}
- sudo -n -E mv ${shim_path} /usr/local/bin
-
- # Configure containerd-shim.
- local shim_config_path=/etc/containerd
- local shim_config_tmp_path=/tmp/gvisor-containerd-shim.toml
- sudo -n -E mkdir -p ${shim_config_path}
- cat > ${shim_config_tmp_path} <<-EOF
- runc_shim = "/usr/local/bin/containerd-shim"
-
- [runsc_config]
- debug = "true"
- debug-log = "/tmp/runsc-logs/"
- strace = "true"
- file-access = "shared"
-EOF
- sudo mv ${shim_config_tmp_path} ${shim_config_path}
-
- # Configure CNI.
- sudo -n -E env PATH=${PATH} ${GOPATH}/src/github.com/containerd/containerd/script/setup/install-cni
-}
-
-# Run the tests that require docker.
-run_docker_tests() {
- cd ${WORKSPACE_DIR}
-
- # Run tests with a default runtime (runc).
- bazel test \
- "${BAZEL_BUILD_FLAGS[@]}" \
- --test_env=RUNSC_RUNTIME="" \
- --test_output=all \
- //runsc/test/image:image_test
-
- # These names are used to exclude tests not supported in certain
- # configuration, e.g. save/restore not supported with hostnet.
- declare -a variations=("" "-kvm" "-hostnet" "-overlay")
- for v in "${variations[@]}"; do
- # Run runsc tests with docker that are tagged manual.
- bazel test \
- "${BAZEL_BUILD_FLAGS[@]}" \
- --test_env=RUNSC_RUNTIME="${RUNTIME}${v}" \
- --test_output=all \
- //runsc/test/image:image_test \
- //runsc/test/integration:integration_test
- done
-}
-
-# Run the tests that require root.
-run_root_tests() {
- cd ${WORKSPACE_DIR}
- bazel build //runsc/test/root:root_test
- local root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__)
- if [[ ! -f "${root_test}" ]]; then
- echo "root_test executable not found"
- exit 1
- fi
- sudo -n -E RUNSC_RUNTIME="${RUNTIME}" RUNSC_EXEC=/tmp/"${RUNTIME}"/runsc ${root_test}
-}
-
-# Run syscall unit tests.
-run_syscall_tests() {
- cd ${WORKSPACE_DIR}
- bazel test "${BAZEL_BUILD_RBE_FLAGS[@]}" \
- --test_tag_filters=runsc_ptrace //test/syscalls/...
-}
-
-run_runsc_do_tests() {
- local runsc=$(find bazel-bin/runsc -type f -executable -name "runsc" | head -n1)
-
- # run runsc do without root privileges.
- unshare -Ur ${runsc} --network=none --TESTONLY-unsafe-nonroot do true
- unshare -Ur ${runsc} --TESTONLY-unsafe-nonroot --network=host do --netns=false true
-
- # run runsc do with root privileges.
- sudo -n -E ${runsc} do true
-}
-
-# Find and rename all test xml and log files so that Sponge can pick them up.
-# XML files must be named sponge_log.xml, and log files must be named
-# sponge_log.log. We move all such files into KOKORO_ARTIFACTS_DIR, in a
-# subdirectory named with the test name.
-upload_test_artifacts() {
- cd ${WORKSPACE_DIR}
- find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
- tar --create --files-from - --transform 's/test\./sponge_log./' |
- tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
- if [[ -d "/tmp/${RUNTIME}/logs" ]]; then
- tar --create --gzip "--file=${KOKORO_ARTIFACTS_DIR}/runsc-logs.tar.gz" -C /tmp/ ${RUNTIME}/logs
- fi
-}
-
-# Finish runs at exit, even in the event of an error, and uploads all test
-# artifacts.
-finish() {
- # Grab the last exit code, we will return it.
- local exit_code=${?}
- upload_test_artifacts
- exit ${exit_code}
-}
-
-# Run bazel in a docker container
-build_in_docker() {
- cd ${WORKSPACE_DIR}
- bazel clean
- bazel shutdown
- make
- make runsc
- make bazel-shutdown
-}
-
-########
-# MAIN #
-########
-
-main() {
- # Register finish to run at exit.
- trap finish EXIT
-
- # Build and run the simple tests.
- build_everything opt
- run_simple_tests
-
- # So far so good. Install more deps and run the integration tests.
- install_runtime
- install_crictl_test_deps
- run_docker_tests
- run_root_tests
-
- run_syscall_tests
- run_runsc_do_tests
-
- # Build other flavors too.
- build_everything dbg
-
- build_in_docker
- # No need to call "finish" here, it will happen at exit.
-}
-
-# Kick it off.
-main
+../tools/run_tests.sh
\ No newline at end of file
diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go
index c120cac64..65dd77e6e 100644
--- a/pkg/abi/linux/capability.go
+++ b/pkg/abi/linux/capability.go
@@ -69,6 +69,90 @@ func (cp Capability) Ok() bool {
return cp >= 0 && cp <= MaxCapability
}
+// String returns the capability name.
+func (cp Capability) String() string {
+ switch cp {
+ case CAP_CHOWN:
+ return "CAP_CHOWN"
+ case CAP_DAC_OVERRIDE:
+ return "CAP_DAC_OVERRIDE"
+ case CAP_DAC_READ_SEARCH:
+ return "CAP_DAC_READ_SEARCH"
+ case CAP_FOWNER:
+ return "CAP_FOWNER"
+ case CAP_FSETID:
+ return "CAP_FSETID"
+ case CAP_KILL:
+ return "CAP_KILL"
+ case CAP_SETGID:
+ return "CAP_SETGID"
+ case CAP_SETUID:
+ return "CAP_SETUID"
+ case CAP_SETPCAP:
+ return "CAP_SETPCAP"
+ case CAP_LINUX_IMMUTABLE:
+ return "CAP_LINUX_IMMUTABLE"
+ case CAP_NET_BIND_SERVICE:
+ return "CAP_NET_BIND_SERVICE"
+ case CAP_NET_BROADCAST:
+ return "CAP_NET_BROADCAST"
+ case CAP_NET_ADMIN:
+ return "CAP_NET_ADMIN"
+ case CAP_NET_RAW:
+ return "CAP_NET_RAW"
+ case CAP_IPC_LOCK:
+ return "CAP_IPC_LOCK"
+ case CAP_IPC_OWNER:
+ return "CAP_IPC_OWNER"
+ case CAP_SYS_MODULE:
+ return "CAP_SYS_MODULE"
+ case CAP_SYS_RAWIO:
+ return "CAP_SYS_RAWIO"
+ case CAP_SYS_CHROOT:
+ return "CAP_SYS_CHROOT"
+ case CAP_SYS_PTRACE:
+ return "CAP_SYS_PTRACE"
+ case CAP_SYS_PACCT:
+ return "CAP_SYS_PACCT"
+ case CAP_SYS_ADMIN:
+ return "CAP_SYS_ADMIN"
+ case CAP_SYS_BOOT:
+ return "CAP_SYS_BOOT"
+ case CAP_SYS_NICE:
+ return "CAP_SYS_NICE"
+ case CAP_SYS_RESOURCE:
+ return "CAP_SYS_RESOURCE"
+ case CAP_SYS_TIME:
+ return "CAP_SYS_TIME"
+ case CAP_SYS_TTY_CONFIG:
+ return "CAP_SYS_TTY_CONFIG"
+ case CAP_MKNOD:
+ return "CAP_MKNOD"
+ case CAP_LEASE:
+ return "CAP_LEASE"
+ case CAP_AUDIT_WRITE:
+ return "CAP_AUDIT_WRITE"
+ case CAP_AUDIT_CONTROL:
+ return "CAP_AUDIT_CONTROL"
+ case CAP_SETFCAP:
+ return "CAP_SETFCAP"
+ case CAP_MAC_OVERRIDE:
+ return "CAP_MAC_OVERRIDE"
+ case CAP_MAC_ADMIN:
+ return "CAP_MAC_ADMIN"
+ case CAP_SYSLOG:
+ return "CAP_SYSLOG"
+ case CAP_WAKE_ALARM:
+ return "CAP_WAKE_ALARM"
+ case CAP_BLOCK_SUSPEND:
+ return "CAP_BLOCK_SUSPEND"
+ case CAP_AUDIT_READ:
+ return "CAP_AUDIT_READ"
+ default:
+ return "UNKNOWN"
+ }
+}
+
// Version numbers used by the capget/capset syscalls, defined in Linux's
// include/uapi/linux/capability.h.
const (
diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go
index 0b02f938a..cd043dac3 100644
--- a/pkg/abi/linux/mm.go
+++ b/pkg/abi/linux/mm.go
@@ -114,3 +114,12 @@ const (
MPOL_MODE_FLAGS = (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
)
+
+// Flags for mbind(2).
+const (
+ MPOL_MF_STRICT = 1 << 0
+ MPOL_MF_MOVE = 1 << 1
+ MPOL_MF_MOVE_ALL = 1 << 2
+
+ MPOL_MF_VALID = MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL
+)
diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go
index 0428282dd..391cfaa1c 100644
--- a/pkg/abi/linux/prctl.go
+++ b/pkg/abi/linux/prctl.go
@@ -155,3 +155,10 @@ const (
ARCH_GET_GS = 0x1004
ARCH_SET_CPUID = 0x1012
)
+
+// Flags for prctl(PR_SET_DUMPABLE), defined in include/linux/sched/coredump.h.
+const (
+ SUID_DUMP_DISABLE = 0
+ SUID_DUMP_USER = 1
+ SUID_DUMP_ROOT = 2
+)
diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 417840731..a714ac86d 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -102,15 +102,19 @@ const (
SOL_NETLINK = 270
)
+// A SockType is a type (as opposed to family) of sockets. These are enumerated
+// below as SOCK_* constants.
+type SockType int
+
// Socket types, from linux/net.h.
const (
- SOCK_STREAM = 1
- SOCK_DGRAM = 2
- SOCK_RAW = 3
- SOCK_RDM = 4
- SOCK_SEQPACKET = 5
- SOCK_DCCP = 6
- SOCK_PACKET = 10
+ SOCK_STREAM SockType = 1
+ SOCK_DGRAM = 2
+ SOCK_RAW = 3
+ SOCK_RDM = 4
+ SOCK_SEQPACKET = 5
+ SOCK_DCCP = 6
+ SOCK_PACKET = 10
)
// SOCK_TYPE_MASK covers all of the above socket types. The remaining bits are
@@ -200,6 +204,22 @@ const (
SS_DISCONNECTING = 4 // In process of disconnecting.
)
+// TCP protocol states, from include/net/tcp_states.h.
+const (
+ TCP_ESTABLISHED uint32 = iota + 1
+ TCP_SYN_SENT
+ TCP_SYN_RECV
+ TCP_FIN_WAIT1
+ TCP_FIN_WAIT2
+ TCP_TIME_WAIT
+ TCP_CLOSE
+ TCP_CLOSE_WAIT
+ TCP_LAST_ACK
+ TCP_LISTEN
+ TCP_CLOSING
+ TCP_NEW_SYN_RECV
+)
+
// SockAddrMax is the maximum size of a struct sockaddr, from
// uapi/linux/socket.h.
const SockAddrMax = 128
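The new SockType is carried in the same socket(2) argument as creation flags such as SOCK_NONBLOCK and SOCK_CLOEXEC; SOCK_TYPE_MASK (in the context just above) covers the type bits so the two can be separated. Below is a minimal standalone Go sketch of that decomposition; the constant values are copied from linux/net.h and fcntl.h for illustration, and splitSockArg is a hypothetical helper, not gVisor code:

```go
package main

import "fmt"

// SockType mirrors the typed socket-type constants introduced above.
type SockType int

const (
	SOCK_STREAM    SockType = 1
	SOCK_DGRAM     SockType = 2
	SOCK_SEQPACKET SockType = 5

	// SOCK_TYPE_MASK covers all socket types; the remaining bits are flags.
	SOCK_TYPE_MASK = 0xf

	SOCK_NONBLOCK = 0x800   // O_NONBLOCK
	SOCK_CLOEXEC  = 0x80000 // O_CLOEXEC
)

// splitSockArg separates the socket type from the creation flags packed into
// the type argument of socket(2).
func splitSockArg(arg int) (SockType, int) {
	return SockType(arg & SOCK_TYPE_MASK), arg &^ SOCK_TYPE_MASK
}

func main() {
	st, flags := splitSockArg(int(SOCK_STREAM) | SOCK_NONBLOCK | SOCK_CLOEXEC)
	fmt.Printf("type=%d flags=%#x\n", st, flags) // type=1 flags=0x80800
}
```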
diff --git a/pkg/memutil/BUILD b/pkg/memutil/BUILD
new file mode 100644
index 000000000..71b48a972
--- /dev/null
+++ b/pkg/memutil/BUILD
@@ -0,0 +1,11 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "memutil",
+ srcs = ["memutil_unsafe.go"],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/memutil",
+ visibility = ["//visibility:public"],
+ deps = ["@org_golang_x_sys//unix:go_default_library"],
+)
diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/memutil/memutil_unsafe.go
index 92eab8a26..979d942a9 100644
--- a/pkg/sentry/memutil/memutil_unsafe.go
+++ b/pkg/memutil/memutil_unsafe.go
@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build linux
+
+// Package memutil provides a wrapper for the memfd_create() system call.
package memutil
import (
diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/procid/BUILD
index 277509624..7c22b763a 100644
--- a/pkg/sentry/platform/procid/BUILD
+++ b/pkg/procid/BUILD
@@ -9,8 +9,8 @@ go_library(
"procid_amd64.s",
"procid_arm64.s",
],
- importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid",
- visibility = ["//pkg/sentry:internal"],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/procid",
+ visibility = ["//visibility:public"],
)
go_test(
diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/procid/procid.go
index 78b92422c..78b92422c 100644
--- a/pkg/sentry/platform/procid/procid.go
+++ b/pkg/procid/procid.go
diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/procid/procid_amd64.s
index 30ec8e6e2..30ec8e6e2 100644
--- a/pkg/sentry/platform/procid/procid_amd64.s
+++ b/pkg/procid/procid_amd64.s
diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/procid/procid_arm64.s
index e340d9f98..e340d9f98 100644
--- a/pkg/sentry/platform/procid/procid_arm64.s
+++ b/pkg/procid/procid_arm64.s
diff --git a/pkg/sentry/platform/procid/procid_net_test.go b/pkg/procid/procid_net_test.go
index b628e2285..b628e2285 100644
--- a/pkg/sentry/platform/procid/procid_net_test.go
+++ b/pkg/procid/procid_net_test.go
diff --git a/pkg/sentry/platform/procid/procid_test.go b/pkg/procid/procid_test.go
index 88dd0b3ae..88dd0b3ae 100644
--- a/pkg/sentry/platform/procid/procid_test.go
+++ b/pkg/procid/procid_test.go
diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD
index ce4f1e42c..d17b1bdcf 100644
--- a/pkg/sentry/context/contexttest/BUILD
+++ b/pkg/sentry/context/contexttest/BUILD
@@ -9,11 +9,11 @@ go_library(
importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest",
visibility = ["//pkg/sentry:internal"],
deps = [
+ "//pkg/memutil",
"//pkg/sentry/context",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
- "//pkg/sentry/memutil",
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
"//pkg/sentry/platform/ptrace",
diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go
index 210a235d2..83da40711 100644
--- a/pkg/sentry/context/contexttest/contexttest.go
+++ b/pkg/sentry/context/contexttest/contexttest.go
@@ -21,11 +21,11 @@ import (
"testing"
"time"
+ "gvisor.googlesource.com/gvisor/pkg/memutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
- "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index c0bc261a2..a0a35c242 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -805,7 +805,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans
var childDir *Dirent
err := d.genericCreate(ctx, root, name, func() error {
var e error
- childDir, e = d.Inode.Bind(ctx, name, data, perms)
+ childDir, e = d.Inode.Bind(ctx, d, name, data, perms)
if e != nil {
return e
}
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 8c1307235..f64954457 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -545,12 +545,28 @@ type lockedWriter struct {
// Write implements io.Writer.Write.
func (w *lockedWriter) Write(buf []byte) (int, error) {
- n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), w.File.offset)
- return int(n), err
+ return w.WriteAt(buf, w.File.offset)
}
// WriteAt implements io.Writer.WriteAt.
func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
- n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), offset)
- return int(n), err
+ var (
+ written int
+ err error
+ )
+ // The io.Writer contract requires that Write writes all available
+ // bytes and does not return short writes. This causes errors with
+ // io.Copy, since our own Write interface does not have this same
+ // contract. Enforce that here.
+ for written < len(buf) {
+ var n int64
+ n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written))
+ if n > 0 {
+ written += int(n)
+ }
+ if err != nil {
+ break
+ }
+ }
+ return written, err
}
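The loop added to lockedWriter.WriteAt exists because the io.Writer contract requires Write to return a non-nil error whenever it writes fewer than len(p) bytes, and io.Copy depends on that guarantee. Below is a self-contained sketch of the same full-write pattern using only the standard library; writeAll is a hypothetical helper for illustration, not gVisor code:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
)

// writeAll keeps calling w.Write until the whole buffer is consumed or an
// error occurs, so callers such as io.Copy never observe a silent short write.
func writeAll(w io.Writer, buf []byte) (int, error) {
	written := 0
	for written < len(buf) {
		n, err := w.Write(buf[written:])
		if n > 0 {
			written += n
		}
		if err != nil {
			return written, err
		}
		if n == 0 {
			// Defensive: a writer that makes no progress and reports
			// no error would otherwise spin forever.
			return written, io.ErrShortWrite
		}
	}
	return written, nil
}

func main() {
	var dst bytes.Buffer
	n, err := writeAll(&dst, []byte("hello"))
	fmt.Println(n, err, dst.String()) // 5 <nil> hello
}
```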
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index cbd5b9a84..7ac0a421f 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -15,6 +15,7 @@
package gofer
import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/p9"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
@@ -61,13 +62,13 @@ type endpoint struct {
path string
}
-func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) {
+func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) {
switch t {
- case transport.SockStream:
+ case linux.SOCK_STREAM:
return p9.StreamSocket, true
- case transport.SockSeqpacket:
+ case linux.SOCK_SEQPACKET:
return p9.SeqpacketSocket, true
- case transport.SockDgram:
+ case linux.SOCK_DGRAM:
return p9.DgramSocket, true
}
return 0, false
@@ -75,7 +76,7 @@ func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) {
// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect.
func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error {
- cf, ok := unixSockToP9(ce.Type())
+ cf, ok := sockTypeToP9(ce.Type())
if !ok {
return syserr.ErrConnectionRefused
}
@@ -139,3 +140,8 @@ func (e *endpoint) UnidirectionalConnect() (transport.ConnectedEndpoint, *syserr
func (e *endpoint) Release() {
e.inode.DecRef()
}
+
+// Passcred implements transport.BoundEndpoint.Passcred.
+func (e *endpoint) Passcred() bool {
+ return false
+}
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 3ed137006..305eea718 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -15,9 +15,11 @@
package host
import (
+ "fmt"
"sync"
"syscall"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/fd"
"gvisor.googlesource.com/gvisor/pkg/fdnotifier"
"gvisor.googlesource.com/gvisor/pkg/log"
@@ -51,25 +53,11 @@ type ConnectedEndpoint struct {
// ref keeps track of references to a connectedEndpoint.
ref refs.AtomicRefCount
- // mu protects fd, readClosed and writeClosed.
- mu sync.RWMutex `state:"nosave"`
-
- // file is an *fd.FD containing the FD backing this endpoint. It must be
- // set to nil if it has been closed.
- file *fd.FD `state:"nosave"`
-
- // readClosed is true if the FD has read shutdown or if it has been closed.
- readClosed bool
-
- // writeClosed is true if the FD has write shutdown or if it has been
- // closed.
- writeClosed bool
-
// If srfd >= 0, it is the host FD that file was imported from.
srfd int `state:"wait"`
// stype is the type of Unix socket.
- stype transport.SockType
+ stype linux.SockType
// sndbuf is the size of the send buffer.
//
@@ -78,6 +66,13 @@ type ConnectedEndpoint struct {
// prevent lots of small messages from filling the real send buffer
// size on the host.
sndbuf int `state:"nosave"`
+
+ // mu protects the fields below.
+ mu sync.RWMutex `state:"nosave"`
+
+ // file is an *fd.FD containing the FD backing this endpoint. It must be
+ // set to nil if it has been closed.
+ file *fd.FD `state:"nosave"`
}
// init performs initialization required for creating new ConnectedEndpoints and
@@ -111,7 +106,7 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
return syserr.ErrInvalidEndpointState
}
- c.stype = transport.SockType(stype)
+ c.stype = linux.SockType(stype)
c.sndbuf = sndbuf
return nil
@@ -169,7 +164,7 @@ func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.F
ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e)
- return unixsocket.NewWithDirent(ctx, d, ep, e.stype != transport.SockStream, flags), nil
+ return unixsocket.NewWithDirent(ctx, d, ep, e.stype, flags), nil
}
// newSocket allocates a new unix socket with host endpoint.
@@ -201,16 +196,13 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error)
ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e)
- return unixsocket.New(ctx, ep, e.stype != transport.SockStream), nil
+ return unixsocket.New(ctx, ep, e.stype), nil
}
// Send implements transport.ConnectedEndpoint.Send.
func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
- if c.writeClosed {
- return 0, false, syserr.ErrClosedForSend
- }
if !controlMessages.Empty() {
return 0, false, syserr.ErrInvalidEndpointState
@@ -218,7 +210,7 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.Contro
// Since stream sockets don't preserve message boundaries, we can write
// only as much of the message as fits in the send buffer.
- truncate := c.stype == transport.SockStream
+ truncate := c.stype == linux.SOCK_STREAM
n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate)
if n < totalLen && err == nil {
@@ -244,8 +236,13 @@ func (c *ConnectedEndpoint) SendNotify() {}
// CloseSend implements transport.ConnectedEndpoint.CloseSend.
func (c *ConnectedEndpoint) CloseSend() {
c.mu.Lock()
- c.writeClosed = true
- c.mu.Unlock()
+ defer c.mu.Unlock()
+
+ if err := syscall.Shutdown(c.file.FD(), syscall.SHUT_WR); err != nil {
+ // A well-formed UDS shutdown can't fail. See
+ // net/unix/af_unix.c:unix_shutdown.
+ panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err))
+ }
}
// CloseNotify implements transport.ConnectedEndpoint.CloseNotify.
@@ -255,9 +252,7 @@ func (c *ConnectedEndpoint) CloseNotify() {}
func (c *ConnectedEndpoint) Writable() bool {
c.mu.RLock()
defer c.mu.RUnlock()
- if c.writeClosed {
- return true
- }
+
return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0
}
@@ -285,9 +280,6 @@ func (c *ConnectedEndpoint) EventUpdate() {
func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
- if c.readClosed {
- return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive
- }
var cm unet.ControlMessage
if numRights > 0 {
@@ -344,31 +336,34 @@ func (c *ConnectedEndpoint) RecvNotify() {}
// CloseRecv implements transport.Receiver.CloseRecv.
func (c *ConnectedEndpoint) CloseRecv() {
c.mu.Lock()
- c.readClosed = true
- c.mu.Unlock()
+ defer c.mu.Unlock()
+
+ if err := syscall.Shutdown(c.file.FD(), syscall.SHUT_RD); err != nil {
+ // A well-formed UDS shutdown can't fail. See
+ // net/unix/af_unix.c:unix_shutdown.
+ panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err))
+ }
}
// Readable implements transport.Receiver.Readable.
func (c *ConnectedEndpoint) Readable() bool {
c.mu.RLock()
defer c.mu.RUnlock()
- if c.readClosed {
- return true
- }
+
return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0
}
// SendQueuedSize implements transport.Receiver.SendQueuedSize.
func (c *ConnectedEndpoint) SendQueuedSize() int64 {
- // SendQueuedSize isn't supported for host sockets because we don't allow the
- // sentry to call ioctl(2).
+ // TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host
+ // sockets because we don't allow the sentry to call ioctl(2).
return -1
}
// RecvQueuedSize implements transport.Receiver.RecvQueuedSize.
func (c *ConnectedEndpoint) RecvQueuedSize() int64 {
- // RecvQueuedSize isn't supported for host sockets because we don't allow the
- // sentry to call ioctl(2).
+ // TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host
+ // sockets because we don't allow the sentry to call ioctl(2).
return -1
}
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
index 06392a65a..bc3ce5627 100644
--- a/pkg/sentry/fs/host/socket_test.go
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -198,20 +198,6 @@ func TestListen(t *testing.T) {
}
}
-func TestSend(t *testing.T) {
- e := ConnectedEndpoint{writeClosed: true}
- if _, _, err := e.Send(nil, transport.ControlMessages{}, tcpip.FullAddress{}); err != syserr.ErrClosedForSend {
- t.Errorf("Got %#v.Send() = %v, want = %v", e, err, syserr.ErrClosedForSend)
- }
-}
-
-func TestRecv(t *testing.T) {
- e := ConnectedEndpoint{readClosed: true}
- if _, _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != syserr.ErrClosedForReceive {
- t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, syserr.ErrClosedForReceive)
- }
-}
-
func TestPasscred(t *testing.T) {
e := ConnectedEndpoint{}
if got, want := e.Passcred(), false; got != want {
@@ -244,20 +230,6 @@ func TestQueuedSize(t *testing.T) {
}
}
-func TestReadable(t *testing.T) {
- e := ConnectedEndpoint{readClosed: true}
- if got, want := e.Readable(), true; got != want {
- t.Errorf("Got %#v.Readable() = %t, want = %t", e, got, want)
- }
-}
-
-func TestWritable(t *testing.T) {
- e := ConnectedEndpoint{writeClosed: true}
- if got, want := e.Writable(), true; got != want {
- t.Errorf("Got %#v.Writable() = %t, want = %t", e, got, want)
- }
-}
-
func TestRelease(t *testing.T) {
f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
if err != nil {
@@ -272,131 +244,3 @@ func TestRelease(t *testing.T) {
t.Errorf("got = %#v, want = %#v", c, want)
}
}
-
-func TestClose(t *testing.T) {
- type testCase struct {
- name string
- cep *ConnectedEndpoint
- addFD bool
- f func()
- want *ConnectedEndpoint
- }
-
- var tests []testCase
-
- // nil is the value used by ConnectedEndpoint to indicate a closed file.
- // Non-nil files are used to check if the file gets closed.
-
- f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
- tests = append(tests, testCase{
- name: "First CloseRecv",
- cep: c,
- addFD: false,
- f: c.CloseRecv,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true}
- tests = append(tests, testCase{
- name: "Second CloseRecv",
- cep: c,
- addFD: false,
- f: c.CloseRecv,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
- tests = append(tests, testCase{
- name: "First CloseSend",
- cep: c,
- addFD: false,
- f: c.CloseSend,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true}
- tests = append(tests, testCase{
- name: "Second CloseSend",
- cep: c,
- addFD: false,
- f: c.CloseSend,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true}
- tests = append(tests, testCase{
- name: "CloseSend then CloseRecv",
- cep: c,
- addFD: true,
- f: c.CloseRecv,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true}
- tests = append(tests, testCase{
- name: "CloseRecv then CloseSend",
- cep: c,
- addFD: true,
- f: c.CloseSend,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true}
- tests = append(tests, testCase{
- name: "Full close then CloseRecv",
- cep: c,
- addFD: false,
- f: c.CloseRecv,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true}
- tests = append(tests, testCase{
- name: "Full close then CloseSend",
- cep: c,
- addFD: false,
- f: c.CloseSend,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
- })
-
- for _, test := range tests {
- if test.addFD {
- fdnotifier.AddFD(int32(test.cep.file.FD()), nil)
- }
- if test.f(); !reflect.DeepEqual(test.cep, test.want) {
- t.Errorf("%s: got = %#v, want = %#v", test.name, test.cep, test.want)
- }
- }
-}
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index aef1a1cb9..0b54c2e77 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -220,9 +220,9 @@ func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent,
}
// Bind calls i.InodeOperations.Bind with i as the directory.
-func (i *Inode) Bind(ctx context.Context, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
+func (i *Inode) Bind(ctx context.Context, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
if i.overlay != nil {
- return overlayBind(ctx, i.overlay, name, data, perm)
+ return overlayBind(ctx, i.overlay, parent, name, data, perm)
}
return i.InodeOperations.Bind(ctx, i, name, data, perm)
}
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index cdffe173b..06506fb20 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -398,14 +398,14 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
return nil
}
-func overlayBind(ctx context.Context, o *overlayEntry, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
+func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
+ if err := copyUp(ctx, parent); err != nil {
+ return nil, err
+ }
+
o.copyMu.RLock()
defer o.copyMu.RUnlock()
- // We do not support doing anything exciting with sockets unless there
- // is already a directory in the upper filesystem.
- if o.upper == nil {
- return nil, syserror.EOPNOTSUPP
- }
+
d, err := o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm)
if err != nil {
return nil, err
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index d19c360e0..1728fe0b5 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -45,6 +45,7 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/mm",
+ "//pkg/sentry/socket",
"//pkg/sentry/socket/rpcinet",
"//pkg/sentry/socket/unix",
"//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go
index 379569823..986bc0a45 100644
--- a/pkg/sentry/fs/proc/inode.go
+++ b/pkg/sentry/fs/proc/inode.go
@@ -21,11 +21,14 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
// taskOwnedInodeOps wraps an fs.InodeOperations and overrides the UnstableAttr
-// method to return the task as the owner.
+// method to return either the task or root as the owner, depending on the
+// task's dumpability.
//
// +stateify savable
type taskOwnedInodeOps struct {
@@ -41,9 +44,42 @@ func (i *taskOwnedInodeOps) UnstableAttr(ctx context.Context, inode *fs.Inode) (
if err != nil {
return fs.UnstableAttr{}, err
}
- // Set the task owner as the file owner.
+
+ // By default, set the task owner as the file owner.
creds := i.t.Credentials()
uattr.Owner = fs.FileOwner{creds.EffectiveKUID, creds.EffectiveKGID}
+
+ // Linux doesn't apply dumpability adjustments to world
+ // readable/executable directories so that applications can stat
+ // /proc/PID to determine the effective UID of a process. See
+ // fs/proc/base.c:task_dump_owner.
+ if fs.IsDir(inode.StableAttr) && uattr.Perms == fs.FilePermsFromMode(0555) {
+ return uattr, nil
+ }
+
+ // If the task is not dumpable, then root (in the namespace preferred)
+ // owns the file.
+ var m *mm.MemoryManager
+ i.t.WithMuLocked(func(t *kernel.Task) {
+ m = t.MemoryManager()
+ })
+
+ if m == nil {
+ uattr.Owner.UID = auth.RootKUID
+ uattr.Owner.GID = auth.RootKGID
+ } else if m.Dumpability() != mm.UserDumpable {
+ if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() {
+ uattr.Owner.UID = kuid
+ } else {
+ uattr.Owner.UID = auth.RootKUID
+ }
+ if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() {
+ uattr.Owner.GID = kgid
+ } else {
+ uattr.Owner.GID = auth.RootKGID
+ }
+ }
+
return uattr, nil
}
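The ownership logic above mirrors Linux's fs/proc/base.c:task_dump_owner: the world-readable 0555 directories stay owned by the task so a plain stat of /proc/PID still reveals its effective UID, while other entries fall back to root when the task is not dumpable. Below is a small Linux-only sketch of the stat-based lookup that this behavior preserves (illustrative, not gVisor code):

```go
package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	// Stat /proc/<pid> and read the owner UID, which Linux keeps equal to
	// the task's effective UID for the 0555 proc directories.
	fi, err := os.Stat(fmt.Sprintf("/proc/%d", os.Getpid()))
	if err != nil {
		panic(err)
	}
	st := fi.Sys().(*syscall.Stat_t)
	fmt.Println("effective owner uid:", st.Uid)
}
```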
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 4a107c739..034950158 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -27,6 +27,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
"gvisor.googlesource.com/gvisor/pkg/sentry/inet"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
)
@@ -213,17 +214,18 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
fmt.Fprintf(&buf, "Num RefCount Protocol Flags Type St Inode Path\n")
// Entries
- for _, sref := range n.k.ListSockets(linux.AF_UNIX) {
- s := sref.Get()
+ for _, se := range n.k.ListSockets() {
+ s := se.Sock.Get()
if s == nil {
- log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", sref)
+ log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
continue
}
sfile := s.(*fs.File)
- sops, ok := sfile.FileOperations.(*unix.SocketOperations)
- if !ok {
- panic(fmt.Sprintf("Found non-unix socket file in unix socket table: %+v", sfile))
+ if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+ // Not a unix socket.
+ continue
}
+ sops := sfile.FileOperations.(*unix.SocketOperations)
addr, err := sops.Endpoint().GetLocalAddress()
if err != nil {
@@ -240,24 +242,6 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
}
}
- var sockState int
- switch sops.Endpoint().Type() {
- case linux.SOCK_DGRAM:
- sockState = linux.SS_CONNECTING
- // Unlike Linux, we don't have unbound connection-less sockets,
- // so no SS_DISCONNECTING.
-
- case linux.SOCK_SEQPACKET:
- fallthrough
- case linux.SOCK_STREAM:
- // Connectioned.
- if sops.Endpoint().(transport.ConnectingEndpoint).Connected() {
- sockState = linux.SS_CONNECTED
- } else {
- sockState = linux.SS_UNCONNECTED
- }
- }
-
// In the socket entry below, the value for the 'Num' field requires
// some consideration. Linux prints the address to the struct
// unix_sock representing a socket in the kernel, but may redact the
@@ -282,7 +266,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
0, // Protocol, always 0 for UDS.
sockFlags, // Flags.
sops.Endpoint().Type(), // Type.
- sockState, // State.
+ sops.State(), // State.
sfile.InodeID(), // Inode.
)
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 77e03d349..21a965f90 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -96,7 +96,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks boo
contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers)
}
- // TODO(b/31916171): Set EUID/EGID based on dumpability.
+ // N.B. taskOwnedInodeOps enforces dumpability-based ownership.
d := &taskDir{
Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)),
t: t,
@@ -667,6 +667,21 @@ func newComm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
return newProcInode(c, msrc, fs.SpecialFile, t)
}
+// Check implements fs.InodeOperations.Check.
+func (c *comm) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+ // This file can always be read or written by members of the same
+ // thread group. See fs/proc/base.c:proc_tid_comm_permission.
+ //
+ // N.B. This check is currently a no-op as we don't yet support writing
+	// and this file is world-readable anyway.
+ t := kernel.TaskFromContext(ctx)
+ if t != nil && t.ThreadGroup() == c.t.ThreadGroup() && !p.Execute {
+ return true
+ }
+
+ return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
// GetFile implements fs.InodeOperations.GetFile.
func (c *comm) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
return fs.NewFile(ctx, dirent, flags, &commFile{t: c.t}), nil
diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go
index bce5f091d..c1721f434 100644
--- a/pkg/sentry/fs/timerfd/timerfd.go
+++ b/pkg/sentry/fs/timerfd/timerfd.go
@@ -54,6 +54,8 @@ type TimerOperations struct {
// NewFile returns a timerfd File that receives time from c.
func NewFile(ctx context.Context, c ktime.Clock) *fs.File {
dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[timerfd]")
+ // Release the initial dirent reference after NewFile takes a reference.
+ defer dirent.DecRef()
tops := &TimerOperations{}
tops.timer = ktime.NewTimer(c, tops)
// Timerfds reject writes, but the Write flag must be set in order to
diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index b7c29a4d1..83e1bf247 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -34,6 +34,16 @@ const (
// GID for the root directory.
rootGIDKey = "gid"
+ // cacheKey sets the caching policy for the mount.
+ cacheKey = "cache"
+
+ // cacheAll uses the virtual file system cache for everything (default).
+ cacheAll = "cache"
+
+ // cacheRevalidate allows dirents to be cached, but revalidates them on each
+ // lookup.
+ cacheRevalidate = "revalidate"
+
// TODO(edahlgren/mpratt): support a tmpfs size limit.
// size = "size"
@@ -122,15 +132,24 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou
delete(options, rootGIDKey)
}
+ // Construct a mount which will follow the cache options provided.
+ var msrc *fs.MountSource
+ switch options[cacheKey] {
+ case "", cacheAll:
+ msrc = fs.NewCachingMountSource(f, flags)
+ case cacheRevalidate:
+ msrc = fs.NewRevalidatingMountSource(f, flags)
+ default:
+ return nil, fmt.Errorf("invalid cache policy option %q", options[cacheKey])
+ }
+ delete(options, cacheKey)
+
// Fail if the caller passed us more options than we can parse. They may be
// expecting us to set something we can't set.
if len(options) > 0 {
return nil, fmt.Errorf("unsupported mount options: %v", options)
}
- // Construct a mount which will cache dirents.
- msrc := fs.NewCachingMountSource(f, flags)
-
// Construct the tmpfs root.
return NewDir(ctx, nil, owner, perms, msrc), nil
}
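
A standalone sketch of the cache-option parsing added above; names here are illustrative, and the real code returns an fs.MountSource rather than an enum.

package main

import "fmt"

type cachePolicy int

const (
	cacheAll        cachePolicy = iota // default: use the VFS cache for everything
	cacheRevalidate                    // cache dirents but revalidate them on lookup
)

func parseCachePolicy(options map[string]string) (cachePolicy, error) {
	v := options["cache"]
	delete(options, "cache") // consume the option so leftovers can be rejected
	switch v {
	case "", "cache":
		return cacheAll, nil
	case "revalidate":
		return cacheRevalidate, nil
	default:
		return 0, fmt.Errorf("invalid cache policy option %q", v)
	}
}

func main() {
	opts := map[string]string{"cache": "revalidate", "mode": "0777"}
	p, err := parseCachePolicy(opts)
	fmt.Println(p, err, opts) // 1 <nil> map[mode:0777]
}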
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
new file mode 100644
index 000000000..1a4632a54
--- /dev/null
+++ b/pkg/sentry/hostmm/BUILD
@@ -0,0 +1,18 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "hostmm",
+ srcs = [
+ "cgroup.go",
+ "hostmm.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/fd",
+ "//pkg/log",
+ "//pkg/sentry/usermem",
+ ],
+)
diff --git a/pkg/sentry/hostmm/cgroup.go b/pkg/sentry/hostmm/cgroup.go
new file mode 100644
index 000000000..e5cc26ab2
--- /dev/null
+++ b/pkg/sentry/hostmm/cgroup.go
@@ -0,0 +1,111 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostmm
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "path"
+ "strings"
+)
+
+// currentCgroupDirectory returns the directory for the cgroup for the given
+// controller in which the calling process resides.
+func currentCgroupDirectory(ctrl string) (string, error) {
+ root, err := cgroupRootDirectory(ctrl)
+ if err != nil {
+ return "", err
+ }
+ cg, err := currentCgroup(ctrl)
+ if err != nil {
+ return "", err
+ }
+ return path.Join(root, cg), nil
+}
+
+// cgroupRootDirectory returns the root directory for the cgroup hierarchy in
+// which the given cgroup controller is mounted in the calling process' mount
+// namespace.
+func cgroupRootDirectory(ctrl string) (string, error) {
+ const path = "/proc/self/mounts"
+ file, err := os.Open(path)
+ if err != nil {
+ return "", err
+ }
+ defer file.Close()
+
+ // Per proc(5) -> fstab(5):
+ // Each line of /proc/self/mounts describes a mount.
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ // Each line consists of 6 space-separated fields. Find the line for
+ // which the third field (fs_vfstype) is cgroup, and the fourth field
+ // (fs_mntops, a comma-separated list of mount options) contains
+ // ctrl.
+ var spec, file, vfstype, mntopts, freq, passno string
+ const nrfields = 6
+ line := scanner.Text()
+ n, err := fmt.Sscan(line, &spec, &file, &vfstype, &mntopts, &freq, &passno)
+ if err != nil {
+ return "", fmt.Errorf("failed to parse %s: %v", path, err)
+ }
+ if n != nrfields {
+ return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, n, nrfields)
+ }
+ if vfstype != "cgroup" {
+ continue
+ }
+ for _, mntopt := range strings.Split(mntopts, ",") {
+ if mntopt == ctrl {
+ return file, nil
+ }
+ }
+ }
+ return "", fmt.Errorf("no cgroup hierarchy mounted for controller %s", ctrl)
+}
+
+// currentCgroup returns the cgroup for the given controller in which the
+// calling process resides. The returned string is a path that should be
+// interpreted as relative to cgroupRootDirectory(ctrl).
+func currentCgroup(ctrl string) (string, error) {
+ const path = "/proc/self/cgroup"
+ file, err := os.Open(path)
+ if err != nil {
+ return "", err
+ }
+ defer file.Close()
+
+ // Per proc(5) -> cgroups(7):
+	// Each line of /proc/self/cgroup describes a cgroup hierarchy.
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ // Each line consists of 3 colon-separated fields. Find the line for
+ // which the second field (controller-list, a comma-separated list of
+ // cgroup controllers) contains ctrl.
+ line := scanner.Text()
+ const nrfields = 3
+ fields := strings.Split(line, ":")
+ if len(fields) != nrfields {
+ return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, len(fields), nrfields)
+ }
+ for _, controller := range strings.Split(fields[1], ",") {
+ if controller == ctrl {
+ return fields[2], nil
+ }
+ }
+ }
+ return "", fmt.Errorf("not a member of a cgroup hierarchy for controller %s", ctrl)
+}
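
A worked, standalone example (with hard-coded sample lines) of how the two parses above combine: the cgroup mount point found in /proc/self/mounts is joined with the controller's relative path from /proc/self/cgroup.

package main

import (
	"fmt"
	"path"
	"strings"
)

func main() {
	// A typical cgroup-v1 line from /proc/self/mounts:
	// fs_spec fs_file fs_vfstype fs_mntops fs_freq fs_passno
	mountsLine := "cgroup /sys/fs/cgroup/memory cgroup rw,nosuid,nodev,noexec,relatime,memory 0 0"
	fields := strings.Fields(mountsLine)
	root := ""
	if fields[2] == "cgroup" {
		for _, opt := range strings.Split(fields[3], ",") {
			if opt == "memory" {
				root = fields[1]
			}
		}
	}

	// A typical line from /proc/self/cgroup: hierarchy-ID:controller-list:cgroup-path.
	cgroupLine := "4:memory:/user.slice"
	parts := strings.Split(cgroupLine, ":")
	rel := ""
	for _, ctrl := range strings.Split(parts[1], ",") {
		if ctrl == "memory" {
			rel = parts[2]
		}
	}

	fmt.Println(path.Join(root, rel)) // /sys/fs/cgroup/memory/user.slice
}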
diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go
new file mode 100644
index 000000000..5432cada9
--- /dev/null
+++ b/pkg/sentry/hostmm/hostmm.go
@@ -0,0 +1,130 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package hostmm provides tools for interacting with the host Linux kernel's
+// virtual memory management subsystem.
+package hostmm
+
+import (
+ "fmt"
+ "os"
+ "path"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// NotifyCurrentMemcgPressureCallback requests that f is called whenever the
+// calling process' memory cgroup indicates memory pressure of the given level,
+// as specified by Linux's Documentation/cgroup-v1/memory.txt.
+//
+// If NotifyCurrentMemcgPressureCallback succeeds, it returns a function that
+// terminates the requested memory pressure notifications. This function may be
+// called at most once.
+func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) {
+ cgdir, err := currentCgroupDirectory("memory")
+ if err != nil {
+ return nil, err
+ }
+
+ pressurePath := path.Join(cgdir, "memory.pressure_level")
+ pressureFile, err := os.Open(pressurePath)
+ if err != nil {
+ return nil, err
+ }
+ defer pressureFile.Close()
+
+ eventControlPath := path.Join(cgdir, "cgroup.event_control")
+ eventControlFile, err := os.OpenFile(eventControlPath, os.O_WRONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ defer eventControlFile.Close()
+
+ eventFD, err := newEventFD()
+ if err != nil {
+ return nil, err
+ }
+
+ // Don't use fmt.Fprintf since the whole string needs to be written in a
+ // single syscall.
+ eventControlStr := fmt.Sprintf("%d %d %s", eventFD.FD(), pressureFile.Fd(), level)
+ if n, err := eventControlFile.Write([]byte(eventControlStr)); n != len(eventControlStr) || err != nil {
+ eventFD.Close()
+ return nil, fmt.Errorf("error writing %q to %s: got (%d, %v), wanted (%d, nil)", eventControlStr, eventControlPath, n, err, len(eventControlStr))
+ }
+
+ log.Debugf("Receiving memory pressure level notifications from %s at level %q", pressurePath, level)
+ const sizeofUint64 = 8
+ // The most significant bit of the eventfd value is set by the stop
+ // function, which is practically unambiguous since it's not plausible for
+ // 2**63 pressure events to occur between eventfd reads.
+ const stopVal = 1 << 63
+ stopCh := make(chan struct{})
+ go func() { // S/R-SAFE: f provides synchronization if necessary
+ rw := fd.NewReadWriter(eventFD.FD())
+ var buf [sizeofUint64]byte
+ for {
+ n, err := rw.Read(buf[:])
+ if err != nil {
+ if err == syscall.EINTR {
+ continue
+ }
+ panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err))
+ }
+ if n != sizeofUint64 {
+ panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64))
+ }
+ val := usermem.ByteOrder.Uint64(buf[:])
+ if val >= stopVal {
+ // Assume this was due to the notifier's "destructor" (the
+ // function returned by NotifyCurrentMemcgPressureCallback
+ // below) being called.
+ eventFD.Close()
+ close(stopCh)
+ return
+ }
+ f()
+ }
+ }()
+ return func() {
+ rw := fd.NewReadWriter(eventFD.FD())
+ var buf [sizeofUint64]byte
+ usermem.ByteOrder.PutUint64(buf[:], stopVal)
+ for {
+ n, err := rw.Write(buf[:])
+ if err != nil {
+ if err == syscall.EINTR {
+ continue
+ }
+ panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err))
+ }
+ if n != sizeofUint64 {
+ panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64))
+ }
+ break
+ }
+ <-stopCh
+ }, nil
+}
+
+func newEventFD() (*fd.FD, error) {
+ f, _, e := syscall.Syscall(syscall.SYS_EVENTFD2, 0, 0, 0)
+ if e != 0 {
+ return nil, fmt.Errorf("failed to create eventfd: %v", e)
+ }
+ return fd.New(int(f)), nil
+}
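
A hypothetical usage sketch of the exported helper above. The import path comes from the BUILD file in this change, but the package's Bazel visibility is sentry-internal, so this is purely illustrative; it also needs a host with a cgroup-v1 memory controller.

package main

import (
	"log"
	"time"

	"gvisor.googlesource.com/gvisor/pkg/sentry/hostmm"
)

func main() {
	stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
		log.Println("memory cgroup reported low-level pressure")
	}, "low")
	if err != nil {
		log.Fatalf("could not register for pressure notifications: %v", err)
	}
	// ... run the work that should respond to memory pressure ...
	time.Sleep(time.Minute)
	stop() // terminate notifications; may be called at most once
}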
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 99a2fd964..04e375910 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -64,6 +64,18 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "socket_list",
+ out = "socket_list.go",
+ package = "kernel",
+ prefix = "socket",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*SocketEntry",
+ "Linker": "*SocketEntry",
+ },
+)
+
proto_library(
name = "uncaught_signal_proto",
srcs = ["uncaught_signal.proto"],
@@ -104,6 +116,7 @@ go_library(
"sessions.go",
"signal.go",
"signal_handlers.go",
+ "socket_list.go",
"syscalls.go",
"syscalls_state.go",
"syslog.go",
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index bbacba1f4..43ae22a5d 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -156,6 +156,8 @@ var cycleMu sync.Mutex
func NewEventPoll(ctx context.Context) *fs.File {
// name matches fs/eventpoll.c:epoll_create1.
dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
+ // Release the initial dirent reference after NewFile takes a reference.
+ defer dirent.DecRef()
return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
files: make(map[FileIdentifier]*pollEntry),
})
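
The "defer dirent.DecRef()" lines added here and in the timerfd and eventfd hunks all follow one ownership rule; a toy sketch of that rule with stand-in types (not the sentry's refs package):

package main

import "fmt"

type dirent struct{ refs int }

func newDirent() *dirent { return &dirent{refs: 1} } // the caller owns this initial reference

func (d *dirent) IncRef() { d.refs++ }

func (d *dirent) DecRef() {
	d.refs--
	if d.refs == 0 {
		fmt.Println("dirent destroyed")
	}
}

type file struct{ d *dirent }

// newFile takes its own reference on d, mirroring fs.NewFile.
func newFile(d *dirent) *file {
	d.IncRef()
	return &file{d: d}
}

// DecRef releases the file, dropping its reference on the dirent.
func (f *file) DecRef() { f.d.DecRef() }

func main() {
	d := newDirent()
	defer d.DecRef() // drop the initial reference; the file keeps its own
	f := newFile(d)
	f.DecRef() // the last reference is released here or in the deferred call
}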
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 2f900be38..fe474cbf0 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -69,6 +69,8 @@ type EventOperations struct {
func New(ctx context.Context, initVal uint64, semMode bool) *fs.File {
// name matches fs/eventfd.c:eventfd_file_create.
dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]")
+ // Release the initial dirent reference after NewFile takes a reference.
+ defer dirent.DecRef()
return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{
val: initVal,
semMode: semMode,
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 85d73ace2..f253a81d9 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -182,9 +182,13 @@ type Kernel struct {
// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
- // socketTable is used to track all sockets on the system. Protected by
+	// sockets is the list of all network sockets in the system. Protected by
// extMu.
- socketTable map[int]map[*refs.WeakRef]struct{}
+ sockets socketList
+
+ // nextSocketEntry is the next entry number to use in sockets. Protected
+ // by extMu.
+ nextSocketEntry uint64
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -283,7 +287,6 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
k.futexes = futex.NewManager()
k.netlinkPorts = port.New()
- k.socketTable = make(map[int]map[*refs.WeakRef]struct{})
return nil
}
@@ -1137,51 +1140,43 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
})
}
-// socketEntry represents a socket recorded in Kernel.socketTable. It implements
+// SocketEntry represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
-type socketEntry struct {
- k *Kernel
- sock *refs.WeakRef
- family int
+type SocketEntry struct {
+ socketEntry
+ k *Kernel
+ Sock *refs.WeakRef
+ ID uint64 // Socket table entry number.
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (s *socketEntry) WeakRefGone() {
+func (s *SocketEntry) WeakRefGone() {
s.k.extMu.Lock()
- // k.socketTable is guaranteed to point to a valid socket table for s.family
- // at this point, since we made sure of the fact when we created this
- // socketEntry, and we never delete socket tables.
- delete(s.k.socketTable[s.family], s.sock)
+ s.k.sockets.Remove(s)
s.k.extMu.Unlock()
}
// RecordSocket adds a socket to the system-wide socket table for tracking.
//
// Precondition: Caller must hold a reference to sock.
-func (k *Kernel) RecordSocket(sock *fs.File, family int) {
+func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Lock()
- table, ok := k.socketTable[family]
- if !ok {
- table = make(map[*refs.WeakRef]struct{})
- k.socketTable[family] = table
- }
- se := socketEntry{k: k, family: family}
- se.sock = refs.NewWeakRef(sock, &se)
- table[se.sock] = struct{}{}
+ id := k.nextSocketEntry
+ k.nextSocketEntry++
+ s := &SocketEntry{k: k, ID: id}
+ s.Sock = refs.NewWeakRef(sock, s)
+ k.sockets.PushBack(s)
k.extMu.Unlock()
}
-// ListSockets returns a snapshot of all sockets of a given family.
-func (k *Kernel) ListSockets(family int) []*refs.WeakRef {
+// ListSockets returns a snapshot of all sockets.
+func (k *Kernel) ListSockets() []*SocketEntry {
k.extMu.Lock()
- socks := []*refs.WeakRef{}
- if table, ok := k.socketTable[family]; ok {
- socks = make([]*refs.WeakRef, 0, len(table))
- for s := range table {
- socks = append(socks, s)
- }
+ var socks []*SocketEntry
+ for s := k.sockets.Front(); s != nil; s = s.Next() {
+ socks = append(socks, s)
}
k.extMu.Unlock()
return socks
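
The consumer-side pattern implied by the new ListSockets API (already visible in the proc/net.go hunk above), sketched standalone with stand-in types for refs.WeakRef and SocketEntry:

package main

import "fmt"

type socket struct{ family int }

// weakRef stands in for refs.WeakRef: Get returns nil if the referent has
// already been destroyed.
type weakRef struct{ s *socket }

func (w *weakRef) Get() *socket { return w.s }

type socketEntry struct {
	ID   uint64
	Sock *weakRef
}

func main() {
	entries := []*socketEntry{
		{ID: 0, Sock: &weakRef{&socket{family: 1 /* AF_UNIX */}}},
		{ID: 1, Sock: &weakRef{nil}}, // racing with destruction
		{ID: 2, Sock: &weakRef{&socket{family: 2 /* AF_INET */}}},
	}
	for _, se := range entries {
		s := se.Sock.Get()
		if s == nil {
			continue // destroyed between snapshot and use
		}
		if s.family != 1 {
			continue // not a unix socket; the table now holds every family
		}
		fmt.Printf("unix socket entry %d\n", se.ID)
	}
}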
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 926c4c623..dc7da529e 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -38,7 +38,11 @@ type inodeOperations struct {
fsutil.InodeNotMappable `state:"nosave"`
fsutil.InodeNotSocket `state:"nosave"`
fsutil.InodeNotSymlink `state:"nosave"`
- fsutil.InodeNotVirtual `state:"nosave"`
+
+ // Marking pipe inodes as virtual allows them to be saved and restored
+ // even if they have been unlinked. We can get away with this because
+ // their state exists entirely within the sentry.
+ fsutil.InodeVirtual `state:"nosave"`
fsutil.InodeSimpleAttributes
@@ -86,7 +90,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
switch {
case flags.Read && !flags.Write: // O_RDONLY.
- r := i.p.Open(ctx, flags)
+ r := i.p.Open(ctx, d, flags)
i.newHandleLocked(&i.rWakeup)
if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
@@ -102,7 +106,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
return r, nil
case flags.Write && !flags.Read: // O_WRONLY.
- w := i.p.Open(ctx, flags)
+ w := i.p.Open(ctx, d, flags)
i.newHandleLocked(&i.wWakeup)
if i.p.isNamed && !i.p.HasReaders() {
@@ -122,7 +126,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
case flags.Read && flags.Write: // O_RDWR.
 		// Pipes opened for read-write always succeed without blocking.
- rw := i.p.Open(ctx, flags)
+ rw := i.p.Open(ctx, d, flags)
i.newHandleLocked(&i.rWakeup)
i.newHandleLocked(&i.wWakeup)
return rw, nil
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index 31d9b0443..9a946b380 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -62,7 +62,9 @@ var perms fs.FilePermissions = fs.FilePermissions{
}
func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) {
- file, err := n.GetFile(ctx, nil, flags)
+ inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe})
+ d := fs.NewDirent(inode, "pipe")
+ file, err := n.GetFile(ctx, d, flags)
if err != nil {
t.Fatalf("open with flags %+v failed: %v", flags, err)
}
@@ -73,7 +75,9 @@ func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flag
}
func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) {
- file, err := n.GetFile(ctx, nil, flags)
+ inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe})
+ d := fs.NewDirent(inode, "pipe")
+ file, err := n.GetFile(ctx, d, flags)
if resChan != nil {
resChan <- openResult{file, err}
}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index b65204492..73438dc62 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -71,11 +71,6 @@ type Pipe struct {
// This value is immutable.
atomicIOBytes int64
- // The dirent backing this pipe. Shared by all readers and writers.
- //
- // This value is immutable.
- Dirent *fs.Dirent
-
// The number of active readers for this pipe.
//
// Access atomically.
@@ -130,14 +125,20 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64)
if atomicIOBytes > sizeBytes {
atomicIOBytes = sizeBytes
}
- p := &Pipe{
+ return &Pipe{
isNamed: isNamed,
max: sizeBytes,
atomicIOBytes: atomicIOBytes,
}
+}
- // Build the fs.Dirent of this pipe, shared by all fs.Files associated
- // with this pipe.
+// NewConnectedPipe initializes a pipe and returns a pair of objects
+// representing the read and write ends of the pipe.
+func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) {
+ p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes)
+
+ // Build an fs.Dirent for the pipe which will be shared by both
+ // returned files.
perms := fs.FilePermissions{
User: fs.PermMask{Read: true, Write: true},
}
@@ -150,36 +151,32 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64)
BlockSize: int64(atomicIOBytes),
}
ms := fs.NewPseudoMountSource()
- p.Dirent = fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
- return p
-}
-
-// NewConnectedPipe initializes a pipe and returns a pair of objects
-// representing the read and write ends of the pipe.
-func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) {
- p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes)
- return p.Open(ctx, fs.FileFlags{Read: true}), p.Open(ctx, fs.FileFlags{Write: true})
+ d := fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
+ // The p.Open calls below will each take a reference on the Dirent. We
+ // must drop the one we already have.
+ defer d.DecRef()
+ return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true})
}
// Open opens the pipe and returns a new file.
//
// Precondition: at least one of flags.Read or flags.Write must be set.
-func (p *Pipe) Open(ctx context.Context, flags fs.FileFlags) *fs.File {
+func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.File {
switch {
case flags.Read && flags.Write:
p.rOpen()
p.wOpen()
- return fs.NewFile(ctx, p.Dirent, flags, &ReaderWriter{
+ return fs.NewFile(ctx, d, flags, &ReaderWriter{
Pipe: p,
})
case flags.Read:
p.rOpen()
- return fs.NewFile(ctx, p.Dirent, flags, &Reader{
+ return fs.NewFile(ctx, d, flags, &Reader{
ReaderWriter: ReaderWriter{Pipe: p},
})
case flags.Write:
p.wOpen()
- return fs.NewFile(ctx, p.Dirent, flags, &Writer{
+ return fs.NewFile(ctx, d, flags, &Writer{
ReaderWriter: ReaderWriter{Pipe: p},
})
default:
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 4423e7efd..193447b17 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -19,6 +19,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
@@ -92,6 +93,14 @@ const (
// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it
// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access
// mode PTRACE_MODE_READ.
+//
+// NOTE(b/30815691): The result of CanTrace is immediately stale (e.g., a
+// racing setuid(2) may change traceability). This may pose a risk when a task
+// changes from traceable to not traceable. This is only problematic across
+// execve, where privileges may increase.
+//
+// We currently do not implement privileged executables (set-user/group-ID bits
+// and file capabilities), so that case is not reachable.
func (t *Task) CanTrace(target *Task, attach bool) bool {
// "1. If the calling thread and the target thread are in the same thread
// group, access is always allowed." - ptrace(2)
@@ -162,7 +171,13 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID {
return false
}
- // TODO(b/31916171): dumpability check
+ var targetMM *mm.MemoryManager
+ target.WithMuLocked(func(t *Task) {
+ targetMM = t.MemoryManager()
+ })
+ if targetMM != nil && targetMM.Dumpability() != mm.UserDumpable {
+ return false
+ }
if callerCreds.UserNamespace != targetCreds.UserNamespace {
return false
}
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 0572053db..27cd3728b 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -32,6 +32,51 @@ import (
// syscall.
const maxSyscallNum = 2000
+// SyscallSupportLevel indicates the level of support for a syscall.
+type SyscallSupportLevel int
+
+// String returns a human-readable representation of the support level.
+func (l SyscallSupportLevel) String() string {
+ switch l {
+ case SupportUnimplemented:
+ return "Unimplemented"
+ case SupportPartial:
+ return "Partial Support"
+ case SupportFull:
+ return "Full Support"
+ default:
+ return "Undocumented"
+ }
+}
+
+const (
+ // SupportUndocumented indicates the syscall is not documented yet.
+ SupportUndocumented = iota
+
+ // SupportUnimplemented indicates the syscall is unimplemented.
+ SupportUnimplemented
+
+ // SupportPartial indicates the syscall is partially supported.
+ SupportPartial
+
+ // SupportFull indicates the syscall is fully supported.
+ SupportFull
+)
+
+// Syscall includes the syscall implementation and compatibility information.
+type Syscall struct {
+ // Name is the syscall name.
+ Name string
+ // Fn is the implementation of the syscall.
+ Fn SyscallFn
+ // SupportLevel is the level of support implemented in gVisor.
+ SupportLevel SyscallSupportLevel
+ // Note describes the compatibility of the syscall.
+ Note string
+	// URLs is a set of URLs to any relevant bugs or issues.
+ URLs []string
+}
+
// SyscallFn is a syscall implementation.
type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
@@ -83,7 +128,7 @@ type SyscallFlagsTable struct {
// Init initializes the struct, with all syscalls in table set to enabled.
//
// max is the largest syscall number in table.
-func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) {
+func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) {
e.enable = make([]uint32, max+1)
for num := range table {
e.enable[num] = syscallPresent
@@ -194,7 +239,7 @@ type SyscallTable struct {
AuditNumber uint32 `state:"manual"`
// Table is the collection of functions.
- Table map[uintptr]SyscallFn `state:"manual"`
+ Table map[uintptr]Syscall `state:"manual"`
// lookup is a fixed-size array that holds the syscalls (indexed by
// their numbers). It is used for fast look ups.
@@ -247,7 +292,7 @@ func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
func RegisterSyscallTable(s *SyscallTable) {
if s.Table == nil {
// Ensure non-nil lookup table.
- s.Table = make(map[uintptr]SyscallFn)
+ s.Table = make(map[uintptr]Syscall)
}
if s.Emulate == nil {
// Ensure non-nil emulate table.
@@ -268,8 +313,8 @@ func RegisterSyscallTable(s *SyscallTable) {
s.lookup = make([]SyscallFn, max+1)
// Initialize the fast-lookup table.
- for num, fn := range s.Table {
- s.lookup[num] = fn
+ for num, sc := range s.Table {
+ s.lookup[num] = sc.Fn
}
s.FeatureEnable.init(s.Table, max)
@@ -303,5 +348,8 @@ func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
// mapLookup is similar to Lookup, except that it only uses the syscall table,
// that is, it skips the fast look array. This is available for benchmarking.
func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn {
- return s.Table[sysno]
+ if sc, ok := s.Table[sysno]; ok {
+ return sc.Fn
+ }
+ return nil
}
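
A trimmed, standalone sketch of what a table entry looks like with the new Syscall struct. The types mirror the ones added above, but the getpid entry, its number, and the placeholder SyscallFn signature are illustrative, not taken from this change.

package main

import "fmt"

type SyscallSupportLevel int

const (
	SupportUndocumented SyscallSupportLevel = iota
	SupportUnimplemented
	SupportPartial
	SupportFull
)

// SyscallFn is a simplified stand-in for the kernel package's SyscallFn.
type SyscallFn func(args ...uintptr) (uintptr, error)

type Syscall struct {
	Name         string
	Fn           SyscallFn
	SupportLevel SyscallSupportLevel
	Note         string
	URLs         []string
}

var table = map[uintptr]Syscall{
	39: { // hypothetical entry for getpid on amd64
		Name:         "getpid",
		Fn:           func(args ...uintptr) (uintptr, error) { return 42, nil },
		SupportLevel: SupportFull,
	},
}

func main() {
	// The fast-lookup path only needs the function, mirroring
	// s.lookup[num] = sc.Fn in the hunk above.
	if sc, ok := table[39]; ok {
		v, _ := sc.Fn()
		fmt.Println(sc.Name, sc.SupportLevel, v)
	}
}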
diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go
index 8f7cdb9f3..3f2b042c8 100644
--- a/pkg/sentry/kernel/table_test.go
+++ b/pkg/sentry/kernel/table_test.go
@@ -26,11 +26,13 @@ const (
)
func createSyscallTable() *SyscallTable {
- m := make(map[uintptr]SyscallFn)
+ m := make(map[uintptr]Syscall)
for i := uintptr(0); i <= maxTestSyscall; i++ {
j := i
- m[i] = func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) {
- return j, nil, nil
+ m[i] = Syscall{
+ Fn: func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) {
+ return j, nil, nil
+ },
}
}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index f9378c2de..4d889422f 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -455,12 +455,13 @@ type Task struct {
// single numa node, all policies are no-ops. We only track this information
// so that we can return reasonable values if the application calls
// get_mempolicy(2) after setting a non-default policy. Note that in the
- // real syscall, nodemask can be longer than 4 bytes, but we always report a
- // single node so never need to save more than a single bit.
+ // real syscall, nodemask can be longer than a single unsigned long, but we
+ // always report a single node so never need to save more than a single
+ // bit.
//
// numaPolicy and numaNodeMask are protected by mu.
numaPolicy int32
- numaNodeMask uint32
+ numaNodeMask uint64
// If netns is true, the task is in a non-root network namespace. Network
// namespaces aren't currently implemented in full; being in a network
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 5d1425d5c..35d5cb90c 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -68,6 +68,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
@@ -198,6 +199,12 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
return flags.CloseOnExec
})
+ // NOTE(b/30815691): We currently do not implement privileged
+ // executables (set-user/group-ID bits and file capabilities). This
+ // allows us to unconditionally enable user dumpability on the new mm.
+ // See fs/exec.c:setup_new_exec.
+ r.tc.MemoryManager.SetDumpability(mm.UserDumpable)
+
// Switch to the new process.
t.MemoryManager().Deactivate()
t.mu.Lock()
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index 17f08729a..ec95f78d0 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -17,6 +17,7 @@ package kernel
import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
@@ -206,8 +207,17 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
// (filesystem UIDs aren't implemented, nor are any of the capabilities in
// question)
- // Not documented, but compare Linux's kernel/cred.c:commit_creds().
if oldE != newE {
+ // "[dumpability] is reset to the current value contained in
+ // the file /proc/sys/fs/suid_dumpable (which by default has
+ // the value 0), in the following circumstances: The process's
+ // effective user or group ID is changed." - prctl(2)
+ //
+ // (suid_dumpable isn't implemented, so we just use the
+		// default.)
+ t.MemoryManager().SetDumpability(mm.NotDumpable)
+
+ // Not documented, but compare Linux's kernel/cred.c:commit_creds().
t.parentDeathSignal = 0
}
}
@@ -303,8 +313,18 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
t.creds = t.creds.Fork() // See doc for creds.
t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS
- // Not documented, but compare Linux's kernel/cred.c:commit_creds().
if oldE != newE {
+ // "[dumpability] is reset to the current value contained in
+ // the file /proc/sys/fs/suid_dumpable (which by default has
+ // the value 0), in the following circumstances: The process's
+ // effective user or group ID is changed." - prctl(2)
+ //
+ // (suid_dumpable isn't implemented, so we just use the
+		// default.)
+ t.MemoryManager().SetDumpability(mm.NotDumpable)
+
+ // Not documented, but compare Linux's
+ // kernel/cred.c:commit_creds().
t.parentDeathSignal = 0
}
}
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 5455f6ea9..1c94ab11b 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -622,14 +622,14 @@ func (t *Task) SetNiceness(n int) {
}
// NumaPolicy returns t's current numa policy.
-func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) {
+func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) {
t.mu.Lock()
defer t.mu.Unlock()
return t.numaPolicy, t.numaNodeMask
}
// SetNumaPolicy sets t's numa policy.
-func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) {
+func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) {
t.mu.Lock()
defer t.mu.Unlock()
t.numaPolicy = policy
diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD
deleted file mode 100644
index 68b03d4cc..000000000
--- a/pkg/sentry/memutil/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
- name = "memutil",
- srcs = [
- "memutil.go",
- "memutil_unsafe.go",
- ],
- importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memutil",
- visibility = ["//pkg/sentry:internal"],
- deps = ["@org_golang_x_sys//unix:go_default_library"],
-)
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 7a65a62a2..7646d5ab2 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -37,6 +37,7 @@ func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *Memo
privateRefs: &privateRefs{},
users: 1,
auxv: arch.Auxv{},
+ dumpability: UserDumpable,
aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
}
}
@@ -79,8 +80,9 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
envv: mm.envv,
auxv: append(arch.Auxv(nil), mm.auxv...),
// IncRef'd below, once we know that there isn't an error.
- executable: mm.executable,
- aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ executable: mm.executable,
+ dumpability: mm.dumpability,
+ aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
}
// Copy vmas.
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index 9768e51f1..c218006ee 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -20,6 +20,36 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
+// Dumpability describes if and how core dumps should be created.
+type Dumpability int
+
+const (
+ // NotDumpable indicates that core dumps should never be created.
+ NotDumpable Dumpability = iota
+
+ // UserDumpable indicates that core dumps should be created, owned by
+ // the current user.
+ UserDumpable
+
+ // RootDumpable indicates that core dumps should be created, owned by
+ // root.
+ RootDumpable
+)
+
+// Dumpability returns the dumpability.
+func (mm *MemoryManager) Dumpability() Dumpability {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.dumpability
+}
+
+// SetDumpability sets the dumpability.
+func (mm *MemoryManager) SetDumpability(d Dumpability) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.dumpability = d
+}
+
// ArgvStart returns the start of the application argument vector.
//
// There is no guarantee that this value is sensible w.r.t. ArgvEnd.
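
Taken together with the task_exec.go, task_identity.go, and ptrace.go hunks earlier in this diff, the dumpability lifecycle reduces to the following standalone sketch (stand-in types only, not the mm or kernel packages): fresh and exec'd memory managers are user-dumpable, a change of effective UID/GID drops them to not-dumpable (suid_dumpable's default), and tracing of non-user-dumpable targets is then refused.

package main

import "fmt"

type Dumpability int

const (
	NotDumpable Dumpability = iota
	UserDumpable
	RootDumpable
)

type memoryManager struct{ dumpability Dumpability }

func newMM() *memoryManager { return &memoryManager{dumpability: UserDumpable} }

func (mm *memoryManager) onExec()            { mm.dumpability = UserDumpable } // fs/exec.c:setup_new_exec
func (mm *memoryManager) onSetEffectiveIDs() { mm.dumpability = NotDumpable }  // prctl(2) reset rule

// canTrace mirrors the new dumpability check in CanTrace: an otherwise
// unprivileged tracer may not attach to a non-user-dumpable target.
func canTrace(target *memoryManager) bool {
	return target.dumpability == UserDumpable
}

func main() {
	mm := newMM()
	fmt.Println(canTrace(mm)) // true
	mm.onSetEffectiveIDs()
	fmt.Println(canTrace(mm)) // false
	mm.onExec()
	fmt.Println(canTrace(mm)) // true
}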
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index eb6defa2b..604866d04 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -219,6 +219,12 @@ type MemoryManager struct {
// executable is protected by metadataMu.
executable *fs.Dirent
+ // dumpability describes if and how this MemoryManager may be dumped to
+ // userspace.
+ //
+ // dumpability is protected by metadataMu.
+ dumpability Dumpability
+
// aioManager keeps track of AIOContexts used for async IOs. AIOManager
// must be cloned when CLONE_VM is used.
aioManager aioManager
@@ -270,6 +276,12 @@ type vma struct {
mlockMode memmap.MLockMode
+ // numaPolicy is the NUMA policy for this vma set by mbind().
+ numaPolicy int32
+
+ // numaNodemask is the NUMA nodemask for this vma set by mbind().
+ numaNodemask uint64
+
// If id is not nil, it controls the lifecycle of mappable and provides vma
// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
id memmap.MappingIdentity
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index 0368c6794..9cf136532 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -470,6 +470,16 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
return 0, syserror.EINVAL
}
+ // Check that the new region is valid.
+ _, err := mm.findAvailableLocked(newSize, findAvailableOpts{
+ Addr: newAddr,
+ Fixed: true,
+ Unmap: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
// Unmap any mappings at the destination.
mm.unmapLocked(ctx, newAR)
@@ -963,6 +973,59 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error
return nil
}
+// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ vseg := mm.vmas.FindSegment(addr)
+ if !vseg.Ok() {
+ return 0, 0, syserror.EFAULT
+ }
+ vma := vseg.ValuePtr()
+ return vma.numaPolicy, vma.numaNodemask, nil
+}
+
+// SetNumaPolicy implements the semantics of Linux's mbind().
+func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error {
+ if !addr.IsPageAligned() {
+ return syserror.EINVAL
+ }
+ // Linux allows this to overflow.
+ la, _ := usermem.Addr(length).RoundUp()
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+ if ar.Length() == 0 {
+ return nil
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ }()
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ lastEnd := ar.Start
+ for {
+ if !vseg.Ok() || lastEnd < vseg.Start() {
+ // "EFAULT: ... there was an unmapped hole in the specified memory
+ // range specified [sic] by addr and len." - mbind(2)
+ return syserror.EFAULT
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ vma.numaPolicy = policy
+ vma.numaNodemask = nodemask
+ lastEnd = vseg.End()
+ if ar.End <= lastEnd {
+ return nil
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ }
+}
+
// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
ar, ok := addr.ToRange(length)
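
A standalone sketch of the hole-detection loop in SetNumaPolicy above, using a plain sorted slice instead of the sentry's vma set; the helper names are invented.

package main

import (
	"errors"
	"fmt"
)

type addrRange struct{ start, end uint64 }

var errFault = errors.New("EFAULT: unmapped hole in the requested range")

// applyPolicy applies fn to every mapped range overlapping [start, end),
// requiring the mappings to cover the request contiguously, as mbind(2)
// demands.
func applyPolicy(vmas []addrRange, start, end uint64, fn func(addrRange)) error {
	lastEnd := start
	for _, v := range vmas {
		if v.end <= start || v.start >= end {
			continue // no overlap with the request
		}
		if v.start > lastEnd {
			return errFault // hole before this vma
		}
		fn(v)
		lastEnd = v.end
		if end <= lastEnd {
			return nil
		}
	}
	return errFault // ran out of vmas before covering the range
}

func main() {
	vmas := []addrRange{{0x1000, 0x3000}, {0x3000, 0x5000}, {0x6000, 0x8000}}
	fmt.Println(applyPolicy(vmas, 0x2000, 0x4000, func(addrRange) {})) // <nil>
	fmt.Println(applyPolicy(vmas, 0x4000, 0x7000, func(addrRange) {})) // EFAULT (hole at 0x5000..0x6000)
}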
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 02203f79f..0af8de5b0 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -107,6 +107,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
private: opts.Private,
growsDown: opts.GrowsDown,
mlockMode: opts.MLockMode,
+ numaPolicy: linux.MPOL_DEFAULT,
id: opts.MappingIdentity,
hint: opts.Hint,
}
@@ -436,6 +437,8 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa
vma1.private != vma2.private ||
vma1.growsDown != vma2.growsDown ||
vma1.mlockMode != vma2.mlockMode ||
+ vma1.numaPolicy != vma2.numaPolicy ||
+ vma1.numaNodemask != vma2.numaNodemask ||
vma1.id != vma2.id ||
vma1.hint != vma2.hint {
return vma{}, false
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 8a8a0e4e4..ca2d5ba6f 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -63,9 +63,10 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/log",
+ "//pkg/memutil",
"//pkg/sentry/arch",
"//pkg/sentry/context",
- "//pkg/sentry/memutil",
+ "//pkg/sentry/hostmm",
"//pkg/sentry/platform",
"//pkg/sentry/safemem",
"//pkg/sentry/usage",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 2b9924ad7..6d91f1a7b 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -32,6 +32,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
@@ -162,6 +163,11 @@ type MemoryFile struct {
// evictionWG counts the number of goroutines currently performing evictions.
evictionWG sync.WaitGroup
+
+ // stopNotifyPressure stops memory cgroup pressure level
+ // notifications used to drive eviction. stopNotifyPressure is
+ // immutable.
+ stopNotifyPressure func()
}
// MemoryFileOpts provides options to NewMemoryFile.
@@ -169,6 +175,11 @@ type MemoryFileOpts struct {
// DelayedEviction controls the extent to which the MemoryFile may delay
// eviction of evictable allocations.
DelayedEviction DelayedEvictionType
+
+ // If UseHostMemcgPressure is true, use host memory cgroup pressure level
+ // notifications to determine when eviction is necessary. This option has
+ // no effect unless DelayedEviction is DelayedEvictionEnabled.
+ UseHostMemcgPressure bool
}
// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
@@ -186,9 +197,14 @@ const (
// evictable allocations until doing so is considered necessary to avoid
// performance degradation due to host memory pressure, or OOM kills.
//
- // As of this writing, DelayedEvictionEnabled delays evictions until the
- // reclaimer goroutine is out of work (pages to reclaim), then evicts all
- // pending evictable allocations immediately.
+ // As of this writing, the behavior of DelayedEvictionEnabled depends on
+ // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
+ //
+ // - If UseHostMemcgPressure is true, evictions are delayed until memory
+ // pressure is indicated.
+ //
+ // - Otherwise, evictions are only delayed until the reclaimer goroutine
+ // is out of work (pages to reclaim).
DelayedEvictionEnabled
// DelayedEvictionManual requires that evictable allocations are only
@@ -292,6 +308,22 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
}
f.mappings.Store(make([]uintptr, initialSize/chunkSize))
f.reclaimCond.L = &f.mu
+
+ if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
+ stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
+ f.mu.Lock()
+ startedAny := f.startEvictionsLocked()
+ f.mu.Unlock()
+ if startedAny {
+ log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
+ }
+ }, "low")
+ if err != nil {
+ return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
+ }
+ f.stopNotifyPressure = stop
+ }
+
go f.runReclaim() // S/R-SAFE: f.mu
// The Linux kernel contains an optional feature called "Integrity
@@ -692,9 +724,11 @@ func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange)
// Kick off eviction immediately.
f.startEvictionGoroutineLocked(user, info)
case DelayedEvictionEnabled:
- // Ensure that the reclaimer goroutine is running, so that it can
- // start eviction when necessary.
- f.reclaimCond.Signal()
+ if !f.opts.UseHostMemcgPressure {
+ // Ensure that the reclaimer goroutine is running, so that it
+ // can start eviction when necessary.
+ f.reclaimCond.Signal()
+ }
}
}
}
@@ -992,11 +1026,12 @@ func (f *MemoryFile) runReclaim() {
}
f.markReclaimed(fr)
}
+
// We only get here if findReclaimable finds f.destroyed set and returns
// false.
f.mu.Lock()
- defer f.mu.Unlock()
if !f.destroyed {
+ f.mu.Unlock()
panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
}
f.file.Close()
@@ -1016,6 +1051,13 @@ func (f *MemoryFile) runReclaim() {
}
// Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
f.mappings.Store([]uintptr{})
+ f.mu.Unlock()
+
+ // This must be called without holding f.mu to avoid circular lock
+ // ordering.
+ if f.stopNotifyPressure != nil {
+ f.stopNotifyPressure()
+ }
}
func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
@@ -1029,7 +1071,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
if f.reclaimable {
break
}
- if f.opts.DelayedEviction == DelayedEvictionEnabled {
+ if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
// No work to do. Evict any pending evictable allocations to
// get more reclaimable pages before going to sleep.
f.startEvictionsLocked()
@@ -1089,14 +1131,17 @@ func (f *MemoryFile) StartEvictions() {
}
// Preconditions: f.mu must be locked.
-func (f *MemoryFile) startEvictionsLocked() {
+func (f *MemoryFile) startEvictionsLocked() bool {
+ startedAny := false
for user, info := range f.evictable {
// Don't start multiple goroutines to evict the same user's
// allocations.
if !info.evicting {
f.startEvictionGoroutineLocked(user, info)
+ startedAny = true
}
}
+ return startedAny
}
// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be
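
A hypothetical construction sketch for the new eviction option: with DelayedEvictionEnabled plus UseHostMemcgPressure, the MemoryFile registers for host memcg pressure notifications and defers eviction until pressure is reported. The standalone main and the temp-file backing are illustrative assumptions (real callers live inside the sentry), and NewMemoryFile will fail here if no cgroup-v1 memory controller is available.

package main

import (
	"log"
	"os"

	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

func main() {
	// NewMemoryFile (signature per this diff) needs a host file to back
	// application memory; a deleted temp file stands in for it here.
	backing, err := os.CreateTemp("", "memory-file-")
	if err != nil {
		log.Fatal(err)
	}
	os.Remove(backing.Name()) // keep only the open descriptor

	mf, err := pgalloc.NewMemoryFile(backing, pgalloc.MemoryFileOpts{
		DelayedEviction:      pgalloc.DelayedEvictionEnabled,
		UseHostMemcgPressure: true, // no effect unless DelayedEviction is Enabled
	})
	if err != nil {
		log.Fatalf("creating MemoryFile: %v", err)
	}
	_ = mf // hand mf to the rest of the sentry as its memory file
}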
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 9999e58f4..2931d6ddc 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -32,10 +32,10 @@ go_library(
"//pkg/atomicbitops",
"//pkg/cpuid",
"//pkg/log",
+ "//pkg/procid",
"//pkg/sentry/arch",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
- "//pkg/sentry/platform/procid",
"//pkg/sentry/platform/ring0",
"//pkg/sentry/platform/ring0/pagetables",
"//pkg/sentry/platform/safecopy",
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f5953b96e..f8ccd86af 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -23,7 +23,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/atomicbitops"
"gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+ "gvisor.googlesource.com/gvisor/pkg/procid"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index e9e4a0d16..434d003a3 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -20,11 +20,11 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/log",
+ "//pkg/procid",
"//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
- "//pkg/sentry/platform/procid",
"//pkg/sentry/platform/safecopy",
"//pkg/sentry/usermem",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 83b43057f..d7800a55e 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -21,9 +21,10 @@ import (
"sync"
"syscall"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/procid"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
@@ -300,6 +301,18 @@ const (
killed
)
+func (t *thread) dumpAndPanic(message string) {
+ var regs syscall.PtraceRegs
+ message += "\n"
+ if err := t.getRegs(&regs); err == nil {
+ message += dumpRegs(&regs)
+ } else {
+ log.Warningf("unable to get registers: %v", err)
+ }
+ message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
+ panic(message)
+}
+
// wait waits for a stop event.
//
// Precondition: outcome is a valid waitOutcome.
@@ -320,7 +333,7 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal {
switch outcome {
case stopped:
if !status.Stopped() {
- panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
+ t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
}
stopSig := status.StopSignal()
if stopSig == 0 {
@@ -334,12 +347,12 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal {
return stopSig
case killed:
if !status.Exited() && !status.Signaled() {
- panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
+ t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
}
return syscall.Signal(status.ExitStatus())
default:
// Should not happen.
- panic(fmt.Sprintf("unknown outcome: %v", outcome))
+ t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
}
}
}
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 77a0e908f..fdd21c8f8 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -17,6 +17,8 @@
package ptrace
import (
+ "fmt"
+ "strings"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
@@ -102,3 +104,38 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
}
return uintptr(rval), nil
}
+
+func dumpRegs(regs *syscall.PtraceRegs) string {
+ var m strings.Builder
+
+ fmt.Fprintf(&m, "Registers:\n")
+ fmt.Fprintf(&m, "\tR15\t = %016x\n", regs.R15)
+ fmt.Fprintf(&m, "\tR14\t = %016x\n", regs.R14)
+ fmt.Fprintf(&m, "\tR13\t = %016x\n", regs.R13)
+ fmt.Fprintf(&m, "\tR12\t = %016x\n", regs.R12)
+ fmt.Fprintf(&m, "\tRbp\t = %016x\n", regs.Rbp)
+ fmt.Fprintf(&m, "\tRbx\t = %016x\n", regs.Rbx)
+ fmt.Fprintf(&m, "\tR11\t = %016x\n", regs.R11)
+ fmt.Fprintf(&m, "\tR10\t = %016x\n", regs.R10)
+ fmt.Fprintf(&m, "\tR9\t = %016x\n", regs.R9)
+ fmt.Fprintf(&m, "\tR8\t = %016x\n", regs.R8)
+ fmt.Fprintf(&m, "\tRax\t = %016x\n", regs.Rax)
+ fmt.Fprintf(&m, "\tRcx\t = %016x\n", regs.Rcx)
+ fmt.Fprintf(&m, "\tRdx\t = %016x\n", regs.Rdx)
+ fmt.Fprintf(&m, "\tRsi\t = %016x\n", regs.Rsi)
+ fmt.Fprintf(&m, "\tRdi\t = %016x\n", regs.Rdi)
+ fmt.Fprintf(&m, "\tOrig_rax = %016x\n", regs.Orig_rax)
+ fmt.Fprintf(&m, "\tRip\t = %016x\n", regs.Rip)
+ fmt.Fprintf(&m, "\tCs\t = %016x\n", regs.Cs)
+ fmt.Fprintf(&m, "\tEflags\t = %016x\n", regs.Eflags)
+ fmt.Fprintf(&m, "\tRsp\t = %016x\n", regs.Rsp)
+ fmt.Fprintf(&m, "\tSs\t = %016x\n", regs.Ss)
+ fmt.Fprintf(&m, "\tFs_base\t = %016x\n", regs.Fs_base)
+ fmt.Fprintf(&m, "\tGs_base\t = %016x\n", regs.Gs_base)
+ fmt.Fprintf(&m, "\tDs\t = %016x\n", regs.Ds)
+ fmt.Fprintf(&m, "\tEs\t = %016x\n", regs.Es)
+ fmt.Fprintf(&m, "\tFs\t = %016x\n", regs.Fs)
+ fmt.Fprintf(&m, "\tGs\t = %016x\n", regs.Gs)
+
+ return m.String()
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 2c07b4ac3..914be7486 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -22,9 +22,9 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/procid"
"gvisor.googlesource.com/gvisor/pkg/seccomp"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
)
const syscallEvent syscall.Signal = 0x80
@@ -142,7 +142,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
// down available calls only to what is needed.
rules := []seccomp.RuleSet{
// Rules for trapping vsyscall access.
- seccomp.RuleSet{
+ {
Rules: seccomp.SyscallRules{
syscall.SYS_GETTIMEOFDAY: {},
syscall.SYS_TIME: {},
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index c0238691d..434d7ca2e 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -406,12 +406,20 @@ func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials {
return nil
}
if cr, ok := socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) {
- tcred := t.Credentials()
- return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID}
+ return MakeCreds(t)
}
return nil
}
+// MakeCreds creates default SCMCredentials.
+func MakeCreds(t *kernel.Task) SCMCredentials {
+ if t == nil {
+ return nil
+ }
+ tcred := t.Credentials()
+ return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID}
+}
+
// New creates default control messages if needed.
func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages {
return transport.ControlMessages{
diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD
index 44bb97b5b..7e2679ea0 100644
--- a/pkg/sentry/socket/epsocket/BUILD
+++ b/pkg/sentry/socket/epsocket/BUILD
@@ -32,7 +32,6 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
- "//pkg/sentry/socket/unix/transport",
"//pkg/sentry/unimpl",
"//pkg/sentry/usermem",
"//pkg/syserr",
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index de4b963da..a50798cb3 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -44,7 +44,6 @@ import (
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserr"
@@ -52,6 +51,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
"gvisor.googlesource.com/gvisor/pkg/waiter"
)
@@ -227,7 +227,8 @@ type SocketOperations struct {
family int
Endpoint tcpip.Endpoint
- skType transport.SockType
+ skType linux.SockType
+ protocol int
// readMu protects access to the below fields.
readMu sync.Mutex `state:"nosave"`
@@ -252,8 +253,8 @@ type SocketOperations struct {
}
// New creates a new endpoint socket.
-func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
- if skType == transport.SockStream {
+func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
+ if skType == linux.SOCK_STREAM {
if err := endpoint.SetSockOpt(tcpip.DelayOption(1)); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
@@ -266,6 +267,7 @@ func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Qu
family: family,
Endpoint: endpoint,
skType: skType,
+ protocol: protocol,
}), nil
}
@@ -550,7 +552,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
}
- ns, err := New(t, s.family, s.skType, wq, ep)
+ ns, err := New(t, s.family, s.skType, s.protocol, wq, ep)
if err != nil {
return 0, nil, 0, err
}
@@ -578,7 +580,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits())
- t.Kernel().RecordSocket(ns, s.family)
+ t.Kernel().RecordSocket(ns)
return fd, addr, addrLen, syserr.FromError(e)
}
@@ -637,7 +639,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (
// GetSockOpt can be used to implement the linux syscall getsockopt(2) for
// sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -663,7 +665,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int,
}
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
-func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
switch name {
case linux.SO_TYPE:
@@ -918,6 +920,30 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
t.Kernel().EmitUnimplementedEvent(t)
+ case linux.TCP_CONGESTION:
+ if outLen <= 0 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var v tcpip.CongestionControlOption
+ if err := ep.GetSockOpt(&v); err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ // We match Linux behaviour here: return the lower of
+ // TCP_CA_NAME_MAX bytes or the option length.
+ //
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const tcpCANameMax = 16
+
+ toCopy := tcpCANameMax
+ if outLen < tcpCANameMax {
+ toCopy = outLen
+ }
+ b := make([]byte, toCopy)
+ copy(b, v)
+ return b, nil
+
default:
emitUnimplementedEventTCP(t, name)
}
@@ -1220,6 +1246,12 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
+ case linux.TCP_CONGESTION:
+ v := tcpip.CongestionControlOption(optVal)
+ if err := ep.SetSockOpt(v); err != nil {
+ return syserr.TranslateNetstackError(err)
+ }
+ return nil
case linux.TCP_REPAIR_OPTIONS:
t.Kernel().EmitUnimplementedEvent(t)
@@ -2281,3 +2313,51 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
}
return rv
}
+
+// State implements socket.Socket.State. State translates the internal state
+// returned by netstack to values defined by Linux.
+func (s *SocketOperations) State() uint32 {
+ if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
+ // States not implemented for this socket's family.
+ return 0
+ }
+
+ if !s.isPacketBased() {
+ // TCP socket.
+ switch tcp.EndpointState(s.Endpoint.State()) {
+ case tcp.StateEstablished:
+ return linux.TCP_ESTABLISHED
+ case tcp.StateSynSent:
+ return linux.TCP_SYN_SENT
+ case tcp.StateSynRecv:
+ return linux.TCP_SYN_RECV
+ case tcp.StateFinWait1:
+ return linux.TCP_FIN_WAIT1
+ case tcp.StateFinWait2:
+ return linux.TCP_FIN_WAIT2
+ case tcp.StateTimeWait:
+ return linux.TCP_TIME_WAIT
+ case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
+ return linux.TCP_CLOSE
+ case tcp.StateCloseWait:
+ return linux.TCP_CLOSE_WAIT
+ case tcp.StateLastAck:
+ return linux.TCP_LAST_ACK
+ case tcp.StateListen:
+ return linux.TCP_LISTEN
+ case tcp.StateClosing:
+ return linux.TCP_CLOSING
+ default:
+ // Internal or unknown state.
+ return 0
+ }
+ }
+
+ // TODO(b/112063468): Export states for UDP, ICMP, and raw sockets.
+ return 0
+}
+
+// Type implements socket.Socket.Type.
+func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
+ return s.family, s.skType, s.protocol
+}
diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go
index ec930d8d5..516582828 100644
--- a/pkg/sentry/socket/epsocket/provider.go
+++ b/pkg/sentry/socket/epsocket/provider.go
@@ -23,7 +23,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
@@ -42,7 +41,7 @@ type provider struct {
// getTransportProtocol figures out transport protocol. Currently only TCP,
// UDP, and ICMP are supported.
-func getTransportProtocol(ctx context.Context, stype transport.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) {
+func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) {
switch stype {
case linux.SOCK_STREAM:
if protocol != 0 && protocol != syscall.IPPROTO_TCP {
@@ -80,7 +79,7 @@ func getTransportProtocol(ctx context.Context, stype transport.SockType, protoco
}
// Socket creates a new socket object for the AF_INET or AF_INET6 family.
-func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Fail right away if we don't have a stack.
stack := t.NetworkContext()
if stack == nil {
@@ -112,11 +111,11 @@ func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int
return nil, syserr.TranslateNetstackError(e)
}
- return New(t, p.family, stype, wq, ep)
+ return New(t, p.family, stype, protocol, wq, ep)
}
// Pair just returns nil sockets (not supported).
-func (*provider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
+func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
return nil, nil, nil
}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index a469af7ac..975f47bc3 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -30,7 +30,6 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
- "//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usermem",
"//pkg/syserr",
"//pkg/syserror",
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 41f9693bb..c62c8d8f1 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -19,7 +19,9 @@ import (
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/binary"
"gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+ "gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
@@ -28,7 +30,6 @@ import (
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/syserror"
@@ -55,15 +56,22 @@ type socketOperations struct {
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
socket.SendReceiveTimeout
- family int // Read-only.
- fd int // must be O_NONBLOCK
- queue waiter.Queue
+ family int // Read-only.
+ stype linux.SockType // Read-only.
+ protocol int // Read-only.
+ fd int // must be O_NONBLOCK
+ queue waiter.Queue
}
var _ = socket.Socket(&socketOperations{})
-func newSocketFile(ctx context.Context, family int, fd int, nonblock bool) (*fs.File, *syserr.Error) {
- s := &socketOperations{family: family, fd: fd}
+func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) {
+ s := &socketOperations{
+ family: family,
+ stype: stype,
+ protocol: protocol,
+ fd: fd,
+ }
if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
return nil, syserr.FromError(err)
}
@@ -221,7 +229,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr)
}
- f, err := newSocketFile(t, s.family, fd, flags&syscall.SOCK_NONBLOCK != 0)
+ f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0)
if err != nil {
syscall.Close(fd)
return 0, nil, 0, err
@@ -232,7 +240,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
}
kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits())
- t.Kernel().RecordSocket(f, s.family)
+ t.Kernel().RecordSocket(f)
return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr)
}
@@ -519,12 +527,39 @@ func translateIOSyscallError(err error) error {
return err
}
+// State implements socket.Socket.State.
+func (s *socketOperations) State() uint32 {
+ info := linux.TCPInfo{}
+ buf, err := getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, linux.SizeOfTCPInfo)
+ if err != nil {
+ if err != syscall.ENOPROTOOPT {
+ log.Warningf("Failed to get TCP socket info from %+v: %v", s, err)
+ }
+ // For non-TCP sockets, silently ignore the failure.
+ return 0
+ }
+ if len(buf) != linux.SizeOfTCPInfo {
+ // Unmarshal below will panic if getsockopt returns a buffer of
+ // unexpected size.
+ log.Warningf("Failed to get TCP socket info from %+v: getsockopt(2) returned %d bytes, expecting %d bytes.", s, len(buf), linux.SizeOfTCPInfo)
+ return 0
+ }
+
+ binary.Unmarshal(buf, usermem.ByteOrder, &info)
+ return uint32(info.State)
+}
+
+// Type implements socket.Socket.Type.
+func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) {
+ return s.family, s.stype, s.protocol
+}
+
type socketProvider struct {
family int
}
// Socket implements socket.Provider.Socket.
-func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Check that we are using the host network stack.
stack := t.NetworkContext()
if stack == nil {
@@ -535,7 +570,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p
}
// Only accept TCP and UDP.
- stype := int(stypeflags) & linux.SOCK_TYPE_MASK
+ stype := stypeflags & linux.SOCK_TYPE_MASK
switch stype {
case syscall.SOCK_STREAM:
switch protocol {
@@ -558,15 +593,15 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p
// Conservatively ignore all flags specified by the application and add
// SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0
// to simplify the syscall filters, since 0 and IPPROTO_* are equivalent.
- fd, err := syscall.Socket(p.family, stype|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return nil, syserr.FromError(err)
}
- return newSocketFile(t, p.family, fd, stypeflags&syscall.SOCK_NONBLOCK != 0)
+ return newSocketFile(t, p.family, stype, protocol, fd, stypeflags&syscall.SOCK_NONBLOCK != 0)
}
// Pair implements socket.Provider.Pair.
-func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
+func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
// Not supported by AF_INET/AF_INET6.
return nil, nil, nil
}
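The hostinet State implementation depends on validating the getsockopt(TCP_INFO) buffer length before unmarshalling. Since Linux's struct tcp_info begins with the one-byte tcpi_state field, the state of a valid buffer is simply its first byte. A simplified, self-contained sketch of that check (the size constant is an assumption for the sketch; the real code uses linux.SizeOfTCPInfo):

package main

import "fmt"

const sizeOfTCPInfo = 104 // assumed struct tcp_info size for this sketch

// tcpStateFromBuf returns the TCP state from a raw TCP_INFO buffer, or false
// when the buffer is not the expected size (e.g. a truncated getsockopt result).
func tcpStateFromBuf(buf []byte) (uint32, bool) {
	if len(buf) != sizeOfTCPInfo {
		return 0, false
	}
	return uint32(buf[0]), true // tcpi_state is the first field of struct tcp_info
}

func main() {
	buf := make([]byte, sizeOfTCPInfo)
	buf[0] = 1 // TCP_ESTABLISHED
	fmt.Println(tcpStateFromBuf(buf))
}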
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 76cf12fd4..5dc103877 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -22,7 +22,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/syserr"
)
@@ -66,10 +65,10 @@ type socketProvider struct {
}
// Socket implements socket.Provider.Socket.
-func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Netlink sockets must be specified as datagram or raw, but they
// behave the same regardless of type.
- if stype != transport.SockDgram && stype != transport.SockRaw {
+ if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW {
return nil, syserr.ErrSocketNotSupported
}
@@ -83,7 +82,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol
return nil, err
}
- s, err := NewSocket(t, p)
+ s, err := NewSocket(t, stype, p)
if err != nil {
return nil, err
}
@@ -94,7 +93,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol
}
// Pair implements socket.Provider.Pair by returning an error.
-func (*socketProvider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
+func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
// Netlink sockets never support creating socket pairs.
return nil, nil, syserr.ErrNotSupported
}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index afd06ca33..62659784a 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -80,6 +80,10 @@ type Socket struct {
// protocol is the netlink protocol implementation.
protocol Protocol
+ // skType is the socket type. This is either SOCK_DGRAM or SOCK_RAW for
+ // netlink sockets.
+ skType linux.SockType
+
// ep is a datagram unix endpoint used to buffer messages sent from the
// kernel to userspace. RecvMsg reads messages from this endpoint.
ep transport.Endpoint
@@ -105,7 +109,7 @@ type Socket struct {
var _ socket.Socket = (*Socket)(nil)
// NewSocket creates a new Socket.
-func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) {
+func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) {
// Datagram endpoint used to buffer kernel -> user messages.
ep := transport.NewConnectionless()
@@ -126,6 +130,7 @@ func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) {
return &Socket{
ports: t.Kernel().NetlinkPorts(),
protocol: protocol,
+ skType: skType,
ep: ep,
connection: connection,
sendBufferSize: defaultSendBufferSize,
@@ -616,3 +621,13 @@ func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence,
n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{})
return int64(n), err.ToError()
}
+
+// State implements socket.Socket.State.
+func (s *Socket) State() uint32 {
+ return s.ep.State()
+}
+
+// Type implements socket.Socket.Type.
+func (s *Socket) Type() (family int, skType linux.SockType, protocol int) {
+ return linux.AF_NETLINK, s.skType, s.protocol.Protocol()
+}
diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD
index 4da14a1e0..33ba20de7 100644
--- a/pkg/sentry/socket/rpcinet/BUILD
+++ b/pkg/sentry/socket/rpcinet/BUILD
@@ -31,7 +31,6 @@ go_library(
"//pkg/sentry/socket/hostinet",
"//pkg/sentry/socket/rpcinet/conn",
"//pkg/sentry/socket/rpcinet/notifier",
- "//pkg/sentry/socket/unix/transport",
"//pkg/sentry/unimpl",
"//pkg/sentry/usermem",
"//pkg/syserr",
diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go
index 55e0b6665..c22ff1ff0 100644
--- a/pkg/sentry/socket/rpcinet/socket.go
+++ b/pkg/sentry/socket/rpcinet/socket.go
@@ -32,7 +32,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier"
pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserr"
@@ -54,7 +53,10 @@ type socketOperations struct {
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
socket.SendReceiveTimeout
- family int // Read-only.
+ family int // Read-only.
+ stype linux.SockType // Read-only.
+ protocol int // Read-only.
+
fd uint32 // must be O_NONBLOCK
wq *waiter.Queue
rpcConn *conn.RPCConnection
@@ -70,7 +72,7 @@ type socketOperations struct {
var _ = socket.Socket(&socketOperations{})
// New creates a new RPC socket.
-func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, protocol int) (*fs.File, *syserr.Error) {
+func newSocketFile(ctx context.Context, stack *Stack, family int, skType linux.SockType, protocol int) (*fs.File, *syserr.Error) {
id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */)
<-c
@@ -87,6 +89,8 @@ func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, pr
defer dirent.DecRef()
return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{
family: family,
+ stype: skType,
+ protocol: protocol,
wq: &wq,
fd: fd,
rpcConn: stack.rpcConn,
@@ -333,7 +337,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
if err != nil {
return 0, nil, 0, syserr.FromError(err)
}
- t.Kernel().RecordSocket(file, s.family)
+ t.Kernel().RecordSocket(file)
if peerRequested {
return fd, payload.Address.Address, payload.Address.Length, nil
@@ -830,12 +834,23 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
}
}
+// State implements socket.Socket.State.
+func (s *socketOperations) State() uint32 {
+ // TODO(b/127845868): Define a new rpc to query the socket state.
+ return 0
+}
+
+// Type implements socket.Socket.Type.
+func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) {
+ return s.family, s.stype, s.protocol
+}
+
type socketProvider struct {
family int
}
// Socket implements socket.Provider.Socket.
-func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Check that we are using the RPC network stack.
stack := t.NetworkContext()
if stack == nil {
@@ -851,7 +866,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p
//
// Try to restrict the flags we will accept to minimize backwards
// incompatibility with netstack.
- stype := int(stypeflags) & linux.SOCK_TYPE_MASK
+ stype := stypeflags & linux.SOCK_TYPE_MASK
switch stype {
case syscall.SOCK_STREAM:
switch protocol {
@@ -871,11 +886,11 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p
return nil, nil
}
- return newSocketFile(t, s, p.family, stype, 0)
+ return newSocketFile(t, s, p.family, stype, protocol)
}
// Pair implements socket.Provider.Pair.
-func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
+func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
// Not supported by AF_INET/AF_INET6.
return nil, nil, nil
}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 9393acd28..d60944b6b 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -116,6 +116,13 @@ type Socket interface {
// SendTimeout gets the current timeout (in ns) for send operations. Zero
// means no timeout, and negative means DONTWAIT.
SendTimeout() int64
+
+ // State returns the current state of the socket, as represented by Linux in
+ // procfs. The returned state value is protocol-specific.
+ State() uint32
+
+ // Type returns the family, socket type and protocol of the socket.
+ Type() (family int, skType linux.SockType, protocol int)
}
// Provider is the interface implemented by providers of sockets for specific
@@ -126,12 +133,12 @@ type Provider interface {
// If a nil Socket _and_ a nil error is returned, it means that the
// protocol is not supported. A non-nil error should only be returned
// if the protocol is supported, but an error occurs during creation.
- Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error)
+ Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error)
// Pair creates a pair of connected sockets.
//
// See Socket for error information.
- Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error)
+ Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error)
}
// families holds a map of all known address families and their providers.
@@ -145,14 +152,14 @@ func RegisterProvider(family int, provider Provider) {
}
// New creates a new socket with the given family, type and protocol.
-func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func New(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
for _, p := range families[family] {
s, err := p.Socket(t, stype, protocol)
if err != nil {
return nil, err
}
if s != nil {
- t.Kernel().RecordSocket(s, family)
+ t.Kernel().RecordSocket(s)
return s, nil
}
}
@@ -162,7 +169,7 @@ func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*f
// Pair creates a new connected socket pair with the given family, type and
// protocol.
-func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
+func Pair(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
providers, ok := families[family]
if !ok {
return nil, nil, syserr.ErrAddressFamilyNotSupported
@@ -175,8 +182,8 @@ func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (*
}
if s1 != nil && s2 != nil {
k := t.Kernel()
- k.RecordSocket(s1, family)
- k.RecordSocket(s2, family)
+ k.RecordSocket(s1)
+ k.RecordSocket(s2)
return s1, s2, nil
}
}
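State and Type were added to socket.Socket so that code holding recorded sockets (ultimately the procfs net files) can classify them without knowing the provider-specific type. A hypothetical consumer, filtering for established IPv4 TCP sockets:

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
)

// establishedTCP is a sketch, not sentry code: it keeps only AF_INET stream
// sockets whose reported state is TCP_ESTABLISHED.
func establishedTCP(socks []socket.Socket) []socket.Socket {
	var out []socket.Socket
	for _, s := range socks {
		family, skType, _ := s.Type()
		if family != linux.AF_INET || skType != linux.SOCK_STREAM {
			continue
		}
		if s.State() == linux.TCP_ESTABLISHED {
			out = append(out, s)
		}
	}
	return out
}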
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 5a2de0c4c..52f324eed 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -28,6 +28,7 @@ go_library(
importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport",
visibility = ["//:sandbox"],
deps = [
+ "//pkg/abi/linux",
"//pkg/ilist",
"//pkg/refs",
"//pkg/syserr",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index 18e492862..db79ac904 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -17,6 +17,7 @@ package transport
import (
"sync"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/waiter"
@@ -44,7 +45,7 @@ type ConnectingEndpoint interface {
// Type returns the socket type, typically either SockStream or
// SockSeqpacket. The connection attempt must be aborted if this
// value doesn't match the ConnectableEndpoint's type.
- Type() SockType
+ Type() linux.SockType
// GetLocalAddress returns the bound path.
GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
@@ -100,7 +101,7 @@ type connectionedEndpoint struct {
// stype is used by connecting sockets to ensure that they are the
// same type. The value is typically either tcpip.SockSeqpacket or
// tcpip.SockStream.
- stype SockType
+ stype linux.SockType
// acceptedChan is per the TCP endpoint implementation. Note that the
// sockets in this channel are _already in the connected state_, and
@@ -111,7 +112,7 @@ type connectionedEndpoint struct {
}
// NewConnectioned creates a new unbound connectionedEndpoint.
-func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint {
+func NewConnectioned(stype linux.SockType, uid UniqueIDProvider) Endpoint {
return &connectionedEndpoint{
baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}},
id: uid.UniqueID(),
@@ -121,7 +122,7 @@ func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint {
}
// NewPair allocates a new pair of connected unix-domain connectionedEndpoints.
-func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
+func NewPair(stype linux.SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
a := &connectionedEndpoint{
baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}},
id: uid.UniqueID(),
@@ -138,7 +139,7 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit}
q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit}
- if stype == SockStream {
+ if stype == linux.SOCK_STREAM {
a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}}
b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}}
} else {
@@ -162,7 +163,7 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
// NewExternal creates a new externally backed Endpoint. It behaves like a
// socketpair.
-func NewExternal(stype SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint {
+func NewExternal(stype linux.SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint {
return &connectionedEndpoint{
baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected},
id: uid.UniqueID(),
@@ -177,7 +178,7 @@ func (e *connectionedEndpoint) ID() uint64 {
}
// Type implements ConnectingEndpoint.Type and Endpoint.Type.
-func (e *connectionedEndpoint) Type() SockType {
+func (e *connectionedEndpoint) Type() linux.SockType {
return e.stype
}
@@ -293,7 +294,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur
}
writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit}
- if e.stype == SockStream {
+ if e.stype == linux.SOCK_STREAM {
ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}}
} else {
ne.receiver = &queueReceiver{readQueue: writeQueue}
@@ -308,7 +309,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur
writeQueue: writeQueue,
}
readQueue.IncRef()
- if e.stype == SockStream {
+ if e.stype == linux.SOCK_STREAM {
returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected)
} else {
returnConnect(&queueReceiver{readQueue: readQueue}, connected)
@@ -428,7 +429,7 @@ func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syser
func (e *connectionedEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) {
// Stream sockets do not support specifying the endpoint. Seqpacket
// sockets ignore the passed endpoint.
- if e.stype == SockStream && to != nil {
+ if e.stype == linux.SOCK_STREAM && to != nil {
return 0, syserr.ErrNotSupported
}
return e.baseEndpoint.SendMsg(data, c, to)
@@ -458,3 +459,11 @@ func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask
return ready
}
+
+// State implements socket.Socket.State.
+func (e *connectionedEndpoint) State() uint32 {
+ if e.Connected() {
+ return linux.SS_CONNECTED
+ }
+ return linux.SS_UNCONNECTED
+}
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 43ff875e4..81ebfba10 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -15,6 +15,7 @@
package transport
import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/waiter"
@@ -118,8 +119,8 @@ func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to Bo
}
// Type implements Endpoint.Type.
-func (e *connectionlessEndpoint) Type() SockType {
- return SockDgram
+func (e *connectionlessEndpoint) Type() linux.SockType {
+ return linux.SOCK_DGRAM
}
// Connect attempts to connect directly to server.
@@ -194,3 +195,18 @@ func (e *connectionlessEndpoint) Readiness(mask waiter.EventMask) waiter.EventMa
return ready
}
+
+// State implements socket.Socket.State.
+func (e *connectionlessEndpoint) State() uint32 {
+ e.Lock()
+ defer e.Unlock()
+
+ switch {
+ case e.isBound():
+ return linux.SS_UNCONNECTED
+ case e.Connected():
+ return linux.SS_CONNECTING
+ default:
+ return linux.SS_DISCONNECTING
+ }
+}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index b734b4c20..5c55c529e 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -19,6 +19,7 @@ import (
"sync"
"sync/atomic"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
@@ -28,21 +29,6 @@ import (
// initialLimit is the starting limit for the socket buffers.
const initialLimit = 16 * 1024
-// A SockType is a type (as opposed to family) of sockets. These are enumerated
-// in the syscall package as syscall.SOCK_* constants.
-type SockType int
-
-const (
- // SockStream corresponds to syscall.SOCK_STREAM.
- SockStream SockType = 1
- // SockDgram corresponds to syscall.SOCK_DGRAM.
- SockDgram SockType = 2
- // SockRaw corresponds to syscall.SOCK_RAW.
- SockRaw SockType = 3
- // SockSeqpacket corresponds to syscall.SOCK_SEQPACKET.
- SockSeqpacket SockType = 5
-)
-
// A RightsControlMessage is a control message containing FDs.
type RightsControlMessage interface {
// Clone returns a copy of the RightsControlMessage.
@@ -175,7 +161,7 @@ type Endpoint interface {
// Type returns the socket type, typically either SockStream, SockDgram
// or SockSeqpacket.
- Type() SockType
+ Type() linux.SockType
// GetLocalAddress returns the address to which the endpoint is bound.
GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
@@ -191,6 +177,10 @@ type Endpoint interface {
// GetSockOpt gets a socket option. opt should be a pointer to one of the
// tcpip.*Option types.
GetSockOpt(opt interface{}) *tcpip.Error
+
+ // State returns the current state of the socket, as represented by Linux in
+ // procfs.
+ State() uint32
}
// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
@@ -237,6 +227,10 @@ type BoundEndpoint interface {
// endpoint.
UnidirectionalConnect() (ConnectedEndpoint, *syserr.Error)
+ // Passcred returns whether or not the SO_PASSCRED socket option is
+ // enabled on this end.
+ Passcred() bool
+
// Release releases any resources held by the BoundEndpoint. It must be
// called before dropping all references to a BoundEndpoint returned by a
// function.
@@ -621,7 +615,7 @@ type connectedEndpoint struct {
GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
// Type implements Endpoint.Type.
- Type() SockType
+ Type() linux.SockType
}
writeQueue *queue
@@ -645,7 +639,7 @@ func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages,
}
truncate := false
- if e.endpoint.Type() == SockStream {
+ if e.endpoint.Type() == linux.SOCK_STREAM {
// Since stream sockets don't preserve message boundaries, we
// can write only as much of the message as fits in the queue.
truncate = true
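The deleted transport.SockType constants (SockStream=1, SockDgram=2, SockRaw=3, SockSeqpacket=5) already carried the same numeric values as the corresponding linux.SockType constants, which is why the substitution needs no value translation. An illustrative mapping, assuming the usual Linux values:

import "gvisor.googlesource.com/gvisor/pkg/abi/linux"

// sockTypeMapping records the old transport constants and the linux.SockType
// values they become; it exists only to make the equivalence explicit.
var sockTypeMapping = map[int]linux.SockType{
	1: linux.SOCK_STREAM,    // transport.SockStream
	2: linux.SOCK_DGRAM,     // transport.SockDgram
	3: linux.SOCK_RAW,       // transport.SockRaw
	5: linux.SOCK_SEQPACKET, // transport.SockSeqpacket
}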
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 1414be0c6..b07e8d67b 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -17,6 +17,7 @@
package unix
import (
+ "fmt"
"strings"
"syscall"
@@ -55,22 +56,22 @@ type SocketOperations struct {
refs.AtomicRefCount
socket.SendReceiveTimeout
- ep transport.Endpoint
- isPacket bool
+ ep transport.Endpoint
+ stype linux.SockType
}
// New creates a new unix socket.
-func New(ctx context.Context, endpoint transport.Endpoint, isPacket bool) *fs.File {
+func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File {
dirent := socket.NewDirent(ctx, unixSocketDevice)
defer dirent.DecRef()
- return NewWithDirent(ctx, dirent, endpoint, isPacket, fs.FileFlags{Read: true, Write: true})
+ return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true})
}
// NewWithDirent creates a new unix socket using an existing dirent.
-func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, isPacket bool, flags fs.FileFlags) *fs.File {
+func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File {
return fs.NewFile(ctx, d, flags, &SocketOperations{
- ep: ep,
- isPacket: isPacket,
+ ep: ep,
+ stype: stype,
})
}
@@ -88,6 +89,18 @@ func (s *SocketOperations) Release() {
s.DecRef()
}
+func (s *SocketOperations) isPacket() bool {
+ switch s.stype {
+ case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
+ return true
+ case linux.SOCK_STREAM:
+ return false
+ default:
+ // We shouldn't have allowed any other socket types during creation.
+ panic(fmt.Sprintf("Invalid socket type %d", s.stype))
+ }
+}
+
// Endpoint extracts the transport.Endpoint.
func (s *SocketOperations) Endpoint() transport.Endpoint {
return s.ep
@@ -193,7 +206,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
}
- ns := New(t, ep, s.isPacket)
+ ns := New(t, ep, s.stype)
defer ns.DecRef()
if flags&linux.SOCK_NONBLOCK != 0 {
@@ -221,7 +234,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
return 0, nil, 0, syserr.FromError(e)
}
- t.Kernel().RecordSocket(ns, linux.AF_UNIX)
+ t.Kernel().RecordSocket(ns)
return fd, addr, addrLen, nil
}
@@ -385,6 +398,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
}
defer ep.Release()
w.To = ep
+
+ if ep.Passcred() && w.Control.Credentials == nil {
+ w.Control.Credentials = control.MakeCreds(t)
+ }
}
n, err := src.CopyInTo(t, &w)
@@ -483,6 +500,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
peek := flags&linux.MSG_PEEK != 0
dontWait := flags&linux.MSG_DONTWAIT != 0
waitAll := flags&linux.MSG_WAITALL != 0
+ isPacket := s.isPacket()
// Calculate the number of FDs for which we have space and if we are
// requesting credentials.
@@ -516,7 +534,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
var from interface{}
var fromLen uint32
- if r.From != nil {
+ if r.From != nil && len([]byte(r.From.Addr)) != 0 {
from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From)
}
@@ -524,8 +542,8 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
msgFlags |= linux.MSG_CTRUNC
}
- if err != nil || dontWait || !waitAll || s.isPacket || n >= dst.NumBytes() {
- if s.isPacket && n < int64(r.MsgSize) {
+ if err != nil || dontWait || !waitAll || isPacket || n >= dst.NumBytes() {
+ if isPacket && n < int64(r.MsgSize) {
msgFlags |= linux.MSG_TRUNC
}
@@ -566,11 +584,11 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
total += n
}
- if err != nil || !waitAll || s.isPacket || n >= dst.NumBytes() {
+ if err != nil || !waitAll || isPacket || n >= dst.NumBytes() {
if total > 0 {
err = nil
}
- if s.isPacket && n < int64(r.MsgSize) {
+ if isPacket && n < int64(r.MsgSize) {
msgFlags |= linux.MSG_TRUNC
}
return int(total), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err)
@@ -592,11 +610,22 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
}
}
+// State implements socket.Socket.State.
+func (s *SocketOperations) State() uint32 {
+ return s.ep.State()
+}
+
+// Type implements socket.Socket.Type.
+func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
+ // Unix domain sockets always have a protocol of 0.
+ return linux.AF_UNIX, s.stype, 0
+}
+
// provider is a unix domain socket provider.
type provider struct{}
// Socket returns a new unix domain socket.
-func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Check arguments.
if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
return nil, syserr.ErrProtocolNotSupported
@@ -604,43 +633,36 @@ func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int)
// Create the endpoint and socket.
var ep transport.Endpoint
- var isPacket bool
switch stype {
case linux.SOCK_DGRAM:
- isPacket = true
ep = transport.NewConnectionless()
- case linux.SOCK_SEQPACKET:
- isPacket = true
- fallthrough
- case linux.SOCK_STREAM:
+ case linux.SOCK_SEQPACKET, linux.SOCK_STREAM:
ep = transport.NewConnectioned(stype, t.Kernel())
default:
return nil, syserr.ErrInvalidArgument
}
- return New(t, ep, isPacket), nil
+ return New(t, ep, stype), nil
}
// Pair creates a new pair of AF_UNIX connected sockets.
-func (*provider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
+func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
// Check arguments.
if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
return nil, nil, syserr.ErrProtocolNotSupported
}
- var isPacket bool
switch stype {
- case linux.SOCK_STREAM:
- case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
- isPacket = true
+ case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
+ // Ok
default:
return nil, nil, syserr.ErrInvalidArgument
}
// Create the endpoints and sockets.
ep1, ep2 := transport.NewPair(stype, t.Kernel())
- s1 := New(t, ep1, isPacket)
- s2 := New(t, ep2, isPacket)
+ s1 := New(t, ep1, stype)
+ s2 := New(t, ep2, stype)
return s1, s2, nil
}
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index dbe53b9a2..0b5ef84c4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -76,13 +76,13 @@ var SocketFamily = abi.ValueSet{
// SocketType are the possible socket(2) types.
var SocketType = abi.ValueSet{
- linux.SOCK_STREAM: "SOCK_STREAM",
- linux.SOCK_DGRAM: "SOCK_DGRAM",
- linux.SOCK_RAW: "SOCK_RAW",
- linux.SOCK_RDM: "SOCK_RDM",
- linux.SOCK_SEQPACKET: "SOCK_SEQPACKET",
- linux.SOCK_DCCP: "SOCK_DCCP",
- linux.SOCK_PACKET: "SOCK_PACKET",
+ uint64(linux.SOCK_STREAM): "SOCK_STREAM",
+ uint64(linux.SOCK_DGRAM): "SOCK_DGRAM",
+ uint64(linux.SOCK_RAW): "SOCK_RAW",
+ uint64(linux.SOCK_RDM): "SOCK_RDM",
+ uint64(linux.SOCK_SEQPACKET): "SOCK_SEQPACKET",
+ uint64(linux.SOCK_DCCP): "SOCK_DCCP",
+ uint64(linux.SOCK_PACKET): "SOCK_PACKET",
}
// SocketFlagSet are the possible socket(2) flags.
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index f76989ae2..1c057526b 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -19,6 +19,7 @@ go_library(
"sys_identity.go",
"sys_inotify.go",
"sys_lseek.go",
+ "sys_mempolicy.go",
"sys_mmap.go",
"sys_mount.go",
"sys_pipe.go",
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 1ba3695fb..72146ea63 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -92,6 +92,10 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
// also be sent to the application.
return nil
+ case syserror.ECONNRESET:
+ // For TCP sendfile connections we may get a reset; in that case,
+ // just return n as the result.
+ return nil
case syserror.ErrWouldBlock:
// Syscall would block, but completed a partial read/write.
// This case should only be returned by IssueIO for nonblocking
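The new ECONNRESET case follows the same rule as the EPIPE case above it: when part of the I/O already completed, the syscall layer reports the partial byte count and swallows the error. The general pattern as a standalone sketch (names hypothetical, not the handleIOError signature):

package main

import (
	"fmt"
	"syscall"
)

// squashPartialError reports the partial count instead of the error when some
// bytes were transferred before the connection broke.
func squashPartialError(n int64, err error) (int64, error) {
	if n > 0 && (err == syscall.EPIPE || err == syscall.ECONNRESET) {
		return n, nil
	}
	return n, err
}

func main() {
	fmt.Println(squashPartialError(4096, syscall.ECONNRESET)) // 4096 <nil>
	fmt.Println(squashPartialError(0, syscall.ECONNRESET))    // 0 connection reset by peer
}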
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 3e4d312af..5251c2463 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -34,33 +34,6 @@ const _AUDIT_ARCH_X86_64 = 0xc000003e
// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
// numbers from Linux 4.4. The entries commented out are those syscalls we
// don't currently support.
-//
-// Syscall support is documented as annotations in Go comments of the form:
-// @Syscall(<name>, <key:value>, ...)
-//
-// Supported args and values are:
-//
-// - arg: A syscall option. This entry only applies to the syscall when given
-// this option.
-// - support: Indicates support level
-// - UNIMPLEMENTED: Unimplemented (default, implies returns:ENOSYS)
-// - PARTIAL: Partial support. Details should be provided in note.
-// - FULL: Full support
-// - returns: Indicates a known return value. Values are syscall errors. This
-// is treated as a string so you can use something like
-// "returns:EPERM or ENOSYS".
-// - issue: A Github issue number.
-// - note: A note
-//
-// Example:
-// // @Syscall(mmap, arg:MAP_PRIVATE, support:FULL, note:Private memory fully supported)
-// // @Syscall(mmap, arg:MAP_SHARED, issue:123, note:Shared memory not supported)
-// // @Syscall(setxattr, returns:ENOTSUP, note:Requires file system support)
-//
-// Annotations should be placed as close to their implementation as possible
-// (preferrably as part of a supporting function's Godoc) and should be
-// updated as syscall support changes. Unimplemented syscalls are documented
-// here due to their lack of a supporting function or method.
var AMD64 = &kernel.SyscallTable{
OS: abi.Linux,
Arch: arch.AMD64,
@@ -74,405 +47,338 @@ var AMD64 = &kernel.SyscallTable{
Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016",
},
AuditNumber: _AUDIT_ARCH_X86_64,
- Table: map[uintptr]kernel.SyscallFn{
- 0: Read,
- 1: Write,
- 2: Open,
- 3: Close,
- 4: Stat,
- 5: Fstat,
- 6: Lstat,
- 7: Poll,
- 8: Lseek,
- 9: Mmap,
- 10: Mprotect,
- 11: Munmap,
- 12: Brk,
- 13: RtSigaction,
- 14: RtSigprocmask,
- 15: RtSigreturn,
- 16: Ioctl,
- 17: Pread64,
- 18: Pwrite64,
- 19: Readv,
- 20: Writev,
- 21: Access,
- 22: Pipe,
- 23: Select,
- 24: SchedYield,
- 25: Mremap,
- 26: Msync,
- 27: Mincore,
- 28: Madvise,
- 29: Shmget,
- 30: Shmat,
- 31: Shmctl,
- 32: Dup,
- 33: Dup2,
- 34: Pause,
- 35: Nanosleep,
- 36: Getitimer,
- 37: Alarm,
- 38: Setitimer,
- 39: Getpid,
- 40: Sendfile,
- 41: Socket,
- 42: Connect,
- 43: Accept,
- 44: SendTo,
- 45: RecvFrom,
- 46: SendMsg,
- 47: RecvMsg,
- 48: Shutdown,
- 49: Bind,
- 50: Listen,
- 51: GetSockName,
- 52: GetPeerName,
- 53: SocketPair,
- 54: SetSockOpt,
- 55: GetSockOpt,
- 56: Clone,
- 57: Fork,
- 58: Vfork,
- 59: Execve,
- 60: Exit,
- 61: Wait4,
- 62: Kill,
- 63: Uname,
- 64: Semget,
- 65: Semop,
- 66: Semctl,
- 67: Shmdt,
- // 68: @Syscall(Msgget), TODO(b/29354921)
- // 69: @Syscall(Msgsnd), TODO(b/29354921)
- // 70: @Syscall(Msgrcv), TODO(b/29354921)
- // 71: @Syscall(Msgctl), TODO(b/29354921)
- 72: Fcntl,
- 73: Flock,
- 74: Fsync,
- 75: Fdatasync,
- 76: Truncate,
- 77: Ftruncate,
- 78: Getdents,
- 79: Getcwd,
- 80: Chdir,
- 81: Fchdir,
- 82: Rename,
- 83: Mkdir,
- 84: Rmdir,
- 85: Creat,
- 86: Link,
- 87: Unlink,
- 88: Symlink,
- 89: Readlink,
- 90: Chmod,
- 91: Fchmod,
- 92: Chown,
- 93: Fchown,
- 94: Lchown,
- 95: Umask,
- 96: Gettimeofday,
- 97: Getrlimit,
- 98: Getrusage,
- 99: Sysinfo,
- 100: Times,
- 101: Ptrace,
- 102: Getuid,
- 103: Syslog,
- 104: Getgid,
- 105: Setuid,
- 106: Setgid,
- 107: Geteuid,
- 108: Getegid,
- 109: Setpgid,
- 110: Getppid,
- 111: Getpgrp,
- 112: Setsid,
- 113: Setreuid,
- 114: Setregid,
- 115: Getgroups,
- 116: Setgroups,
- 117: Setresuid,
- 118: Getresuid,
- 119: Setresgid,
- 120: Getresgid,
- 121: Getpgid,
- // 122: @Syscall(Setfsuid), TODO(b/112851702)
- // 123: @Syscall(Setfsgid), TODO(b/112851702)
- 124: Getsid,
- 125: Capget,
- 126: Capset,
- 127: RtSigpending,
- 128: RtSigtimedwait,
- 129: RtSigqueueinfo,
- 130: RtSigsuspend,
- 131: Sigaltstack,
- 132: Utime,
- 133: Mknod,
- // @Syscall(Uselib, note:Obsolete)
- 134: syscalls.Error(syscall.ENOSYS),
- // @Syscall(SetPersonality, returns:EINVAL, note:Unable to change personality)
- 135: syscalls.ErrorWithEvent(syscall.EINVAL),
- // @Syscall(Ustat, note:Needs filesystem support)
- 136: syscalls.ErrorWithEvent(syscall.ENOSYS),
- 137: Statfs,
- 138: Fstatfs,
- // 139: @Syscall(Sysfs), TODO(gvisor.dev/issue/165)
- 140: Getpriority,
- 141: Setpriority,
- // @Syscall(SchedSetparam, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise)
- 142: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice
- 143: SchedGetparam,
- 144: SchedSetscheduler,
- 145: SchedGetscheduler,
- 146: SchedGetPriorityMax,
- 147: SchedGetPriorityMin,
- // @Syscall(SchedRrGetInterval, returns:EPERM)
- 148: syscalls.ErrorWithEvent(syscall.EPERM),
- 149: Mlock,
- 150: Munlock,
- 151: Mlockall,
- 152: Munlockall,
- // @Syscall(Vhangup, returns:EPERM)
- 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG),
- // @Syscall(ModifyLdt, returns:EPERM)
- 154: syscalls.Error(syscall.EPERM),
- // @Syscall(PivotRoot, returns:EPERM)
- 155: syscalls.Error(syscall.EPERM),
- // @Syscall(Sysctl, returns:EPERM)
- 156: syscalls.Error(syscall.EPERM), // syscall is "worthless"
- 157: Prctl,
- 158: ArchPrctl,
- // @Syscall(Adjtimex, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise)
- 159: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time
- 160: Setrlimit,
- 161: Chroot,
- 162: Sync,
- // @Syscall(Acct, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_pacct; ENOSYS otherwise)
- 163: syscalls.CapError(linux.CAP_SYS_PACCT), // requires cap_sys_pacct
- // @Syscall(Settimeofday, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise)
- 164: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time
- 165: Mount,
- 166: Umount2,
- // @Syscall(Swapon, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 167: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin
- // @Syscall(Swapoff, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 168: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin
- // @Syscall(Reboot, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise)
- 169: syscalls.CapError(linux.CAP_SYS_BOOT), // requires cap_sys_boot
- 170: Sethostname,
- 171: Setdomainname,
- // @Syscall(Iopl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise)
- 172: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio
- // @Syscall(Ioperm, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise)
- 173: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio
- // @Syscall(CreateModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 174: syscalls.CapError(linux.CAP_SYS_MODULE), // CreateModule, requires cap_sys_module
- // @Syscall(InitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 175: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module
- // @Syscall(DeleteModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 176: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module
- // @Syscall(GetKernelSyms, note:Not supported in > 2.6)
- 177: syscalls.Error(syscall.ENOSYS),
- // @Syscall(QueryModule, note:Not supported in > 2.6)
- 178: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Quotactl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 179: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin (most operations)
- // @Syscall(Nfsservctl, note:Does not exist > 3.1)
- 180: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Getpmsg, note:Not implemented in Linux)
- 181: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Putpmsg, note:Not implemented in Linux)
- 182: syscalls.Error(syscall.ENOSYS),
- // @Syscall(AfsSyscall, note:Not implemented in Linux)
- 183: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Tuxcall, note:Not implemented in Linux)
- 184: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Security, note:Not implemented in Linux)
- 185: syscalls.Error(syscall.ENOSYS),
- 186: Gettid,
- 187: nil, // @Syscall(Readahead), TODO(b/29351341)
- // @Syscall(Setxattr, returns:ENOTSUP, note:Requires filesystem support)
- 188: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Lsetxattr, returns:ENOTSUP, note:Requires filesystem support)
- 189: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Fsetxattr, returns:ENOTSUP, note:Requires filesystem support)
- 190: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Getxattr, returns:ENOTSUP, note:Requires filesystem support)
- 191: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Lgetxattr, returns:ENOTSUP, note:Requires filesystem support)
- 192: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Fgetxattr, returns:ENOTSUP, note:Requires filesystem support)
- 193: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Listxattr, returns:ENOTSUP, note:Requires filesystem support)
- 194: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Llistxattr, returns:ENOTSUP, note:Requires filesystem support)
- 195: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Flistxattr, returns:ENOTSUP, note:Requires filesystem support)
- 196: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Removexattr, returns:ENOTSUP, note:Requires filesystem support)
- 197: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Lremovexattr, returns:ENOTSUP, note:Requires filesystem support)
- 198: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Fremovexattr, returns:ENOTSUP, note:Requires filesystem support)
- 199: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- 200: Tkill,
- 201: Time,
- 202: Futex,
- 203: SchedSetaffinity,
- 204: SchedGetaffinity,
- // @Syscall(SetThreadArea, note:Expected to return ENOSYS on 64-bit)
- 205: syscalls.Error(syscall.ENOSYS),
- 206: IoSetup,
- 207: IoDestroy,
- 208: IoGetevents,
- 209: IoSubmit,
- 210: IoCancel,
- // @Syscall(GetThreadArea, note:Expected to return ENOSYS on 64-bit)
- 211: syscalls.Error(syscall.ENOSYS),
- // @Syscall(LookupDcookie, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 212: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin
- 213: EpollCreate,
- // @Syscall(EpollCtlOld, note:Deprecated)
- 214: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused)
- // @Syscall(EpollWaitOld, note:Deprecated)
- 215: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused)
- // @Syscall(RemapFilePages, note:Deprecated)
- 216: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated since 3.16
- 217: Getdents64,
- 218: SetTidAddress,
- 219: RestartSyscall,
- // 220: @Syscall(Semtimedop), TODO(b/29354920)
- 221: Fadvise64,
- 222: TimerCreate,
- 223: TimerSettime,
- 224: TimerGettime,
- 225: TimerGetoverrun,
- 226: TimerDelete,
- 227: ClockSettime,
- 228: ClockGettime,
- 229: ClockGetres,
- 230: ClockNanosleep,
- 231: ExitGroup,
- 232: EpollWait,
- 233: EpollCtl,
- 234: Tgkill,
- 235: Utimes,
- // @Syscall(Vserver, note:Not implemented by Linux)
- 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux
- // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO(b/117792295)
- 237: syscalls.CapError(linux.CAP_SYS_NICE), // may require cap_sys_nice
- 238: SetMempolicy,
- 239: GetMempolicy,
- // 240: @Syscall(MqOpen), TODO(b/29354921)
- // 241: @Syscall(MqUnlink), TODO(b/29354921)
- // 242: @Syscall(MqTimedsend), TODO(b/29354921)
- // 243: @Syscall(MqTimedreceive), TODO(b/29354921)
- // 244: @Syscall(MqNotify), TODO(b/29354921)
- // 245: @Syscall(MqGetsetattr), TODO(b/29354921)
- 246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot
- 247: Waitid,
- // @Syscall(AddKey, returns:EACCES, note:Not available to user)
- 248: syscalls.Error(syscall.EACCES),
- // @Syscall(RequestKey, returns:EACCES, note:Not available to user)
- 249: syscalls.Error(syscall.EACCES),
- // @Syscall(Keyctl, returns:EACCES, note:Not available to user)
- 250: syscalls.Error(syscall.EACCES),
- // @Syscall(IoprioSet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 251: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending)
- // @Syscall(IoprioGet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 252: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending)
- 253: InotifyInit,
- 254: InotifyAddWatch,
- 255: InotifyRmWatch,
- // @Syscall(MigratePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise)
- 256: syscalls.CapError(linux.CAP_SYS_NICE),
- 257: Openat,
- 258: Mkdirat,
- 259: Mknodat,
- 260: Fchownat,
- 261: Futimesat,
- 262: Fstatat,
- 263: Unlinkat,
- 264: Renameat,
- 265: Linkat,
- 266: Symlinkat,
- 267: Readlinkat,
- 268: Fchmodat,
- 269: Faccessat,
- 270: Pselect,
- 271: Ppoll,
- 272: Unshare,
- // @Syscall(SetRobustList, note:Obsolete)
- 273: syscalls.Error(syscall.ENOSYS),
- // @Syscall(GetRobustList, note:Obsolete)
- 274: syscalls.Error(syscall.ENOSYS),
- 275: Splice,
- // 276: @Syscall(Tee), TODO(b/29354098)
- 277: SyncFileRange,
- // 278: @Syscall(Vmsplice), TODO(b/29354098)
- // @Syscall(MovePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise)
- 279: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice (mostly)
- 280: Utimensat,
- 281: EpollPwait,
- // 282: @Syscall(Signalfd), TODO(b/19846426)
- 283: TimerfdCreate,
- 284: Eventfd,
- 285: Fallocate,
- 286: TimerfdSettime,
- 287: TimerfdGettime,
- 288: Accept4,
- // 289: @Syscall(Signalfd4), TODO(b/19846426)
- 290: Eventfd2,
- 291: EpollCreate1,
- 292: Dup3,
- 293: Pipe2,
- 294: InotifyInit1,
- 295: Preadv,
- 296: Pwritev,
- 297: RtTgsigqueueinfo,
- // @Syscall(PerfEventOpen, returns:ENODEV, note:No support for perf counters)
- 298: syscalls.ErrorWithEvent(syscall.ENODEV),
- 299: RecvMMsg,
- // @Syscall(FanotifyInit, note:Needs CONFIG_FANOTIFY)
- 300: syscalls.ErrorWithEvent(syscall.ENOSYS),
- // @Syscall(FanotifyMark, note:Needs CONFIG_FANOTIFY)
- 301: syscalls.ErrorWithEvent(syscall.ENOSYS),
- 302: Prlimit64,
- // @Syscall(NameToHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support)
- 303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP),
- // @Syscall(OpenByHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support)
- 304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP),
- // @Syscall(ClockAdjtime, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 305: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time
- 306: Syncfs,
- 307: SendMMsg,
- // 308: @Syscall(Setns), TODO(b/29354995)
- 309: Getcpu,
- // 310: @Syscall(ProcessVmReadv), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace
- // 311: @Syscall(ProcessVmWritev), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace
- // @Syscall(Kcmp, returns:EPERM or ENOSYS, note:Requires cap_sys_ptrace)
- 312: syscalls.CapError(linux.CAP_SYS_PTRACE),
- // @Syscall(FinitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 313: syscalls.CapError(linux.CAP_SYS_MODULE),
- // 314: @Syscall(SchedSetattr), TODO(b/118902272), we have no scheduler
- // 315: @Syscall(SchedGetattr), TODO(b/118902272), we have no scheduler
- // 316: @Syscall(Renameat2), TODO(b/118902772)
- 317: Seccomp,
- 318: GetRandom,
- 319: MemfdCreate,
- // @Syscall(KexecFileLoad, EPERM or ENOSYS, note:Infeasible to support. Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise)
- 320: syscalls.CapError(linux.CAP_SYS_BOOT),
- // @Syscall(Bpf, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise)
- 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin for all commands
- // 322: @Syscall(Execveat), TODO(b/118901836)
- // 323: @Syscall(Userfaultfd), TODO(b/118906345)
- // 324: @Syscall(Membarrier), TODO(b/118904897)
- 325: Mlock2,
+ Table: map[uintptr]kernel.Syscall{
+ 0: syscalls.Supported("read", Read),
+ 1: syscalls.Supported("write", Write),
+ 2: syscalls.Supported("open", Open),
+ 3: syscalls.Supported("close", Close),
+ 4: syscalls.Undocumented("stat", Stat),
+ 5: syscalls.Undocumented("fstat", Fstat),
+ 6: syscalls.Undocumented("lstat", Lstat),
+ 7: syscalls.Undocumented("poll", Poll),
+ 8: syscalls.Undocumented("lseek", Lseek),
+ 9: syscalls.Undocumented("mmap", Mmap),
+ 10: syscalls.Undocumented("mprotect", Mprotect),
+ 11: syscalls.Undocumented("munmap", Munmap),
+ 12: syscalls.Undocumented("brk", Brk),
+ 13: syscalls.Undocumented("rt_sigaction", RtSigaction),
+ 14: syscalls.Undocumented("rt_sigprocmask", RtSigprocmask),
+ 15: syscalls.Undocumented("rt_sigreturn", RtSigreturn),
+ 16: syscalls.Undocumented("ioctl", Ioctl),
+ 17: syscalls.Undocumented("pread64", Pread64),
+ 18: syscalls.Undocumented("pwrite64", Pwrite64),
+ 19: syscalls.Undocumented("readv", Readv),
+ 20: syscalls.Undocumented("writev", Writev),
+ 21: syscalls.Undocumented("access", Access),
+ 22: syscalls.Undocumented("pipe", Pipe),
+ 23: syscalls.Undocumented("select", Select),
+ 24: syscalls.Undocumented("sched_yield", SchedYield),
+ 25: syscalls.Undocumented("mremap", Mremap),
+ 26: syscalls.Undocumented("msync", Msync),
+ 27: syscalls.Undocumented("mincore", Mincore),
+ 28: syscalls.Undocumented("madvise", Madvise),
+ 29: syscalls.Undocumented("shmget", Shmget),
+ 30: syscalls.Undocumented("shmat", Shmat),
+ 31: syscalls.Undocumented("shmctl", Shmctl),
+ 32: syscalls.Undocumented("dup", Dup),
+ 33: syscalls.Undocumented("dup2", Dup2),
+ 34: syscalls.Undocumented("pause", Pause),
+ 35: syscalls.Undocumented("nanosleep", Nanosleep),
+ 36: syscalls.Undocumented("getitimer", Getitimer),
+ 37: syscalls.Undocumented("alarm", Alarm),
+ 38: syscalls.Undocumented("setitimer", Setitimer),
+ 39: syscalls.Undocumented("getpid", Getpid),
+ 40: syscalls.Undocumented("sendfile", Sendfile),
+ 41: syscalls.Undocumented("socket", Socket),
+ 42: syscalls.Undocumented("connect", Connect),
+ 43: syscalls.Undocumented("accept", Accept),
+ 44: syscalls.Undocumented("sendto", SendTo),
+ 45: syscalls.Undocumented("recvfrom", RecvFrom),
+ 46: syscalls.Undocumented("sendmsg", SendMsg),
+ 47: syscalls.Undocumented("recvmsg", RecvMsg),
+ 48: syscalls.Undocumented("shutdown", Shutdown),
+ 49: syscalls.Undocumented("bind", Bind),
+ 50: syscalls.Undocumented("listen", Listen),
+ 51: syscalls.Undocumented("getsockname", GetSockName),
+ 52: syscalls.Undocumented("getpeername", GetPeerName),
+ 53: syscalls.Undocumented("socketpair", SocketPair),
+ 54: syscalls.Undocumented("setsockopt", SetSockOpt),
+ 55: syscalls.Undocumented("getsockopt", GetSockOpt),
+ 56: syscalls.Undocumented("clone", Clone),
+ 57: syscalls.Undocumented("fork", Fork),
+ 58: syscalls.Undocumented("vfork", Vfork),
+ 59: syscalls.Undocumented("execve", Execve),
+ 60: syscalls.Undocumented("exit", Exit),
+ 61: syscalls.Undocumented("wait4", Wait4),
+ 62: syscalls.Undocumented("kill", Kill),
+ 63: syscalls.Undocumented("uname", Uname),
+ 64: syscalls.Undocumented("semget", Semget),
+ 65: syscalls.Undocumented("semop", Semop),
+ 66: syscalls.Undocumented("semctl", Semctl),
+ 67: syscalls.Undocumented("shmdt", Shmdt),
+ 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 72: syscalls.Undocumented("fcntl", Fcntl),
+ 73: syscalls.Undocumented("flock", Flock),
+ 74: syscalls.Undocumented("fsync", Fsync),
+ 75: syscalls.Undocumented("fdatasync", Fdatasync),
+ 76: syscalls.Undocumented("truncate", Truncate),
+ 77: syscalls.Undocumented("ftruncate", Ftruncate),
+ 78: syscalls.Undocumented("getdents", Getdents),
+ 79: syscalls.Undocumented("getcwd", Getcwd),
+ 80: syscalls.Undocumented("chdir", Chdir),
+ 81: syscalls.Undocumented("fchdir", Fchdir),
+ 82: syscalls.Undocumented("rename", Rename),
+ 83: syscalls.Undocumented("mkdir", Mkdir),
+ 84: syscalls.Undocumented("rmdir", Rmdir),
+ 85: syscalls.Undocumented("creat", Creat),
+ 86: syscalls.Undocumented("link", Link),
+ 87: syscalls.Undocumented("unlink", Unlink),
+ 88: syscalls.Undocumented("symlink", Symlink),
+ 89: syscalls.Undocumented("readlink", Readlink),
+ 90: syscalls.Undocumented("chmod", Chmod),
+ 91: syscalls.Undocumented("fchmod", Fchmod),
+ 92: syscalls.Undocumented("chown", Chown),
+ 93: syscalls.Undocumented("fchown", Fchown),
+ 94: syscalls.Undocumented("lchown", Lchown),
+ 95: syscalls.Undocumented("umask", Umask),
+ 96: syscalls.Undocumented("gettimeofday", Gettimeofday),
+ 97: syscalls.Undocumented("getrlimit", Getrlimit),
+ 98: syscalls.Undocumented("getrusage", Getrusage),
+ 99: syscalls.Undocumented("sysinfo", Sysinfo),
+ 100: syscalls.Undocumented("times", Times),
+ 101: syscalls.Undocumented("ptrace", Ptrace),
+ 102: syscalls.Undocumented("getuid", Getuid),
+ 103: syscalls.Undocumented("syslog", Syslog),
+ 104: syscalls.Undocumented("getgid", Getgid),
+ 105: syscalls.Undocumented("setuid", Setuid),
+ 106: syscalls.Undocumented("setgid", Setgid),
+ 107: syscalls.Undocumented("geteuid", Geteuid),
+ 108: syscalls.Undocumented("getegid", Getegid),
+ 109: syscalls.Undocumented("setpgid", Setpgid),
+ 110: syscalls.Undocumented("getppid", Getppid),
+ 111: syscalls.Undocumented("getpgrp", Getpgrp),
+ 112: syscalls.Undocumented("setsid", Setsid),
+ 113: syscalls.Undocumented("setreuid", Setreuid),
+ 114: syscalls.Undocumented("setregid", Setregid),
+ 115: syscalls.Undocumented("getgroups", Getgroups),
+ 116: syscalls.Undocumented("setgroups", Setgroups),
+ 117: syscalls.Undocumented("setresuid", Setresuid),
+ 118: syscalls.Undocumented("getresuid", Getresuid),
+ 119: syscalls.Undocumented("setresgid", Setresgid),
+ 120: syscalls.Undocumented("getresgid", Getresgid),
+ 121: syscalls.Undocumented("getpgid", Getpgid),
+ 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 124: syscalls.Undocumented("getsid", Getsid),
+ 125: syscalls.Undocumented("capget", Capget),
+ 126: syscalls.Undocumented("capset", Capset),
+ 127: syscalls.Undocumented("rt_sigpending", RtSigpending),
+ 128: syscalls.Undocumented("rt_sigtimedwait", RtSigtimedwait),
+ 129: syscalls.Undocumented("rt_sigqueueinfo", RtSigqueueinfo),
+ 130: syscalls.Undocumented("rt_sigsuspend", RtSigsuspend),
+ 131: syscalls.Undocumented("sigaltstack", Sigaltstack),
+ 132: syscalls.Undocumented("utime", Utime),
+ 133: syscalls.Undocumented("mknod", Mknod),
+ 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil),
+ 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil),
+ 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil),
+ 137: syscalls.Undocumented("statfs", Statfs),
+ 138: syscalls.Undocumented("fstatfs", Fstatfs),
+ 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
+ 140: syscalls.Undocumented("getpriority", Getpriority),
+ 141: syscalls.Undocumented("setpriority", Setpriority),
+ 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
+ 143: syscalls.Undocumented("sched_getparam", SchedGetparam),
+ 144: syscalls.Undocumented("sched_setscheduler", SchedSetscheduler),
+ 145: syscalls.Undocumented("sched_getscheduler", SchedGetscheduler),
+ 146: syscalls.Undocumented("sched_get_priority_max", SchedGetPriorityMax),
+ 147: syscalls.Undocumented("sched_get_priority_min", SchedGetPriorityMin),
+ 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil),
+ 149: syscalls.Undocumented("mlock", Mlock),
+ 150: syscalls.Undocumented("munlock", Munlock),
+ 151: syscalls.Undocumented("mlockall", Mlockall),
+ 152: syscalls.Undocumented("munlockall", Munlockall),
+ 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
+ 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil),
+ 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil),
+ 156: syscalls.Error("sysctl", syscall.EPERM, `syscall is "worthless"`, nil),
+ 157: syscalls.Undocumented("prctl", Prctl),
+ 158: syscalls.Undocumented("arch_prctl", ArchPrctl),
+ 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
+ 160: syscalls.Undocumented("setrlimit", Setrlimit),
+ 161: syscalls.Undocumented("chroot", Chroot),
+ 162: syscalls.Undocumented("sync", Sync),
+ 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
+ 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
+ 165: syscalls.Undocumented("mount", Mount),
+ 166: syscalls.Undocumented("umount2", Umount2),
+ 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
+ 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
+ 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
+ 170: syscalls.Undocumented("sethostname", Sethostname),
+ 171: syscalls.Undocumented("setdomainname", Setdomainname),
+ 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil),
+ 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil),
+ 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil),
+ 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
+ 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
+ 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in > 2.6", nil),
+ 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in > 2.6", nil),
+ 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
+ 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Does not exist > 3.1", nil),
+ 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 186: syscalls.Undocumented("gettid", Gettid),
+ 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341)
+ 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 200: syscalls.Undocumented("tkill", Tkill),
+ 201: syscalls.Undocumented("time", Time),
+ 202: syscalls.Undocumented("futex", Futex),
+ 203: syscalls.Undocumented("sched_setaffinity", SchedSetaffinity),
+ 204: syscalls.Undocumented("sched_getaffinity", SchedGetaffinity),
+ 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+ 206: syscalls.Undocumented("io_setup", IoSetup),
+ 207: syscalls.Undocumented("io_destroy", IoDestroy),
+ 208: syscalls.Undocumented("io_getevents", IoGetevents),
+ 209: syscalls.Undocumented("io_submit", IoSubmit),
+ 210: syscalls.Undocumented("io_cancel", IoCancel),
+ 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+ 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
+ 213: syscalls.Undocumented("epoll_create", EpollCreate),
+ 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated", nil),
+ 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated", nil),
+ 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since 3.16", nil),
+ 217: syscalls.Undocumented("getdents64", Getdents64),
+ 218: syscalls.Undocumented("set_tid_address", SetTidAddress),
+ 219: syscalls.Undocumented("restart_syscall", RestartSyscall),
+ 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+ 221: syscalls.Undocumented("fadvise64", Fadvise64),
+ 222: syscalls.Undocumented("timer_create", TimerCreate),
+ 223: syscalls.Undocumented("timer_settime", TimerSettime),
+ 224: syscalls.Undocumented("timer_gettime", TimerGettime),
+ 225: syscalls.Undocumented("timer_getoverrun", TimerGetoverrun),
+ 226: syscalls.Undocumented("timer_delete", TimerDelete),
+ 227: syscalls.Undocumented("clock_settime", ClockSettime),
+ 228: syscalls.Undocumented("clock_gettime", ClockGettime),
+ 229: syscalls.Undocumented("clock_getres", ClockGetres),
+ 230: syscalls.Undocumented("clock_nanosleep", ClockNanosleep),
+ 231: syscalls.Undocumented("exit_group", ExitGroup),
+ 232: syscalls.Undocumented("epoll_wait", EpollWait),
+ 233: syscalls.Undocumented("epoll_ctl", EpollCtl),
+ 234: syscalls.Undocumented("tgkill", Tgkill),
+ 235: syscalls.Undocumented("utimes", Utimes),
+ 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil),
+ 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
+ 238: syscalls.Undocumented("set_mempolicy", SetMempolicy),
+ 239: syscalls.Undocumented("get_mempolicy", GetMempolicy),
+ 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 243: syscalls.ErrorWithEvent("mq_timedreceive", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
+ 247: syscalls.Undocumented("waitid", Waitid),
+ 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user", nil),
+ 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user", nil),
+ 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user", nil),
+ 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+ 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+ 253: syscalls.Undocumented("inotify_init", InotifyInit),
+ 254: syscalls.Undocumented("inotify_add_watch", InotifyAddWatch),
+ 255: syscalls.Undocumented("inotify_rm_watch", InotifyRmWatch),
+ 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
+ 257: syscalls.Undocumented("openat", Openat),
+ 258: syscalls.Undocumented("mkdirat", Mkdirat),
+ 259: syscalls.Undocumented("mknodat", Mknodat),
+ 260: syscalls.Undocumented("fchownat", Fchownat),
+ 261: syscalls.Undocumented("futimesat", Futimesat),
+ 262: syscalls.Undocumented("fstatat", Fstatat),
+ 263: syscalls.Undocumented("unlinkat", Unlinkat),
+ 264: syscalls.Undocumented("renameat", Renameat),
+ 265: syscalls.Undocumented("linkat", Linkat),
+ 266: syscalls.Undocumented("symlinkat", Symlinkat),
+ 267: syscalls.Undocumented("readlinkat", Readlinkat),
+ 268: syscalls.Undocumented("fchmodat", Fchmodat),
+ 269: syscalls.Undocumented("faccessat", Faccessat),
+ 270: syscalls.Undocumented("pselect", Pselect),
+ 271: syscalls.Undocumented("ppoll", Ppoll),
+ 272: syscalls.Undocumented("unshare", Unshare),
+ 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete", nil),
+ 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete", nil),
+ 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 277: syscalls.Undocumented("sync_file_range", SyncFileRange),
+ 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
+ 280: syscalls.Undocumented("utimensat", Utimensat),
+ 281: syscalls.Undocumented("epoll_pwait", EpollPwait),
+ 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
+ 283: syscalls.Undocumented("timerfd_create", TimerfdCreate),
+ 284: syscalls.Undocumented("eventfd", Eventfd),
+ 285: syscalls.Undocumented("fallocate", Fallocate),
+ 286: syscalls.Undocumented("timerfd_settime", TimerfdSettime),
+ 287: syscalls.Undocumented("timerfd_gettime", TimerfdGettime),
+ 288: syscalls.Undocumented("accept4", Accept4),
+ 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
+ 290: syscalls.Undocumented("eventfd2", Eventfd2),
+ 291: syscalls.Undocumented("epoll_create1", EpollCreate1),
+ 292: syscalls.Undocumented("dup3", Dup3),
+ 293: syscalls.Undocumented("pipe2", Pipe2),
+ 294: syscalls.Undocumented("inotify_init1", InotifyInit1),
+ 295: syscalls.Undocumented("preadv", Preadv),
+ 296: syscalls.Undocumented("pwritev", Pwritev),
+ 297: syscalls.Undocumented("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+ 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil),
+ 299: syscalls.Undocumented("recvmmsg", RecvMMsg),
+ 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 302: syscalls.Undocumented("prlimit64", Prlimit64),
+ 303: syscalls.ErrorWithEvent("name_to_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil),
+ 304: syscalls.ErrorWithEvent("open_by_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil),
+ 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
+ 306: syscalls.Undocumented("syncfs", Syncfs),
+ 307: syscalls.Undocumented("sendmmsg", SendMMsg),
+ 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+ 309: syscalls.Undocumented("getcpu", Getcpu),
+ 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
+ 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
+ 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772)
+ 317: syscalls.Undocumented("seccomp", Seccomp),
+ 318: syscalls.Undocumented("getrandom", GetRandom),
+ 319: syscalls.Undocumented("memfd_create", MemfdCreate),
+ 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
+ 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
+ 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836)
+ 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+ 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897)
+ 325: syscalls.Undocumented("mlock2", Mlock2),
+
// Syscalls after 325 are "backports" from versions of Linux after 4.4.
- // 326: @Syscall(CopyFileRange),
- 327: Preadv2,
- 328: Pwritev2,
+ 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil),
+ 327: syscalls.Undocumented("preadv2", Preadv2),
+ 328: syscalls.Undocumented("pwritev2", Pwritev2),
},
Emulate: map[usermem.Addr]uintptr{
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
new file mode 100644
index 000000000..652b2c206
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -0,0 +1,312 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// We unconditionally report a single NUMA node. This also means that our
+// "nodemask_t" is a single unsigned long (uint64).
+const (
+ maxNodes = 1
+ allowedNodemask = (1 << maxNodes) - 1
+)
+
+func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) {
+ // "nodemask points to a bit mask of node IDs that contains up to maxnode
+ // bits. The bit mask size is rounded to the next multiple of
+ // sizeof(unsigned long), but the kernel will use bits only up to maxnode.
+ // A NULL value of nodemask or a maxnode value of zero specifies the empty
+ // set of nodes. If the value of maxnode is zero, the nodemask argument is
+ // ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate
+ // because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses
+ // maxnode-1, not maxnode, as the number of bits.
+ bits := maxnode - 1
+ if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
+ return 0, syserror.EINVAL
+ }
+ if bits == 0 {
+ return 0, nil
+ }
+ // Copy in the whole nodemask.
+ numUint64 := (bits + 63) / 64
+ buf := t.CopyScratchBuffer(int(numUint64) * 8)
+ if _, err := t.CopyInBytes(addr, buf); err != nil {
+ return 0, err
+ }
+ val := usermem.ByteOrder.Uint64(buf)
+ // Check that only allowed bits in the first unsigned long in the nodemask
+ // are set.
+ if val&^allowedNodemask != 0 {
+ return 0, syserror.EINVAL
+ }
+ // Check that all remaining bits in the nodemask are 0.
+ for i := 8; i < len(buf); i++ {
+ if buf[i] != 0 {
+ return 0, syserror.EINVAL
+ }
+ }
+ return val, nil
+}
+
+func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error {
+ // mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of
+ // bits.
+ bits := maxnode - 1
+ if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
+ return syserror.EINVAL
+ }
+ if bits == 0 {
+ return nil
+ }
+ // Copy out the first unsigned long in the nodemask.
+ buf := t.CopyScratchBuffer(8)
+ usermem.ByteOrder.PutUint64(buf, val)
+ if _, err := t.CopyOutBytes(addr, buf); err != nil {
+ return err
+ }
+ // Zero out remaining unsigned longs in the nodemask.
+ if bits > 64 {
+ remAddr, ok := addr.AddLength(8)
+ if !ok {
+ return syserror.EFAULT
+ }
+ remUint64 := (bits - 1) / 64
+ if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// GetMempolicy implements the syscall get_mempolicy(2).
+func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ mode := args[0].Pointer()
+ nodemask := args[1].Pointer()
+ maxnode := args[2].Uint()
+ addr := args[3].Pointer()
+ flags := args[4].Uint()
+
+ if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ nodeFlag := flags&linux.MPOL_F_NODE != 0
+ addrFlag := flags&linux.MPOL_F_ADDR != 0
+ memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
+
+ // "EINVAL: The value specified by maxnode is less than the number of node
+ // IDs supported by the system." - get_mempolicy(2)
+ if nodemask != 0 && maxnode < maxNodes {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is
+ // ignored and the set of nodes (memories) that the thread is allowed to
+ // specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the
+ // absence of any mode flags) is returned in nodemask."
+ if memsAllowed {
+ // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either
+ // MPOL_F_ADDR or MPOL_F_NODE."
+ if nodeFlag || addrFlag {
+ return 0, nil, syserror.EINVAL
+ }
+ if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, nil
+ }
+
+ // "If flags specifies MPOL_F_ADDR, then information is returned about the
+ // policy governing the memory address given in addr. ... If the mode
+ // argument is not NULL, then get_mempolicy() will store the policy mode
+ // and any optional mode flags of the requested NUMA policy in the location
+ // pointed to by this argument. If nodemask is not NULL, then the nodemask
+ // associated with the policy will be stored in the location pointed to by
+ // this argument."
+ if addrFlag {
+ policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if nodeFlag {
+ // "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR,
+ // get_mempolicy() will return the node ID of the node on which the
+ // address addr is allocated into the location pointed to by mode.
+ // If no page has yet been allocated for the specified address,
+ // get_mempolicy() will allocate a page as if the thread had
+ // performed a read (load) access to that address, and return the
+ // ID of the node where that page was allocated."
+ buf := t.CopyScratchBuffer(1)
+ _, err := t.CopyInBytes(addr, buf)
+ if err != nil {
+ return 0, nil, err
+ }
+ policy = 0 // maxNodes == 1
+ }
+ if mode != 0 {
+ if _, err := t.CopyOut(mode, policy); err != nil {
+ return 0, nil, err
+ }
+ }
+ if nodemask != 0 {
+ if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+ }
+
+ // "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did
+ // not specify MPOL_F_ADDR and addr is not NULL." This is partially
+ // inaccurate: if flags specifies MPOL_F_ADDR,
+ // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will
+ // just (usually) fail to find a VMA at address 0 and return EFAULT.
+ if addr != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If flags is specified as 0, then information about the calling thread's
+ // default policy (as set by set_mempolicy(2)) is returned, in the buffers
+ // pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but
+ // not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE,
+ // then get_mempolicy() will return in the location pointed to by a
+ // non-NULL mode argument, the node ID of the next node that will be used
+ // for interleaving of internal kernel pages allocated on behalf of the
+ // thread."
+ policy, nodemaskVal := t.NumaPolicy()
+ if nodeFlag {
+ if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
+ return 0, nil, syserror.EINVAL
+ }
+ policy = 0 // maxNodes == 1
+ }
+ if mode != 0 {
+ if _, err := t.CopyOut(mode, policy); err != nil {
+ return 0, nil, err
+ }
+ }
+ if nodemask != 0 {
+ if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// SetMempolicy implements the syscall set_mempolicy(2).
+func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ modeWithFlags := args[0].Int()
+ nodemask := args[1].Pointer()
+ maxnode := args[2].Uint()
+
+ modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ t.SetNumaPolicy(modeWithFlags, nodemaskVal)
+ return 0, nil, nil
+}
+
+// Mbind implements the syscall mbind(2).
+func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].Uint64()
+ mode := args[2].Int()
+ nodemask := args[3].Pointer()
+ maxnode := args[4].Uint()
+ flags := args[5].Uint()
+
+ if flags&^linux.MPOL_MF_VALID != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be
+ // privileged (CAP_SYS_NICE) to use this flag." - mbind(2)
+ if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) {
+ return 0, nil, syserror.EPERM
+ }
+
+ mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Since we claim to have only a single node, all flags can be ignored
+ // (since all pages must already be on that single node).
+ err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal)
+ return 0, nil, err
+}
+
+func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags int32, nodemask usermem.Addr, maxnode uint32) (int32, uint64, error) {
+ flags := modeWithFlags & linux.MPOL_MODE_FLAGS
+ mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
+ if flags == linux.MPOL_MODE_FLAGS {
+ // Can't specify both mode flags simultaneously.
+ return 0, 0, syserror.EINVAL
+ }
+ if mode < 0 || mode >= linux.MPOL_MAX {
+ // Must specify a valid mode.
+ return 0, 0, syserror.EINVAL
+ }
+
+ var nodemaskVal uint64
+ if nodemask != 0 {
+ var err error
+ nodemaskVal, err = copyInNodemask(t, nodemask, maxnode)
+ if err != nil {
+ return 0, 0, err
+ }
+ }
+
+ switch mode {
+ case linux.MPOL_DEFAULT:
+ // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate;
+ // Linux allows a nodemask to be specified, as long as it is empty.
+ if nodemaskVal != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_BIND, linux.MPOL_INTERLEAVE:
+ // These require a non-empty nodemask.
+ if nodemaskVal == 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_PREFERRED:
+ // This permits an empty nodemask, as long as no flags are set.
+ if nodemaskVal == 0 && flags != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_LOCAL:
+ // This requires an empty nodemask and no flags set ...
+ if nodemaskVal != 0 || flags != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ // ... and is implemented as MPOL_PREFERRED.
+ mode = linux.MPOL_PREFERRED
+ default:
+ // Unknown mode, which we should have rejected above.
+ panic(fmt.Sprintf("unknown mode: %v", mode))
+ }
+
+ return mode | flags, nodemaskVal, nil
+}
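The nodemask handling above hinges on a non-obvious Linux quirk: mm/mempolicy.c:get_nodes() treats maxnode-1, not maxnode, as the number of mask bits, rounded up to whole unsigned longs. A minimal standalone sketch of that sizing arithmetic follows; the helper name and page-size constant are assumptions for illustration, not part of this change.

// Illustrative sketch: mirrors how copyInNodemask sizes the user-supplied
// nodemask. maxnode == 0 is rejected via unsigned wraparound, exactly as in
// the code above.
package main

import "fmt"

func nodemaskBytes(maxnode uint32) (int, error) {
	const pageSize = 4096
	bits := maxnode - 1 // maxnode == 0 wraps around and is rejected below.
	if bits > pageSize*8 {
		return 0, fmt.Errorf("maxnode %d out of range", maxnode)
	}
	if bits == 0 {
		return 0, nil // empty nodemask
	}
	numUint64 := (bits + 63) / 64 // round up to whole 8-byte words
	return int(numUint64) * 8, nil
}

func main() {
	for _, m := range []uint32{1, 2, 64, 65, 128, 129} {
		n, _ := nodemaskBytes(m)
		fmt.Printf("maxnode=%d -> copy in %d bytes\n", m, n)
	}
}

For example, maxnode=65 still copies only 8 bytes, because only 64 bits are considered; this matches the checks in copyInNodemask and copyOutNodemask above.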
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 64a6e639c..9926f0ac5 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -204,151 +204,6 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
}
-func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) {
- if ptr != 0 {
- return t.CopyOut(ptr, val)
- }
- return 0, nil
-}
-
-// GetMempolicy implements the syscall get_mempolicy(2).
-func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- mode := args[0].Pointer()
- nodemask := args[1].Pointer()
- maxnode := args[2].Uint()
- addr := args[3].Pointer()
- flags := args[4].Uint()
-
- memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
- nodeFlag := flags&linux.MPOL_F_NODE != 0
- addrFlag := flags&linux.MPOL_F_ADDR != 0
-
- // TODO(rahat): Once sysfs is implemented, report a single numa node in
- // /sys/devices/system/node.
- if nodemask != 0 && maxnode < 1 {
- return 0, nil, syserror.EINVAL
- }
-
- // 'addr' provided iff 'addrFlag' set.
- if addrFlag == (addr == 0) {
- return 0, nil, syserror.EINVAL
- }
-
- // Default policy for the thread.
- if flags == 0 {
- policy, nodemaskVal := t.NumaPolicy()
- if _, err := copyOutIfNotNull(t, mode, policy); err != nil {
- return 0, nil, syserror.EFAULT
- }
- if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil {
- return 0, nil, syserror.EFAULT
- }
- return 0, nil, nil
- }
-
- // Report all nodes available to caller.
- if memsAllowed {
- // MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED.
- if nodeFlag || addrFlag {
- return 0, nil, syserror.EINVAL
- }
-
- // Report a single numa node.
- if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil {
- return 0, nil, syserror.EFAULT
- }
- return 0, nil, nil
- }
-
- if addrFlag {
- if nodeFlag {
- // Return the id for the node where 'addr' resides, via 'mode'.
- //
- // The real get_mempolicy(2) allocates the page referenced by 'addr'
- // by simulating a read, if it is unallocated before the call. It
- // then returns the node the page is allocated on through the mode
- // pointer.
- b := t.CopyScratchBuffer(1)
- _, err := t.CopyInBytes(addr, b)
- if err != nil {
- return 0, nil, syserror.EFAULT
- }
- if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil {
- return 0, nil, syserror.EFAULT
- }
- } else {
- storedPolicy, _ := t.NumaPolicy()
- // Return the policy governing the memory referenced by 'addr'.
- if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil {
- return 0, nil, syserror.EFAULT
- }
- }
- return 0, nil, nil
- }
-
- storedPolicy, _ := t.NumaPolicy()
- if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) {
- // Policy for current thread is to interleave memory between
- // nodes. Return the next node we'll allocate on. Since we only have a
- // single node, this is always node 0.
- if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil {
- return 0, nil, syserror.EFAULT
- }
- return 0, nil, nil
- }
-
- return 0, nil, syserror.EINVAL
-}
-
-func allowedNodesMask() uint32 {
- const maxNodes = 1
- return ^uint32((1 << maxNodes) - 1)
-}
-
-// SetMempolicy implements the syscall set_mempolicy(2).
-func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- modeWithFlags := args[0].Int()
- nodemask := args[1].Pointer()
- maxnode := args[2].Uint()
-
- if nodemask != 0 && maxnode < 1 {
- return 0, nil, syserror.EINVAL
- }
-
- if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS {
- // Can't specify multiple modes simultaneously.
- return 0, nil, syserror.EINVAL
- }
-
- mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
- if mode < 0 || mode >= linux.MPOL_MAX {
- // Must specify a valid mode.
- return 0, nil, syserror.EINVAL
- }
-
- var nodemaskVal uint32
- // Nodemask may be empty for some policy modes.
- if nodemask != 0 && maxnode > 0 {
- if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil {
- return 0, nil, syserror.EFAULT
- }
- }
-
- if (mode == linux.MPOL_INTERLEAVE || mode == linux.MPOL_BIND) && nodemaskVal == 0 {
- // Mode requires a non-empty nodemask, but got an empty nodemask.
- return 0, nil, syserror.EINVAL
- }
-
- if nodemaskVal&allowedNodesMask() != 0 {
- // Invalid node specified.
- return 0, nil, syserror.EINVAL
- }
-
- t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal)
-
- return 0, nil, nil
-}
-
// Mincore implements the syscall mincore(2).
func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 117ae1a0e..1b7e5616b 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -15,6 +15,7 @@
package linux
import (
+ "fmt"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
@@ -23,6 +24,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
)
// Prctl implements linux syscall prctl(2).
@@ -44,6 +46,33 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
_, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal()))
return 0, nil, err
+ case linux.PR_GET_DUMPABLE:
+ d := t.MemoryManager().Dumpability()
+ switch d {
+ case mm.NotDumpable:
+ return linux.SUID_DUMP_DISABLE, nil, nil
+ case mm.UserDumpable:
+ return linux.SUID_DUMP_USER, nil, nil
+ case mm.RootDumpable:
+ return linux.SUID_DUMP_ROOT, nil, nil
+ default:
+ panic(fmt.Sprintf("Unknown dumpability %v", d))
+ }
+
+ case linux.PR_SET_DUMPABLE:
+ var d mm.Dumpability
+ switch args[1].Int() {
+ case linux.SUID_DUMP_DISABLE:
+ d = mm.NotDumpable
+ case linux.SUID_DUMP_USER:
+ d = mm.UserDumpable
+ default:
+ // N.B. Userspace may not pass SUID_DUMP_ROOT.
+ return 0, nil, syscall.EINVAL
+ }
+ t.MemoryManager().SetDumpability(d)
+ return 0, nil, nil
+
case linux.PR_GET_KEEPCAPS:
if t.Credentials().KeepCaps {
return 1, nil, nil
@@ -171,9 +200,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
return 0, nil, t.DropBoundingCapability(cp)
- case linux.PR_GET_DUMPABLE,
- linux.PR_SET_DUMPABLE,
- linux.PR_GET_TIMING,
+ case linux.PR_GET_TIMING,
linux.PR_SET_TIMING,
linux.PR_GET_TSC,
linux.PR_SET_TSC,
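The new PR_GET_DUMPABLE / PR_SET_DUMPABLE handling can be exercised from a guest program. Below is a hedged usage sketch using golang.org/x/sys/unix; it is guest-side test code, not part of this change, and the literal 1 stands for SUID_DUMP_USER.

// Hedged usage sketch: set and read back process dumpability via prctl(2).
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// 1 == SUID_DUMP_USER; 0 (SUID_DUMP_DISABLE) is also accepted.
	// SUID_DUMP_ROOT cannot be set from userspace and yields EINVAL.
	if err := unix.Prctl(unix.PR_SET_DUMPABLE, 1, 0, 0, 0); err != nil {
		fmt.Println("PR_SET_DUMPABLE:", err)
		return
	}
	// PR_GET_DUMPABLE reports the value in the syscall return, so use a raw
	// syscall rather than unix.Prctl (which only returns an error).
	d, _, errno := unix.Syscall(unix.SYS_PRCTL, unix.PR_GET_DUMPABLE, 0, 0)
	if errno != 0 {
		fmt.Println("PR_GET_DUMPABLE:", errno)
		return
	}
	fmt.Println("dumpable =", d) // expect 1
}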
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 8f4dbf3bc..31295a6a9 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -188,7 +188,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
// Create the new socket.
- s, e := socket.New(t, domain, transport.SockType(stype&0xf), protocol)
+ s, e := socket.New(t, domain, linux.SockType(stype&0xf), protocol)
if e != nil {
return 0, nil, e.ToError()
}
@@ -227,7 +227,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Create the socket pair.
- s1, s2, e := socket.Pair(t, domain, transport.SockType(stype&0xf), protocol)
+ s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol)
if e != nil {
return 0, nil, e.ToError()
}
diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go
index 5d10b3824..48c114232 100644
--- a/pkg/sentry/syscalls/syscalls.go
+++ b/pkg/sentry/syscalls/syscalls.go
@@ -25,37 +25,97 @@
package syscalls
import (
+ "fmt"
+ "syscall"
+
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
+// Supported returns a syscall that is fully supported.
+func Supported(name string, fn kernel.SyscallFn) kernel.Syscall {
+ return kernel.Syscall{
+ Name: name,
+ Fn: fn,
+ SupportLevel: kernel.SupportFull,
+ Note: "Full Support",
+ }
+}
+
+// Undocumented returns a syscall that is implemented but whose support level
+// has not yet been documented.
+func Undocumented(name string, fn kernel.SyscallFn) kernel.Syscall {
+ return kernel.Syscall{
+ Name: name,
+ Fn: fn,
+ SupportLevel: kernel.SupportUndocumented,
+ }
+}
+
+// PartiallySupported returns a syscall that has a partial implementation.
+func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall {
+ return kernel.Syscall{
+ Name: name,
+ Fn: fn,
+ SupportLevel: kernel.SupportPartial,
+ Note: note,
+ URLs: urls,
+ }
+}
+
// Error returns a syscall handler that will always give the passed error.
-func Error(err error) kernel.SyscallFn {
- return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- return 0, nil, err
+func Error(name string, err syscall.Errno, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, err
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q", note, err.Error()),
+ URLs: urls,
}
}
// ErrorWithEvent gives a syscall function that sends an unimplemented
// syscall event via the event channel and returns the passed error.
-func ErrorWithEvent(err error) kernel.SyscallFn {
- return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- t.Kernel().EmitUnimplementedEvent(t)
- return 0, nil, err
+func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, err
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q", note, err.Error()),
+ URLs: urls,
}
}
// CapError gives a syscall function that checks for capability c. If the task
// has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged
// tasks, it will seem like there is an implementation.
-func CapError(c linux.Capability) kernel.SyscallFn {
- return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- if !t.HasCapability(c) {
- return 0, nil, syserror.EPERM
- }
- t.Kernel().EmitUnimplementedEvent(t)
- return 0, nil, syserror.ENOSYS
+func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ if !t.HasCapability(c) {
+ return 0, nil, syserror.EPERM
+ }
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise", note, syserror.EPERM, c.String(), syserror.ENOSYS),
+ URLs: urls,
}
}
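Because every entry now carries Name, SupportLevel, Note and URLs, a syscall table can be enumerated to produce a compatibility report. The sketch below is illustrative only: it assumes the table is the map[uintptr]kernel.Syscall shown in linux64.go, and the package name and output format are arbitrary.

// Hedged sketch: enumerate per-syscall support metadata attached by the
// constructors above.
package compat

import (
	"fmt"
	"sort"

	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
)

// dumpSupport prints one line per table entry, ordered by syscall number.
func dumpSupport(table map[uintptr]kernel.Syscall) {
	nums := make([]uintptr, 0, len(table))
	for num := range table {
		nums = append(nums, num)
	}
	sort.Slice(nums, func(i, j int) bool { return nums[i] < nums[j] })
	for _, num := range nums {
		sc := table[num]
		fmt.Printf("%3d %-20s level=%v note=%q urls=%v\n",
			num, sc.Name, sc.SupportLevel, sc.Note, sc.URLs)
	}
}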
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index b2f8f6832..b50579a92 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -32,7 +32,7 @@ go_library(
"tsc_arm64.s",
],
importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/time",
- visibility = ["//pkg/sentry:internal"],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/log",
"//pkg/metric",
diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD
index 09198496b..860733061 100644
--- a/pkg/sentry/usage/BUILD
+++ b/pkg/sentry/usage/BUILD
@@ -17,6 +17,6 @@ go_library(
],
deps = [
"//pkg/bits",
- "//pkg/sentry/memutil",
+ "//pkg/memutil",
],
)
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index c316f1597..9ed974ccb 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -22,7 +22,7 @@ import (
"syscall"
"gvisor.googlesource.com/gvisor/pkg/bits"
- "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
+ "gvisor.googlesource.com/gvisor/pkg/memutil"
)
// MemoryKind represents a type of memory used by the application.
diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go
index 31e4d6ada..9dde327a2 100644
--- a/pkg/sentry/usermem/usermem.go
+++ b/pkg/sentry/usermem/usermem.go
@@ -222,9 +222,11 @@ func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts
return int(r.Addr - addr), nil
}
-// copyStringIncrement is the maximum number of bytes that are copied from
-// virtual memory at a time by CopyStringIn.
-const copyStringIncrement = 64
+// CopyStringIn tuning parameters, defined outside that function for tests.
+const (
+ copyStringIncrement = 64
+ copyStringMaxInitBufLen = 256
+)
// CopyStringIn copies a NUL-terminated string of unknown length from the
// memory mapped at addr in uio and returns it as a string (not including the
@@ -234,31 +236,38 @@ const copyStringIncrement = 64
//
// Preconditions: As for IO.CopyFromUser. maxlen >= 0.
func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) {
- buf := make([]byte, maxlen)
+ initLen := maxlen
+ if initLen > copyStringMaxInitBufLen {
+ initLen = copyStringMaxInitBufLen
+ }
+ buf := make([]byte, initLen)
var done int
for done < maxlen {
- start, ok := addr.AddLength(uint64(done))
- if !ok {
- // Last page of kernel memory. The application can't use this
- // anyway.
- return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
- }
// Read up to copyStringIncrement bytes at a time.
readlen := copyStringIncrement
if readlen > maxlen-done {
readlen = maxlen - done
}
- end, ok := start.AddLength(uint64(readlen))
+ end, ok := addr.AddLength(uint64(readlen))
if !ok {
return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
}
// Shorten the read to avoid crossing page boundaries, since faulting
// in a page unnecessarily is expensive. This also ensures that partial
// copies up to the end of application-mappable memory succeed.
- if start.RoundDown() != end.RoundDown() {
+ if addr.RoundDown() != end.RoundDown() {
end = end.RoundDown()
+ readlen = int(end - addr)
+ }
+ // Ensure that our buffer is large enough to accommodate the read.
+ if done+readlen > len(buf) {
+ newBufLen := len(buf) * 2
+ if newBufLen > maxlen {
+ newBufLen = maxlen
+ }
+ buf = append(buf, make([]byte, newBufLen-len(buf))...)
}
- n, err := uio.CopyIn(ctx, start, buf[done:done+int(end-start)], opts)
+ n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts)
// Look for the terminating zero byte, which may have occurred before
// hitting err.
for i, c := range buf[done : done+n] {
@@ -270,6 +279,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
if err != nil {
return stringFromImmutableBytes(buf[:done]), err
}
+ addr = end
}
return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG
}
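CopyStringIn now starts with a small buffer (at most copyStringMaxInitBufLen bytes) and doubles it, capped at maxlen, only when a read would overflow it, instead of allocating maxlen up front. The following standalone sketch applies the same growth policy to an in-memory string; it is a hypothetical helper, whereas the real code reads user memory via IO.CopyIn and additionally handles page boundaries and EFAULT.

// Hedged sketch of the buffer-growth policy used by CopyStringIn.
package main

import (
	"fmt"
	"strings"
)

const (
	increment = 64  // bytes read per step (cf. copyStringIncrement)
	maxInit   = 256 // initial buffer cap (cf. copyStringMaxInitBufLen)
)

// readCString copies bytes from src until a NUL byte, maxlen, or the end of
// src, growing its buffer geometrically. It returns the string read and
// whether a terminating NUL was found.
func readCString(src string, maxlen int) (string, bool) {
	initLen := maxlen
	if initLen > maxInit {
		initLen = maxInit
	}
	buf := make([]byte, initLen)
	done := 0
	for done < maxlen {
		readlen := increment
		if readlen > maxlen-done {
			readlen = maxlen - done
		}
		// Grow the buffer only when this read would overflow it.
		if done+readlen > len(buf) {
			newLen := len(buf) * 2
			if newLen > maxlen {
				newLen = maxlen
			}
			buf = append(buf, make([]byte, newLen-len(buf))...)
		}
		n := copy(buf[done:done+readlen], src[done:])
		for i, c := range buf[done : done+n] {
			if c == 0 {
				return string(buf[:done+i]), true // found NUL
			}
		}
		done += n
		if n < readlen {
			return string(buf[:done]), false // source exhausted, no NUL
		}
	}
	return string(buf[:done]), false // maxlen exceeded (ENAMETOOLONG analogue)
}

func main() {
	s := strings.Repeat("A", 300) + "\x00garbage"
	got, ok := readCString(s, 1024)
	fmt.Println(len(got), ok) // 300 true
}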
diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go
index 4a07118b7..575e5039d 100644
--- a/pkg/sentry/usermem/usermem_test.go
+++ b/pkg/sentry/usermem/usermem_test.go
@@ -192,6 +192,7 @@ func TestCopyObject(t *testing.T) {
}
func TestCopyStringInShort(t *testing.T) {
+ // Tests for string length <= copyStringIncrement.
want := strings.Repeat("A", copyStringIncrement-2)
mem := want + "\x00"
if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil {
@@ -200,13 +201,25 @@ func TestCopyStringInShort(t *testing.T) {
}
func TestCopyStringInLong(t *testing.T) {
- want := strings.Repeat("A", copyStringIncrement+1)
+ // Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen
+ // (requiring multiple calls to IO.CopyIn()).
+ want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4)
mem := want + "\x00"
if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil {
t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
}
}
+func TestCopyStringInVeryLong(t *testing.T) {
+ // Tests for string length > copyStringMaxInitBufLen (requiring buffer
+ // reallocation).
+ want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4)
+ mem := want + "\x00"
+ if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil {
+ t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
+ }
+}
+
func TestCopyStringInNoTerminatingZeroByte(t *testing.T) {
want := strings.Repeat("A", copyStringIncrement-1)
got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{})
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 1f889c2a0..b88e2e7bf 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -21,12 +21,29 @@
// FD based endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
+//
+// FD based endpoints can use more than one file descriptor to read incoming
+// packets. If more than one FD is specified and the underlying FDs are
+// AF_PACKET sockets, the endpoint enables PACKET_FANOUT mode on them so that
+// the host kernel consistently hashes packets to the sockets. This ensures
+// that packets for the same TCP stream are not reordered.
+//
+// Similarly, if more than one FD is specified and the underlying FDs are not
+// AF_PACKET sockets, it is the caller's responsibility to ensure that all
+// inbound packets on the descriptors are consistently 5-tuple hashed to one
+// of the descriptors to prevent TCP reordering.
+//
+// Since netstack today does not compute 5-tuple hashes for outgoing packets,
+// we only use the first FD to write outbound packets. Once 5-tuple hashes for
+// all outbound packets are available, we will make use of all underlying FDs
+// to write outbound packets.
package fdbased
import (
"fmt"
"syscall"
+ "golang.org/x/sys/unix"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
@@ -65,8 +82,10 @@ const (
)
type endpoint struct {
- // fd is the file descriptor used to send and receive packets.
- fd int
+ // fds is the set of file descriptors each identifying one inbound/outbound
+ // channel. The endpoint will dispatch from all inbound channels as well as
+ // hash outbound packets to specific channels based on the packet hash.
+ fds []int
// mtu (maximum transmission unit) is the maximum size of a packet.
mtu uint32
@@ -85,8 +104,8 @@ type endpoint struct {
// its end of the communication pipe.
closed func(*tcpip.Error)
- inboundDispatcher linkDispatcher
- dispatcher stack.NetworkDispatcher
+ inboundDispatchers []linkDispatcher
+ dispatcher stack.NetworkDispatcher
// packetDispatchMode controls the packet dispatcher used by this
// endpoint.
@@ -99,17 +118,47 @@ type endpoint struct {
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
- FD int
- MTU uint32
- EthernetHeader bool
- ClosedFunc func(*tcpip.Error)
- Address tcpip.LinkAddress
- SaveRestore bool
- DisconnectOk bool
- GSOMaxSize uint32
+ // FDs is a set of FDs used to read/write packets.
+ FDs []int
+
+ // MTU is the mtu to use for this endpoint.
+ MTU uint32
+
+ // EthernetHeader if true, indicates that the endpoint should read/write
+ // ethernet frames instead of IP packets.
+ EthernetHeader bool
+
+ // ClosedFunc is a function to be called when an endpoint's peer (if
+ // any) closes its end of the communication pipe.
+ ClosedFunc func(*tcpip.Error)
+
+ // Address is the link address for this endpoint. Only used if
+ // EthernetHeader is true.
+ Address tcpip.LinkAddress
+
+ // SaveRestore if true, indicates that this NIC capability set should
+ // include CapabilitySaveRestore.
+ SaveRestore bool
+
+ // DisconnectOk if true, indicates that this NIC capability set should
+ // include CapabilityDisconnectOk.
+ DisconnectOk bool
+
+ // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
+ // disabled.
+ GSOMaxSize uint32
+
+ // PacketDispatchMode specifies the type of inbound dispatcher to be
+ // used for this endpoint.
PacketDispatchMode PacketDispatchMode
- TXChecksumOffload bool
- RXChecksumOffload bool
+
+ // TXChecksumOffload if true, indicates that this endpoint's capability
+ // set should include CapabilityTXChecksumOffload.
+ TXChecksumOffload bool
+
+ // RXChecksumOffload if true, indicates that this endpoint's capability
+ // set should include CapabilityRXChecksumOffload.
+ RXChecksumOffload bool
}
// New creates a new fd-based endpoint.
@@ -117,10 +166,6 @@ type Options struct {
// Makes fd non-blocking, but does not take ownership of fd, which must remain
// open for the lifetime of the returned endpoint.
func New(opts *Options) (tcpip.LinkEndpointID, error) {
- if err := syscall.SetNonblock(opts.FD, true); err != nil {
- return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err)
- }
-
caps := stack.LinkEndpointCapabilities(0)
if opts.RXChecksumOffload {
caps |= stack.CapabilityRXChecksumOffload
@@ -144,8 +189,12 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) {
caps |= stack.CapabilityDisconnectOk
}
+ if len(opts.FDs) == 0 {
+ return 0, fmt.Errorf("opts.FDs is empty, at least one FD must be specified")
+ }
+
e := &endpoint{
- fd: opts.FD,
+ fds: opts.FDs,
mtu: opts.MTU,
caps: caps,
closed: opts.ClosedFunc,
@@ -154,46 +203,71 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) {
packetDispatchMode: opts.PacketDispatchMode,
}
- isSocket, err := isSocketFD(e.fd)
- if err != nil {
- return 0, err
- }
- if isSocket {
- if opts.GSOMaxSize != 0 {
- e.caps |= stack.CapabilityGSO
- e.gsoMaxSize = opts.GSOMaxSize
+ // Create per channel dispatchers.
+ for i := 0; i < len(e.fds); i++ {
+ fd := e.fds[i]
+ if err := syscall.SetNonblock(fd, true); err != nil {
+ return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err)
}
- }
- e.inboundDispatcher, err = createInboundDispatcher(e, isSocket)
- if err != nil {
- return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+
+ isSocket, err := isSocketFD(fd)
+ if err != nil {
+ return 0, err
+ }
+ if isSocket {
+ if opts.GSOMaxSize != 0 {
+ e.caps |= stack.CapabilityGSO
+ e.gsoMaxSize = opts.GSOMaxSize
+ }
+ }
+ inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket)
+ if err != nil {
+ return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+ }
+ e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
}
return stack.RegisterLinkEndpoint(e), nil
}
-func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) {
+func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) {
// By default use the readv() dispatcher as it works with all kinds of
// FDs (tap/tun/unix domain sockets and af_packet).
- inboundDispatcher, err := newReadVDispatcher(e.fd, e)
+ inboundDispatcher, err := newReadVDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
}
if isSocket {
+ sa, err := unix.Getsockname(fd)
+ if err != nil {
+ return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
+ }
+ switch sa.(type) {
+ case *unix.SockaddrLinklayer:
+ // Enable PACKET_FANOUT mode if the underlying socket is
+ // of type AF_PACKET.
+ const fanoutID = 1
+ const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG
+ fanoutArg := fanoutID | fanoutType<<16
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
+ return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
+ }
+ }
+
switch e.packetDispatchMode {
case PacketMMap:
- inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e)
+ inboundDispatcher, err = newPacketMMapDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
}
case RecvMMsg:
// If the provided FD is a socket then we optimize
// packet reads by using recvmmsg() instead of read() to
// read packets in a batch.
- inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e)
+ inboundDispatcher, err = newRecvMMsgDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
}
}
}
@@ -215,7 +289,9 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
// Link endpoints are not savable. When transportation endpoints are
// saved, they stop sending outgoing packets and all incoming packets
// are rejected.
- go e.dispatchLoop() // S/R-SAFE: See above.
+ for i := range e.inboundDispatchers {
+ go e.dispatchLoop(e.inboundDispatchers[i]) // S/R-SAFE: See above.
+ }
}
// IsAttached implements stack.LinkEndpoint.IsAttached.
@@ -305,26 +381,26 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
}
}
- return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView())
+ return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, hdr.View(), payload.ToView())
}
if payload.Size() == 0 {
- return rawfile.NonBlockingWrite(e.fd, hdr.View())
+ return rawfile.NonBlockingWrite(e.fds[0], hdr.View())
}
- return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil)
+ return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil)
}
// WriteRawPacket writes a raw packet directly to the file descriptor.
func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
- return rawfile.NonBlockingWrite(e.fd, packet)
+ return rawfile.NonBlockingWrite(e.fds[0], packet)
}
// dispatchLoop reads packets from the file descriptor in a loop and dispatches
// them to the network stack.
-func (e *endpoint) dispatchLoop() *tcpip.Error {
+func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error {
for {
- cont, err := e.inboundDispatcher.dispatch()
+ cont, err := inboundDispatcher.dispatch()
if err != nil || !cont {
if e.closed != nil {
e.closed(err)
@@ -363,7 +439,7 @@ func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabiliti
syscall.SetNonblock(fd, true)
e := &InjectableEndpoint{endpoint: endpoint{
- fd: fd,
+ fds: []int{fd},
mtu: mtu,
caps: capabilities,
}}
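For readers following the per-FD loop above: several AF_PACKET sockets bound to the same interface can share a kernel fanout group, so the kernel spreads flows across the FDs that each get their own inbound dispatcher here. Below is a sketch of how a caller might prepare such FDs for Options.FDs. It is illustrative only: it assumes golang.org/x/sys/unix, the htons helper and the interface index are made up for the example, error-path cleanup is omitted, and the PACKET_FANOUT group itself is joined by createInboundDispatcher above, so the caller does not set it.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// htons converts a 16-bit value to network byte order for the AF_PACKET
// protocol fields.
func htons(v uint16) uint16 { return v<<8 | v>>8 }

// openPacketSockets opens n AF_PACKET sockets bound to the interface with
// index ifIndex. The returned FDs are suitable for fdbased.Options.FDs; the
// endpoint then makes them non-blocking and joins them to fanout group 1.
func openPacketSockets(ifIndex, n int) ([]int, error) {
	fds := make([]int, 0, n)
	for i := 0; i < n; i++ {
		fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL)))
		if err != nil {
			return nil, fmt.Errorf("socket(AF_PACKET): %v", err)
		}
		sa := &unix.SockaddrLinklayer{
			Protocol: htons(unix.ETH_P_ALL),
			Ifindex:  ifIndex,
		}
		if err := unix.Bind(fd, sa); err != nil {
			return nil, fmt.Errorf("bind to ifindex %d: %v", ifIndex, err)
		}
		fds = append(fds, fd)
	}
	return fds, nil
}

func main() {
	// The interface index (2) is an arbitrary example value.
	fds, err := openPacketSockets(2, 4)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("FDs for fdbased.Options.FDs:", fds)
}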
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index fd1722074..ba3e09192 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -67,7 +67,7 @@ func newContext(t *testing.T, opt *Options) *context {
done <- struct{}{}
}
- opt.FD = fds[1]
+ opt.FDs = []int{fds[1]}
epID, err := New(opt)
if err != nil {
t.Fatalf("Failed to create FD endpoint: %v", err)
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index fccabd554..98581e50e 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -118,7 +118,7 @@ func NewWithFile(lower tcpip.LinkEndpointID, file *os.File, snapLen uint32) (tcp
// logs the packet before forwarding to the actual dispatcher.
func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
- logPacket("recv", protocol, vv.First())
+ logPacket("recv", protocol, vv.First(), nil)
}
if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
vs := vv.Views()
@@ -198,7 +198,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
// the request to the lower endpoint.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
- logPacket("send", protocol, hdr.View())
+ logPacket("send", protocol, hdr.View(), gso)
}
if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
hdrBuf := hdr.View()
@@ -240,7 +240,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
return e.lower.WritePacket(r, gso, hdr, payload, protocol)
}
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View) {
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
// Figure out the network layer info.
var transProto uint8
src := tcpip.Address("unknown")
@@ -404,5 +404,9 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
return
}
+ if gso != nil {
+ details += fmt.Sprintf(" gso: %+v", gso)
+ }
+
log.Infof("%s %s %v:%v -> %v:%v len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details)
}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index da07a39e5..44b1d5b9b 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -215,7 +215,9 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
views[0] = hdr.View()
views = append(views, payload.Views()...)
vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
- e.HandlePacket(r, vv)
+ loopedR := r.MakeLoopedRoute()
+ e.HandlePacket(&loopedR, vv)
+ loopedR.Release()
}
if loop&stack.PacketOut == 0 {
return nil
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 4b8cd496b..bcae98e1f 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -108,7 +108,9 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
views[0] = hdr.View()
views = append(views, payload.Views()...)
vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
- e.HandlePacket(r, vv)
+ loopedR := r.MakeLoopedRoute()
+ e.HandlePacket(&loopedR, vv)
+ loopedR.Release()
}
if loop&stack.PacketOut == 0 {
return nil
diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go
index 1681de56e..1fa899e7e 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/main.go
+++ b/pkg/tcpip/sample/tun_tcp_connect/main.go
@@ -137,7 +137,7 @@ func main() {
log.Fatal(err)
}
- linkID, err := fdbased.New(&fdbased.Options{FD: fd, MTU: mtu})
+ linkID, err := fdbased.New(&fdbased.Options{FDs: []int{fd}, MTU: mtu})
if err != nil {
log.Fatal(err)
}
diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go
index 642607f83..d47085581 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/main.go
+++ b/pkg/tcpip/sample/tun_tcp_echo/main.go
@@ -129,7 +129,7 @@ func main() {
}
linkID, err := fdbased.New(&fdbased.Options{
- FD: fd,
+ FDs: []int{fd},
MTU: mtu,
EthernetHeader: *tap,
Address: tcpip.LinkAddress(maddr),
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 3d4c282a9..55ed02479 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -187,3 +187,13 @@ func (r *Route) Clone() Route {
r.ref.incRef()
return *r
}
+
+// MakeLoopedRoute duplicates the given route and tweaks it in case of multicast.
+func (r *Route) MakeLoopedRoute() Route {
+ l := r.Clone()
+ if header.IsV4MulticastAddress(r.RemoteAddress) || header.IsV6MulticastAddress(r.RemoteAddress) {
+ l.RemoteAddress, l.LocalAddress = l.LocalAddress, l.RemoteAddress
+ l.RemoteLinkAddress = l.LocalLinkAddress
+ }
+ return l
+}
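Why the swap above matters to the ipv4 and ipv6 loopback paths earlier in this diff: the route used to send a multicast packet has LocalAddress set to our own address and RemoteAddress set to the group, while the looped copy must be handled as an inbound packet, i.e. sourced from us and destined to the group. A toy model of the swap, using strings in place of tcpip.Address; it illustrates the effect only and is not the stack.Route type.

package main

import "fmt"

// loopedAddrs models only the two fields MakeLoopedRoute touches.
type loopedAddrs struct {
	local, remote string
}

// makeLooped mirrors the swap above: for multicast destinations the looped
// copy must appear to arrive from our own address, addressed to the group.
func makeLooped(r loopedAddrs, multicast bool) loopedAddrs {
	l := r // clone
	if multicast {
		l.remote, l.local = l.local, l.remote
	}
	return l
}

func main() {
	out := loopedAddrs{local: "10.0.0.1", remote: "224.0.0.1"}
	in := makeLooped(out, true)
	// The looped copy is handled as if it was received from 10.0.0.1 and
	// addressed to the multicast group 224.0.0.1.
	fmt.Printf("local=%s remote=%s\n", in.local, in.remote)
}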
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 8d74f1543..e8a9392b5 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -188,6 +188,10 @@ func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, s
f.proto.controlCount++
}
+func (f *fakeTransportEndpoint) State() uint32 {
+ return 0
+}
+
type fakeTransportGoodOption bool
type fakeTransportBadOption bool
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index f9886c6e4..04c776205 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -377,6 +377,10 @@ type Endpoint interface {
// GetSockOpt gets a socket option. opt should be a pointer to one of the
// *Option types.
GetSockOpt(opt interface{}) *Error
+
+ // State returns a socket's lifecycle state. The returned value is
+ // protocol-specific and is primarily used for diagnostics.
+ State() uint32
}
// WriteOptions contains options for Endpoint.Write.
@@ -468,6 +472,14 @@ type KeepaliveIntervalOption time.Duration
// closed.
type KeepaliveCountOption int
+// CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
+// the current congestion control algorithm.
+type CongestionControlOption string
+
+// AvailableCongestionControlOption is used to query the supported congestion
+// control algorithms.
+type AvailableCongestionControlOption string
+
// MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
// TTL value for multicast messages. The default is 1.
type MulticastTTLOption uint8
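Now that both option types live in the tcpip package, callers can drive the congestion control algorithm through the generic Endpoint interface. A minimal sketch of endpoint-level usage, assuming ep is a TCP endpoint obtained elsewhere; the function name and the error wrapping are illustrative, not part of this change.

package tcpoptexample

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/tcpip"
)

// switchToCubic reads the endpoint's current congestion control algorithm
// and then switches it to cubic.
func switchToCubic(ep tcpip.Endpoint) error {
	var cur tcpip.CongestionControlOption
	if err := ep.GetSockOpt(&cur); err != nil {
		return fmt.Errorf("GetSockOpt(CongestionControlOption): %v", err)
	}
	fmt.Printf("current algorithm: %q\n", cur)

	// Unknown names fail with tcpip.ErrNoSuchFile, matching the ENOENT
	// Linux returns for TCP_CONGESTION.
	if err := ep.SetSockOpt(tcpip.CongestionControlOption("cubic")); err != nil {
		return fmt.Errorf("SetSockOpt(cubic): %v", err)
	}
	return nil
}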
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index 9aa6f3978..84a2b53b7 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -33,6 +33,7 @@ go_library(
"//pkg/tcpip/header",
"//pkg/tcpip/stack",
"//pkg/tcpip/transport/raw",
+ "//pkg/tcpip/transport/tcp",
"//pkg/waiter",
],
)
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index e2b90ef10..b8005093a 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -708,3 +708,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
}
+
+// State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
+// expose internal socket state.
+func (e *endpoint) State() uint32 {
+ return 0
+}
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 1daf5823f..e4ff50c91 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -519,3 +519,8 @@ func (ep *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv b
ep.waiterQueue.Notify(waiter.EventIn)
}
}
+
+// State implements socket.Socket.State.
+func (ep *endpoint) State() uint32 {
+ return 0
+}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index e31b03f7d..a9dbfb930 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -21,6 +21,7 @@ go_library(
"accept.go",
"connect.go",
"cubic.go",
+ "cubic_state.go",
"endpoint.go",
"endpoint_state.go",
"forwarder.go",
@@ -70,6 +71,7 @@ go_test(
srcs = [
"dual_stack_test.go",
"sack_scoreboard_test.go",
+ "tcp_noracedetector_test.go",
"tcp_sack_test.go",
"tcp_test.go",
"tcp_timestamp_test.go",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index d4b860975..d05259c0a 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -19,7 +19,6 @@ import (
"encoding/binary"
"hash"
"io"
- "log"
"sync"
"time"
@@ -227,7 +226,6 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
}
n.isRegistered = true
- n.state = stateConnecting
// Create sender and receiver.
//
@@ -253,14 +251,15 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
// Perform the 3-way handshake.
h := newHandshake(ep, l.rcvWnd)
- h.resetToSynRcvd(cookie, irs, opts, l.listenEP)
+ h.resetToSynRcvd(cookie, irs, opts)
if err := h.execute(); err != nil {
ep.stack.Stats().TCP.FailedConnectionAttempts.Increment()
ep.Close()
return nil, err
}
-
- ep.state = stateConnected
+ ep.mu.Lock()
+ ep.state = StateEstablished
+ ep.mu.Unlock()
// Update the receive window scaling. We can't do it before the
// handshake because it's possible that the peer doesn't support window
@@ -277,7 +276,7 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
e.mu.RLock()
state := e.state
e.mu.RUnlock()
- if state == stateListen {
+ if state == StateListen {
e.acceptedChan <- n
e.waiterQueue.Notify(waiter.EventIn)
} else {
@@ -295,7 +294,6 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
defer decSynRcvdCount()
defer e.decSynRcvdCount()
defer s.decRef()
-
n, err := ctx.createEndpointAndPerformHandshake(s, opts)
if err != nil {
e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
@@ -307,8 +305,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
func (e *endpoint) incSynRcvdCount() bool {
e.mu.Lock()
- log.Printf("l: %d, c: %d, e.synRcvdCount: %d", len(e.acceptedChan), cap(e.acceptedChan), e.synRcvdCount)
- if l, c := len(e.acceptedChan), cap(e.acceptedChan); l == c && e.synRcvdCount >= c {
+ if e.synRcvdCount >= cap(e.acceptedChan) {
e.mu.Unlock()
return false
}
@@ -323,6 +320,16 @@ func (e *endpoint) decSynRcvdCount() {
e.mu.Unlock()
}
+func (e *endpoint) acceptQueueIsFull() bool {
+ e.mu.Lock()
+ if l, c := len(e.acceptedChan)+e.synRcvdCount, cap(e.acceptedChan); l >= c {
+ e.mu.Unlock()
+ return true
+ }
+ e.mu.Unlock()
+ return false
+}
+
// handleListenSegment is called when a listening endpoint receives a segment
// and needs to handle it.
func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
@@ -330,20 +337,27 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
case header.TCPFlagSyn:
opts := parseSynSegmentOptions(s)
if incSynRcvdCount() {
- // Drop the SYN if the listen endpoint's accept queue is
- // overflowing.
- if e.incSynRcvdCount() {
- log.Printf("processing syn packet")
+ // Only handle the SYN if the following conditions hold:
+ // - accept queue is not full.
+ // - number of connections in synRcvd state is less than the
+ // backlog.
+ if !e.acceptQueueIsFull() && e.incSynRcvdCount() {
s.incRef()
go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier.
return
}
- log.Printf("dropping syn packet")
+ decSynRcvdCount()
e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
e.stack.Stats().DroppedPackets.Increment()
return
} else {
- // TODO(bhaskerh): Increment syncookie sent stat.
+ // If cookies are in use but the endpoint accept queue
+ // is full then drop the syn.
+ if e.acceptQueueIsFull() {
+ e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
+ e.stack.Stats().DroppedPackets.Increment()
+ return
+ }
cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS))
// Send SYN with window scaling because we currently
// dont't encode this information in the cookie.
@@ -361,7 +375,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
}
case header.TCPFlagAck:
- if len(e.acceptedChan) == cap(e.acceptedChan) {
+ if e.acceptQueueIsFull() {
// Silently drop the ack as the application can't accept
// the connection at this point. The ack will be
// retransmitted by the sender anyway and we can
@@ -411,7 +425,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
n.tsOffset = 0
// Switch state to connected.
- n.state = stateConnected
+ n.state = StateEstablished
// Do the delivery in a separate goroutine so
// that we don't block the listen loop in case
@@ -434,7 +448,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
// handleSynSegment() from attempting to queue new connections
// to the endpoint.
e.mu.Lock()
- e.state = stateClosed
+ e.state = StateClose
// Do cleanup if needed.
e.completeWorkerLocked()
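The combined effect of acceptQueueIsFull and the SYN-RCVD accounting above is a two-sided admission check on incoming SYNs. The model below restates that policy with stand-in fields so the conditions are easy to read; it is illustrative and is not the endpoint's real structure.

package tcp

// synAdmission models the listener-side checks introduced above. The fields
// stand in for len(acceptedChan), cap(acceptedChan) and synRcvdCount.
type synAdmission struct {
	accepted int  // completed connections waiting in the accept queue
	backlog  int  // capacity of the accept queue
	synRcvd  int  // handshakes currently in SYN-RCVD on this listener
	cookies  bool // true once the stack-wide SYN-RCVD budget forces cookies
}

// admit reports whether a new SYN should be processed; a false return
// corresponds to bumping ListenOverflowSynDrop and DroppedPackets above.
func (a *synAdmission) admit() bool {
	queueFull := a.accepted+a.synRcvd >= a.backlog
	if !a.cookies {
		// Without cookies we also need a free SYN-RCVD slot before
		// spawning a handshake goroutine.
		return !queueFull && a.synRcvd < a.backlog
	}
	// With cookies no per-connection handshake state is kept, but a full
	// accept queue still means the completed connection could not be
	// delivered, so the SYN is dropped anyway.
	return !queueFull
}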
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 2aed6f286..dd671f7ce 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -60,12 +60,11 @@ const (
// handshake holds the state used during a TCP 3-way handshake.
type handshake struct {
- ep *endpoint
- listenEP *endpoint // only non nil when doing passive connects.
- state handshakeState
- active bool
- flags uint8
- ackNum seqnum.Value
+ ep *endpoint
+ state handshakeState
+ active bool
+ flags uint8
+ ackNum seqnum.Value
// iss is the initial send sequence number, as defined in RFC 793.
iss seqnum.Value
@@ -142,7 +141,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 {
// resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
// state.
-func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, listenEP *endpoint) {
+func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
h.active = false
h.state = handshakeSynRcvd
h.flags = header.TCPFlagSyn | header.TCPFlagAck
@@ -150,7 +149,9 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
h.ackNum = irs + 1
h.mss = opts.MSS
h.sndWndScale = opts.WS
- h.listenEP = listenEP
+ h.ep.mu.Lock()
+ h.ep.state = StateSynRecv
+ h.ep.mu.Unlock()
}
// checkAck checks if the ACK number, if present, of a segment received during
@@ -219,6 +220,9 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
// but resend our own SYN and wait for it to be acknowledged in the
// SYN-RCVD state.
h.state = handshakeSynRcvd
+ h.ep.mu.Lock()
+ h.ep.state = StateSynRecv
+ h.ep.mu.Unlock()
synOpts := header.TCPSynOptions{
WS: h.rcvWndScale,
TS: rcvSynOpts.TS,
@@ -281,18 +285,6 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
// We have previously received (and acknowledged) the peer's SYN. If the
// peer acknowledges our SYN, the handshake is completed.
if s.flagIsSet(header.TCPFlagAck) {
- // listenContext is also used by a tcp.Forwarder and in that
- // context we do not have a listening endpoint to check the
- // backlog. So skip this check if listenEP is nil.
- if h.listenEP != nil && len(h.listenEP.acceptedChan) == cap(h.listenEP.acceptedChan) {
- // If there is no space in the accept queue to accept
- // this endpoint then silently drop this ACK. The peer
- // will anyway resend the ack and we can complete the
- // connection the next time it's retransmitted.
- h.ep.stack.Stats().TCP.ListenOverflowAckDrop.Increment()
- h.ep.stack.Stats().DroppedPackets.Increment()
- return nil
- }
// If the timestamp option is negotiated and the segment does
// not carry a timestamp option then the segment must be dropped
// as per https://tools.ietf.org/html/rfc7323#section-3.2.
@@ -663,7 +655,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
// sendRaw sends a TCP segment to the endpoint's peer.
func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
var sackBlocks []header.SACKBlock
- if e.state == stateConnected && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+ if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
}
options := e.makeOptions(sackBlocks)
@@ -714,8 +706,7 @@ func (e *endpoint) handleClose() *tcpip.Error {
// protocol goroutine.
func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, e.snd.sndUna, e.rcv.rcvNxt, 0)
-
- e.state = stateError
+ e.state = StateError
e.hardError = err
}
@@ -871,14 +862,19 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
// handshake, and then inform potential waiters about its
// completion.
h := newHandshake(e, seqnum.Size(e.receiveBufferAvailable()))
+ e.mu.Lock()
+ h.ep.state = StateSynSent
+ e.mu.Unlock()
+
if err := h.execute(); err != nil {
e.lastErrorMu.Lock()
e.lastError = err
e.lastErrorMu.Unlock()
e.mu.Lock()
- e.state = stateError
+ e.state = StateError
e.hardError = err
+
// Lock released below.
epilogue()
@@ -900,7 +896,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
// Tell waiters that the endpoint is connected and writable.
e.mu.Lock()
- e.state = stateConnected
+ e.state = StateEstablished
drained := e.drainDone != nil
e.mu.Unlock()
if drained {
@@ -1000,7 +996,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
return err
}
}
- if e.state != stateError {
+ if e.state != StateError {
close(e.drainDone)
<-e.undrain
}
@@ -1056,8 +1052,8 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
// Mark endpoint as closed.
e.mu.Lock()
- if e.state != stateError {
- e.state = stateClosed
+ if e.state != StateError {
+ e.state = StateClose
}
// Lock released below.
epilogue()
diff --git a/pkg/tcpip/transport/tcp/cubic.go b/pkg/tcpip/transport/tcp/cubic.go
index e618cd2b9..7b1f5e763 100644
--- a/pkg/tcpip/transport/tcp/cubic.go
+++ b/pkg/tcpip/transport/tcp/cubic.go
@@ -23,6 +23,7 @@ import (
// control algorithm state.
//
// See: https://tools.ietf.org/html/rfc8312.
+// +stateify savable
type cubicState struct {
// wLastMax is the previous wMax value.
wLastMax float64
@@ -33,7 +34,7 @@ type cubicState struct {
// t denotes the time when the current congestion avoidance
// was entered.
- t time.Time
+ t time.Time `state:".(unixTime)"`
// numCongestionEvents tracks the number of congestion events since last
// RTO.
diff --git a/pkg/tcpip/transport/tcp/cubic_state.go b/pkg/tcpip/transport/tcp/cubic_state.go
new file mode 100644
index 000000000..d0f58cfaf
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/cubic_state.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+ "time"
+)
+
+// saveT is invoked by stateify.
+func (c *cubicState) saveT() unixTime {
+ return unixTime{c.t.Unix(), c.t.UnixNano()}
+}
+
+// loadT is invoked by stateify.
+func (c *cubicState) loadT(unix unixTime) {
+ c.t = time.Unix(unix.second, unix.nano)
+}
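The saveT/loadT pair is the usual stateify escape hatch for fields whose type cannot be serialized directly: the state:".(unixTime)" tag on cubicState.t routes the field through these hooks. The sketch below applies the same idiom to a hypothetical struct; rttSample is not part of this change, and it assumes the package-local unixTime type (second and nano fields) that the hooks above already rely on.

package tcp

import "time"

// rttSample is a hypothetical savable struct with a time.Time field, shown
// only to illustrate the stateify idiom used by cubicState above.
//
// +stateify savable
type rttSample struct {
	rtt time.Duration

	// sampledAt cannot be serialized directly, so it is round-tripped
	// through the package's unixTime helper, exactly like cubicState.t.
	sampledAt time.Time `state:".(unixTime)"`
}

// saveSampledAt is invoked by stateify.
func (r *rttSample) saveSampledAt() unixTime {
	return unixTime{r.sampledAt.Unix(), r.sampledAt.UnixNano()}
}

// loadSampledAt is invoked by stateify.
func (r *rttSample) loadSampledAt(unix unixTime) {
	r.sampledAt = time.Unix(unix.second, unix.nano)
}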
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b66610ee2..1efe9d3fb 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -17,6 +17,7 @@ package tcp
import (
"fmt"
"math"
+ "strings"
"sync"
"sync/atomic"
"time"
@@ -32,18 +33,81 @@ import (
"gvisor.googlesource.com/gvisor/pkg/waiter"
)
-type endpointState int
+// EndpointState represents the state of a TCP endpoint.
+type EndpointState uint32
+// Endpoint states. Note that they are represented in a netstack-specific manner and
+// may not be meaningful externally. Specifically, they need to be translated to
+// Linux's representation for these states if presented to userspace.
const (
- stateInitial endpointState = iota
- stateBound
- stateListen
- stateConnecting
- stateConnected
- stateClosed
- stateError
+ // Endpoint states internal to netstack. These map to the TCP state CLOSED.
+ StateInitial EndpointState = iota
+ StateBound
+ StateConnecting // Connect() called, but the initial SYN hasn't been sent.
+ StateError
+
+ // TCP protocol states.
+ StateEstablished
+ StateSynSent
+ StateSynRecv
+ StateFinWait1
+ StateFinWait2
+ StateTimeWait
+ StateClose
+ StateCloseWait
+ StateLastAck
+ StateListen
+ StateClosing
)
+// connected is the set of states where an endpoint is connected to a peer.
+func (s EndpointState) connected() bool {
+ switch s {
+ case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
+ return true
+ default:
+ return false
+ }
+}
+
+// String implements fmt.Stringer.String.
+func (s EndpointState) String() string {
+ switch s {
+ case StateInitial:
+ return "INITIAL"
+ case StateBound:
+ return "BOUND"
+ case StateConnecting:
+ return "CONNECTING"
+ case StateError:
+ return "ERROR"
+ case StateEstablished:
+ return "ESTABLISHED"
+ case StateSynSent:
+ return "SYN-SENT"
+ case StateSynRecv:
+ return "SYN-RCVD"
+ case StateFinWait1:
+ return "FIN-WAIT1"
+ case StateFinWait2:
+ return "FIN-WAIT2"
+ case StateTimeWait:
+ return "TIME-WAIT"
+ case StateClose:
+ return "CLOSED"
+ case StateCloseWait:
+ return "CLOSE-WAIT"
+ case StateLastAck:
+ return "LAST-ACK"
+ case StateListen:
+ return "LISTEN"
+ case StateClosing:
+ return "CLOSING"
+ default:
+ panic("unreachable")
+ }
+}
+
// Reasons for notifying the protocol goroutine.
const (
notifyNonZeroReceiveWindow = 1 << iota
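As the comment above says, these values are netstack-internal and must be translated before being shown to userspace (for example when filling in /proc/net/tcp). One possible translation is sketched below; the TCP_* numbering is Linux's standard one from include/net/tcp_states.h, but this mapping function is an illustration, not code from this change.

package tcp

// Linux's TCP state numbering (TCP_ESTABLISHED = 1 ... TCP_CLOSING = 11),
// reproduced only for the illustrative mapping below.
const (
	linuxTCPEstablished = uint32(iota + 1)
	linuxTCPSynSent
	linuxTCPSynRecv
	linuxTCPFinWait1
	linuxTCPFinWait2
	linuxTCPTimeWait
	linuxTCPClose
	linuxTCPCloseWait
	linuxTCPLastAck
	linuxTCPListen
	linuxTCPClosing
)

// linuxState converts an EndpointState into Linux's representation; the
// states internal to netstack all surface as TCP_CLOSE.
func linuxState(s EndpointState) uint32 {
	switch s {
	case StateEstablished:
		return linuxTCPEstablished
	case StateSynSent:
		return linuxTCPSynSent
	case StateSynRecv:
		return linuxTCPSynRecv
	case StateFinWait1:
		return linuxTCPFinWait1
	case StateFinWait2:
		return linuxTCPFinWait2
	case StateTimeWait:
		return linuxTCPTimeWait
	case StateCloseWait:
		return linuxTCPCloseWait
	case StateLastAck:
		return linuxTCPLastAck
	case StateListen:
		return linuxTCPListen
	case StateClosing:
		return linuxTCPClosing
	default:
		// StateInitial, StateBound, StateConnecting, StateError and
		// StateClose all map to CLOSED.
		return linuxTCPClose
	}
}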
@@ -108,10 +172,14 @@ type endpoint struct {
rcvBufUsed int
// The following fields are protected by the mutex.
- mu sync.RWMutex `state:"nosave"`
- id stack.TransportEndpointID
- state endpointState `state:".(endpointState)"`
- isPortReserved bool `state:"manual"`
+ mu sync.RWMutex `state:"nosave"`
+ id stack.TransportEndpointID
+
+ // state endpointState `state:".(endpointState)"`
+ // pState ProtocolState
+ state EndpointState `state:".(EndpointState)"`
+
+ isPortReserved bool `state:"manual"`
isRegistered bool
boundNICID tcpip.NICID `state:"manual"`
route stack.Route `state:"manual"`
@@ -219,7 +287,7 @@ type endpoint struct {
// cc stores the name of the Congestion Control algorithm to use for
// this endpoint.
- cc CongestionControlOption
+ cc tcpip.CongestionControlOption
// The following are used when a "packet too big" control packet is
// received. They are protected by sndBufMu. They are used to
@@ -304,6 +372,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite
stack: stack,
netProto: netProto,
waiterQueue: waiterQueue,
+ state: StateInitial,
rcvBufSize: DefaultBufferSize,
sndBufSize: DefaultBufferSize,
sndMTU: int(math.MaxInt32),
@@ -326,7 +395,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite
e.rcvBufSize = rs.Default
}
- var cs CongestionControlOption
+ var cs tcpip.CongestionControlOption
if err := stack.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
e.cc = cs
}
@@ -335,7 +404,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite
e.probe = p
}
- e.segmentQueue.setLimit(2 * e.rcvBufSize)
+ e.segmentQueue.setLimit(MaxUnprocessedSegments)
e.workMu.Init()
e.workMu.Lock()
e.tsOffset = timeStampOffset()
@@ -351,14 +420,14 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
defer e.mu.RUnlock()
switch e.state {
- case stateInitial, stateBound, stateConnecting:
+ case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
// Ready for nothing.
- case stateClosed, stateError:
+ case StateClose, StateError:
// Ready for anything.
result = mask
- case stateListen:
+ case StateListen:
// Check if there's anything in the accepted channel.
if (mask & waiter.EventIn) != 0 {
if len(e.acceptedChan) > 0 {
@@ -366,7 +435,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
}
}
- case stateConnected:
+ case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
// Determine if the endpoint is writable if requested.
if (mask & waiter.EventOut) != 0 {
e.sndBufMu.Lock()
@@ -427,7 +496,7 @@ func (e *endpoint) Close() {
// are immediately available for reuse after Close() is called. If also
// registered, we unregister as well otherwise the next user would fail
// in Listen() when trying to register.
- if e.state == stateListen && e.isPortReserved {
+ if e.state == StateListen && e.isPortReserved {
if e.isRegistered {
e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
e.isRegistered = false
@@ -487,15 +556,15 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
e.mu.RLock()
// The endpoint can be read if it's connected, or if it's already closed
// but has some pending unread data. Also note that a RST being received
- // would cause the state to become stateError so we should allow the
+ // would cause the state to become StateError so we should allow the
// reads to proceed before returning a ECONNRESET.
e.rcvListMu.Lock()
bufUsed := e.rcvBufUsed
- if s := e.state; s != stateConnected && s != stateClosed && bufUsed == 0 {
+ if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
e.rcvListMu.Unlock()
he := e.hardError
e.mu.RUnlock()
- if s == stateError {
+ if s == StateError {
return buffer.View{}, tcpip.ControlMessages{}, he
}
return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
@@ -511,7 +580,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
if e.rcvBufUsed == 0 {
- if e.rcvClosed || e.state != stateConnected {
+ if e.rcvClosed || !e.state.connected() {
return buffer.View{}, tcpip.ErrClosedForReceive
}
return buffer.View{}, tcpip.ErrWouldBlock
@@ -547,9 +616,9 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c
defer e.mu.RUnlock()
// The endpoint cannot be written to if it's not connected.
- if e.state != stateConnected {
+ if !e.state.connected() {
switch e.state {
- case stateError:
+ case StateError:
return 0, nil, e.hardError
default:
return 0, nil, tcpip.ErrClosedForSend
@@ -612,8 +681,8 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Er
// The endpoint can be read if it's connected, or if it's already closed
// but has some pending unread data.
- if s := e.state; s != stateConnected && s != stateClosed {
- if s == stateError {
+ if s := e.state; !s.connected() && s != StateClose {
+ if s == StateError {
return 0, tcpip.ControlMessages{}, e.hardError
}
return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
@@ -623,7 +692,7 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Er
defer e.rcvListMu.Unlock()
if e.rcvBufUsed == 0 {
- if e.rcvClosed || e.state != stateConnected {
+ if e.rcvClosed || !e.state.connected() {
return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
}
return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
@@ -757,8 +826,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
}
e.rcvListMu.Unlock()
- e.segmentQueue.setLimit(2 * size)
-
e.notifyProtocolGoroutine(mask)
return nil
@@ -791,7 +858,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
defer e.mu.Unlock()
// We only allow this to be set when we're in the initial state.
- if e.state != stateInitial {
+ if e.state != StateInitial {
return tcpip.ErrInvalidEndpointState
}
@@ -832,6 +899,40 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
e.mu.Unlock()
return nil
+ case tcpip.CongestionControlOption:
+ // Query the available cc algorithms in the stack and
+ // validate that the specified algorithm is actually
+ // supported in the stack.
+ var avail tcpip.AvailableCongestionControlOption
+ if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
+ return err
+ }
+ availCC := strings.Split(string(avail), " ")
+ for _, cc := range availCC {
+ if v == tcpip.CongestionControlOption(cc) {
+ // Acquire the work mutex as we may need to
+ // reinitialize the congestion control state.
+ e.mu.Lock()
+ state := e.state
+ e.cc = v
+ e.mu.Unlock()
+ switch state {
+ case StateEstablished:
+ e.workMu.Lock()
+ e.mu.Lock()
+ if e.state == state {
+ e.snd.cc = e.snd.initCongestionControl(e.cc)
+ }
+ e.mu.Unlock()
+ e.workMu.Unlock()
+ }
+ return nil
+ }
+ }
+
+ // Linux returns ENOENT when an invalid congestion
+ // control algorithm is specified.
+ return tcpip.ErrNoSuchFile
default:
return nil
}
@@ -843,7 +944,7 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
defer e.mu.RUnlock()
// The endpoint cannot be in listen state.
- if e.state == stateListen {
+ if e.state == StateListen {
return 0, tcpip.ErrInvalidEndpointState
}
@@ -1001,6 +1102,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
}
return nil
+ case *tcpip.CongestionControlOption:
+ e.mu.Lock()
+ *o = e.cc
+ e.mu.Unlock()
+ return nil
+
default:
return tcpip.ErrUnknownProtocolOption
}
@@ -1059,7 +1166,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
nicid := addr.NIC
switch e.state {
- case stateBound:
+ case StateBound:
// If we're already bound to a NIC but the caller is requesting
// that we use a different one now, we cannot proceed.
if e.boundNICID == 0 {
@@ -1072,16 +1179,16 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
nicid = e.boundNICID
- case stateInitial:
- // Nothing to do. We'll eventually fill-in the gaps in the ID
- // (if any) when we find a route.
+ case StateInitial:
+ // Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
+ // when we find a route.
- case stateConnecting:
- // A connection request has already been issued but hasn't
- // completed yet.
+ case StateConnecting, StateSynSent, StateSynRecv:
+ // A connection request has already been issued but hasn't completed
+ // yet.
return tcpip.ErrAlreadyConnecting
- case stateConnected:
+ case StateEstablished:
// The endpoint is already connected. If caller hasn't been notified yet, return success.
if !e.isConnectNotified {
e.isConnectNotified = true
@@ -1090,7 +1197,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
// Otherwise return that it's already connected.
return tcpip.ErrAlreadyConnected
- case stateError:
+ case StateError:
return e.hardError
default:
@@ -1156,7 +1263,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
}
e.isRegistered = true
- e.state = stateConnecting
+ e.state = StateConnecting
e.route = r.Clone()
e.boundNICID = nicid
e.effectiveNetProtos = netProtos
@@ -1177,7 +1284,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
}
e.segmentQueue.mu.Unlock()
e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
- e.state = stateConnected
+ e.state = StateEstablished
}
if run {
@@ -1201,8 +1308,8 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
defer e.mu.Unlock()
e.shutdownFlags |= flags
- switch e.state {
- case stateConnected:
+ switch {
+ case e.state.connected():
// Close for read.
if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
// Mark read side as closed.
@@ -1243,7 +1350,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
e.sndCloseWaker.Assert()
}
- case stateListen:
+ case e.state == StateListen:
// Tell protocolListenLoop to stop.
if flags&tcpip.ShutdownRead != 0 {
e.notifyProtocolGoroutine(notifyClose)
@@ -1271,7 +1378,7 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) {
// When the endpoint shuts down, it sets workerCleanup to true, and from
// that point onward, acceptedChan is the responsibility of the cleanup()
// method (and should not be touched anywhere else, including here).
- if e.state == stateListen && !e.workerCleanup {
+ if e.state == StateListen && !e.workerCleanup {
// Adjust the size of the channel iff we can fix existing
// pending connections into the new one.
if len(e.acceptedChan) > backlog {
@@ -1290,7 +1397,7 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) {
}
// Endpoint must be bound before it can transition to listen mode.
- if e.state != stateBound {
+ if e.state != StateBound {
return tcpip.ErrInvalidEndpointState
}
@@ -1300,7 +1407,7 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) {
}
e.isRegistered = true
- e.state = stateListen
+ e.state = StateListen
if e.acceptedChan == nil {
e.acceptedChan = make(chan *endpoint, backlog)
}
@@ -1327,7 +1434,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
defer e.mu.RUnlock()
// Endpoint must be in listen state before it can accept connections.
- if e.state != stateListen {
+ if e.state != StateListen {
return nil, nil, tcpip.ErrInvalidEndpointState
}
@@ -1355,7 +1462,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
// Don't allow binding once endpoint is not in the initial state
// anymore. This is because once the endpoint goes into a connected or
// listen state, it is already bound.
- if e.state != stateInitial {
+ if e.state != StateInitial {
return tcpip.ErrAlreadyBound
}
@@ -1410,7 +1517,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
}
// Mark endpoint as bound.
- e.state = stateBound
+ e.state = StateBound
return nil
}
@@ -1432,7 +1539,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
- if e.state != stateConnected {
+ if !e.state.connected() {
return tcpip.FullAddress{}, tcpip.ErrNotConnected
}
@@ -1741,3 +1848,11 @@ func (e *endpoint) initGSO() {
gso.MaxSize = e.route.GSOMaxSize()
e.gso = gso
}
+
+// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
+// state for diagnostics.
+func (e *endpoint) State() uint32 {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ return uint32(e.state)
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 27b0be046..5f30c2374 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -49,8 +49,8 @@ func (e *endpoint) beforeSave() {
defer e.mu.Unlock()
switch e.state {
- case stateInitial, stateBound:
- case stateConnected:
+ case StateInitial, StateBound:
+ case StateEstablished, StateSynSent, StateSynRecv, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
if e.route.Capabilities()&stack.CapabilitySaveRestore == 0 {
if e.route.Capabilities()&stack.CapabilityDisconnectOk == 0 {
panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%d, remote %v:%d", e.id.LocalAddress, e.id.LocalPort, e.id.RemoteAddress, e.id.RemotePort)})
@@ -66,17 +66,17 @@ func (e *endpoint) beforeSave() {
break
}
fallthrough
- case stateListen, stateConnecting:
+ case StateListen, StateConnecting:
e.drainSegmentLocked()
- if e.state != stateClosed && e.state != stateError {
+ if e.state != StateClose && e.state != StateError {
if !e.workerRunning {
panic("endpoint has no worker running in listen, connecting, or connected state")
}
break
}
fallthrough
- case stateError, stateClosed:
- for e.state == stateError && e.workerRunning {
+ case StateError, StateClose:
+ for e.state == StateError && e.workerRunning {
e.mu.Unlock()
time.Sleep(100 * time.Millisecond)
e.mu.Lock()
@@ -92,7 +92,7 @@ func (e *endpoint) beforeSave() {
panic("endpoint still has waiters upon save")
}
- if e.state != stateClosed && !((e.state == stateBound || e.state == stateListen) == e.isPortReserved) {
+ if e.state != StateClose && !((e.state == StateBound || e.state == StateListen) == e.isPortReserved) {
panic("endpoints which are not in the closed state must have a reserved port IFF they are in bound or listen state")
}
}
@@ -132,7 +132,7 @@ func (e *endpoint) loadAcceptedChan(acceptedEndpoints []*endpoint) {
}
// saveState is invoked by stateify.
-func (e *endpoint) saveState() endpointState {
+func (e *endpoint) saveState() EndpointState {
return e.state
}
@@ -146,15 +146,15 @@ var connectingLoading sync.WaitGroup
// Bound endpoint loading happens last.
// loadState is invoked by stateify.
-func (e *endpoint) loadState(state endpointState) {
+func (e *endpoint) loadState(state EndpointState) {
// This is to ensure that the loading wait groups include all applicable
// endpoints before any asynchronous calls to the Wait() methods.
switch state {
- case stateConnected:
+ case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
connectedLoading.Add(1)
- case stateListen:
+ case StateListen:
listenLoading.Add(1)
- case stateConnecting:
+ case StateConnecting, StateSynSent, StateSynRecv:
connectingLoading.Add(1)
}
e.state = state
@@ -163,12 +163,12 @@ func (e *endpoint) loadState(state endpointState) {
// afterLoad is invoked by stateify.
func (e *endpoint) afterLoad() {
e.stack = stack.StackFromEnv
- e.segmentQueue.setLimit(2 * e.rcvBufSize)
+ e.segmentQueue.setLimit(MaxUnprocessedSegments)
e.workMu.Init()
state := e.state
switch state {
- case stateInitial, stateBound, stateListen, stateConnecting, stateConnected:
+ case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
var ss SendBufferSizeOption
if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max {
@@ -181,7 +181,7 @@ func (e *endpoint) afterLoad() {
}
bind := func() {
- e.state = stateInitial
+ e.state = StateInitial
if len(e.bindAddress) == 0 {
e.bindAddress = e.id.LocalAddress
}
@@ -191,7 +191,7 @@ func (e *endpoint) afterLoad() {
}
switch state {
- case stateConnected:
+ case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
bind()
if len(e.connectingAddress) == 0 {
// This endpoint is accepted by netstack but not yet by
@@ -211,7 +211,7 @@ func (e *endpoint) afterLoad() {
panic("endpoint connecting failed: " + err.String())
}
connectedLoading.Done()
- case stateListen:
+ case StateListen:
tcpip.AsyncLoading.Add(1)
go func() {
connectedLoading.Wait()
@@ -223,7 +223,7 @@ func (e *endpoint) afterLoad() {
listenLoading.Done()
tcpip.AsyncLoading.Done()
}()
- case stateConnecting:
+ case StateConnecting, StateSynSent, StateSynRecv:
tcpip.AsyncLoading.Add(1)
go func() {
connectedLoading.Wait()
@@ -235,7 +235,7 @@ func (e *endpoint) afterLoad() {
connectingLoading.Done()
tcpip.AsyncLoading.Done()
}()
- case stateBound:
+ case StateBound:
tcpip.AsyncLoading.Add(1)
go func() {
connectedLoading.Wait()
@@ -244,7 +244,7 @@ func (e *endpoint) afterLoad() {
bind()
tcpip.AsyncLoading.Done()
}()
- case stateClosed:
+ case StateClose:
if e.isPortReserved {
tcpip.AsyncLoading.Add(1)
go func() {
@@ -252,12 +252,12 @@ func (e *endpoint) afterLoad() {
listenLoading.Wait()
connectingLoading.Wait()
bind()
- e.state = stateClosed
+ e.state = StateClose
tcpip.AsyncLoading.Done()
}()
}
fallthrough
- case stateError:
+ case StateError:
tcpip.DeleteDanglingEndpoint(e)
}
}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index d31a1edcb..59f4009a1 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -48,6 +48,10 @@ const (
// MaxBufferSize is the largest size a receive and send buffer can grow to.
maxBufferSize = 4 << 20 // 4MB
+
+ // MaxUnprocessedSegments is the maximum number of unprocessed segments
+ // that can be queued for a given endpoint.
+ MaxUnprocessedSegments = 300
)
// SACKEnabled option can be used to enable SACK support in the TCP
@@ -75,13 +79,6 @@ const (
ccCubic = "cubic"
)
-// CongestionControlOption sets the current congestion control algorithm.
-type CongestionControlOption string
-
-// AvailableCongestionControlOption returns the supported congestion control
-// algorithms.
-type AvailableCongestionControlOption string
-
type protocol struct {
mu sync.Mutex
sackEnabled bool
@@ -89,7 +86,6 @@ type protocol struct {
recvBufferSize ReceiveBufferSizeOption
congestionControl string
availableCongestionControl []string
- allowedCongestionControl []string
}
// Number returns the tcp protocol number.
@@ -184,7 +180,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
p.mu.Unlock()
return nil
- case CongestionControlOption:
+ case tcpip.CongestionControlOption:
for _, c := range p.availableCongestionControl {
if string(v) == c {
p.mu.Lock()
@@ -193,7 +189,9 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
return nil
}
}
- return tcpip.ErrInvalidOptionValue
+ // Linux returns ENOENT when an invalid congestion control
+ // algorithm is specified.
+ return tcpip.ErrNoSuchFile
default:
return tcpip.ErrUnknownProtocolOption
}
@@ -219,14 +217,14 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
*v = p.recvBufferSize
p.mu.Unlock()
return nil
- case *CongestionControlOption:
+ case *tcpip.CongestionControlOption:
p.mu.Lock()
- *v = CongestionControlOption(p.congestionControl)
+ *v = tcpip.CongestionControlOption(p.congestionControl)
p.mu.Unlock()
return nil
- case *AvailableCongestionControlOption:
+ case *tcpip.AvailableCongestionControlOption:
p.mu.Lock()
- *v = AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
+ *v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
p.mu.Unlock()
return nil
default:
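The same option types also work one level up, as stack-wide defaults that newly created endpoints inherit. A sketch of configuring them through the stack; it assumes the Stack.TransportProtocolOption accessor used by the endpoint code above and its SetTransportProtocolOption counterpart, and the function name is made up for illustration.

package ccdefaults

import (
	"fmt"
	"strings"

	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
)

// setDefaultCC lists the algorithms the stack supports and then makes name
// the default congestion control for new TCP endpoints.
func setDefaultCC(s *stack.Stack, name string) error {
	var avail tcpip.AvailableCongestionControlOption
	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &avail); err != nil {
		return fmt.Errorf("querying available algorithms: %v", err)
	}
	fmt.Println("available:", strings.Split(string(avail), " "))

	// As above, an unknown name now fails with tcpip.ErrNoSuchFile rather
	// than ErrInvalidOptionValue.
	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption(name)); err != nil {
		return fmt.Errorf("setting %q: %v", name, err)
	}
	return nil
}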
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index b08a0e356..f02fa6105 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -134,6 +134,7 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
// sequence numbers that have been consumed.
TrimSACKBlockList(&r.ep.sack, r.rcvNxt)
+ // Handle FIN or FIN-ACK.
if s.flagIsSet(header.TCPFlagFin) {
r.rcvNxt++
@@ -144,6 +145,25 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
r.closed = true
r.ep.readyToRead(nil)
+ // We just received a FIN; our next state depends on whether we sent a
+ // FIN already or not.
+ r.ep.mu.Lock()
+ switch r.ep.state {
+ case StateEstablished:
+ r.ep.state = StateCloseWait
+ case StateFinWait1:
+ if s.flagIsSet(header.TCPFlagAck) {
+ // FIN-ACK, transition to TIME-WAIT.
+ r.ep.state = StateTimeWait
+ } else {
+ // Simultaneous close, expecting a final ACK.
+ r.ep.state = StateClosing
+ }
+ case StateFinWait2:
+ r.ep.state = StateTimeWait
+ }
+ r.ep.mu.Unlock()
+
// Flush out any pending segments, except the very first one if
// it happens to be the one we're handling now because the
// caller is using it.
@@ -156,6 +176,23 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
r.pendingRcvdSegments[i].decRef()
}
r.pendingRcvdSegments = r.pendingRcvdSegments[:first]
+
+ return true
+ }
+
+ // Handle ACK (not FIN-ACK, which we handled above) during one of the
+ // shutdown states.
+ if s.flagIsSet(header.TCPFlagAck) {
+ r.ep.mu.Lock()
+ switch r.ep.state {
+ case StateFinWait1:
+ r.ep.state = StateFinWait2
+ case StateClosing:
+ r.ep.state = StateTimeWait
+ case StateLastAck:
+ r.ep.state = StateClose
+ }
+ r.ep.mu.Unlock()
}
return true
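Taken together, the two new blocks above implement the receiver's half of the TCP close diagram. The helpers below restate those transitions in isolation, using the package's EndpointState values; they are illustrative only and say nothing about the endpoint's locking.

package tcp

// onFIN models the transition taken above when a FIN (optionally carrying an
// ACK) is consumed.
func onFIN(cur EndpointState, withACK bool) EndpointState {
	switch cur {
	case StateEstablished:
		return StateCloseWait
	case StateFinWait1:
		if withACK {
			// FIN-ACK: our own FIN was acknowledged too.
			return StateTimeWait
		}
		// Simultaneous close, still expecting a final ACK.
		return StateClosing
	case StateFinWait2:
		return StateTimeWait
	}
	return cur
}

// onACK models the transition taken above for a bare ACK received while the
// connection is shutting down.
func onACK(cur EndpointState) EndpointState {
	switch cur {
	case StateFinWait1:
		return StateFinWait2
	case StateClosing:
		return StateTimeWait
	case StateLastAck:
		return StateClose
	}
	return cur
}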
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index 3b020e580..e0759225e 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -16,8 +16,6 @@ package tcp
import (
"sync"
-
- "gvisor.googlesource.com/gvisor/pkg/tcpip/header"
)
// segmentQueue is a bounded, thread-safe queue of TCP segments.
@@ -58,7 +56,7 @@ func (q *segmentQueue) enqueue(s *segment) bool {
r := q.used < q.limit
if r {
q.list.PushBack(s)
- q.used += s.data.Size() + header.TCPMinimumSize
+ q.used++
}
q.mu.Unlock()
@@ -73,7 +71,7 @@ func (q *segmentQueue) dequeue() *segment {
s := q.list.Front()
if s != nil {
q.list.Remove(s)
- q.used -= s.data.Size() + header.TCPMinimumSize
+ q.used--
}
q.mu.Unlock()
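With this change the queue is bounded by a fixed segment count (MaxUnprocessedSegments) instead of a byte budget tied to the receive buffer, so admission no longer shifts when the buffer size is tuned. A toy count-bounded queue with the same contract, where a false return from enqueue means the caller drops the segment; it is an illustration, not the package's segmentQueue, and it is not safe for concurrent use.

package tcp

// countedQueue is a toy, count-limited queue of segments.
type countedQueue struct {
	segments []*segment
	limit    int
}

// enqueue admits s only while fewer than limit segments are pending; callers
// drop the segment (and release their reference) on a false return.
func (q *countedQueue) enqueue(s *segment) bool {
	if len(q.segments) >= q.limit {
		return false
	}
	q.segments = append(q.segments, s)
	return true
}

// dequeue returns the oldest pending segment, or nil if the queue is empty.
func (q *countedQueue) dequeue() *segment {
	if len(q.segments) == 0 {
		return nil
	}
	s := q.segments[0]
	q.segments = q.segments[1:]
	return s
}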
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index afc1d0a55..daa3e8341 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -194,8 +194,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
s := &sender{
ep: ep,
- sndCwnd: InitialCwnd,
- sndSsthresh: math.MaxInt64,
sndWnd: sndWnd,
sndUna: iss + 1,
sndNxt: iss + 1,
@@ -238,7 +236,13 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
return s
}
-func (s *sender) initCongestionControl(congestionControlName CongestionControlOption) congestionControl {
+// initCongestionControl initializes the specified congestion control module and
+// returns a handle to it. It also initializes the sndCwnd and sndSsthresh to
+// their initial values.
+func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
+ s.sndCwnd = InitialCwnd
+ s.sndSsthresh = math.MaxInt64
+
switch congestionControlName {
case ccCubic:
return newCubicCC(s)
@@ -632,6 +636,10 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
}
seg.flags = header.TCPFlagAck | header.TCPFlagFin
segEnd = seg.sequenceNumber.Add(1)
+ // Transition to FIN-WAIT1 state since we're initiating an active close.
+ s.ep.mu.Lock()
+ s.ep.state = StateFinWait1
+ s.ep.mu.Unlock()
} else {
// We're sending a non-FIN segment.
if seg.flags&header.TCPFlagFin != 0 {
@@ -779,7 +787,7 @@ func (s *sender) sendData() {
break
}
dataSent = true
- s.outstanding++
+ s.outstanding += s.pCount(seg)
s.writeNext = seg.Next()
}
}
diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
new file mode 100644
index 000000000..4d1519860
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
@@ -0,0 +1,519 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// These tests are flaky when run under the Go race detector: some iterations
+// take long enough that the retransmit timer kicks in, and the extra
+// retransmitted packets throw off the congestion window measurements.
+//
+// +build !race
+
+package tcp_test
+
+import (
+ "fmt"
+ "math"
+ "testing"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp/testing/context"
+)
+
+func TestFastRecovery(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in one shot. Packets will only be written at the
+ // MTU size though.
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Do slow start for a few iterations.
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ expected = tcp.InitialCwnd << uint(i)
+ if i > 0 {
+ // Acknowledge all the data received so far if not on
+ // first iteration.
+ c.SendAck(790, bytesRead)
+ }
+
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+ // The timeout can't be too high or we'll trigger a timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+ }
+
+ // Send 3 duplicate acks. This should force an immediate retransmit of
+ // the pending packet and put the sender into fast recovery.
+ rtxOffset := bytesRead - maxPayload*expected
+ for i := 0; i < 3; i++ {
+ c.SendAck(790, rtxOffset)
+ }
+
+ // Receive the retransmitted packet.
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want)
+ }
+
+ // Now send 7 more duplicate acks. Each of these should cause a window
+ // inflation by 1 and cause the sender to send an extra packet.
+ for i := 0; i < 7; i++ {
+ c.SendAck(790, rtxOffset)
+ }
+
+ recover := bytesRead
+
+ // Ensure no new packets arrive.
+ c.CheckNoPacketTimeout("More packets received than expected during recovery after dupacks for this cwnd.",
+ 50*time.Millisecond)
+
+ // Acknowledge half of the pending data.
+ rtxOffset = bytesRead - expected*maxPayload/2
+ c.SendAck(790, rtxOffset)
+
+ // Receive the retransmit due to partial ack.
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
+ t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
+ t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+ }
+
+ // Receive the 10 extra packets that should have been released due to
+ // the congestion window inflation in recovery.
+ for i := 0; i < 10; i++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // A partial ACK during recovery should reduce congestion window by the
+ // number acked. Since we had "expected" packets outstanding before sending
+ // partial ack and we acked expected/2, the cwnd and outstanding should
+ // be expected/2 + 10 (7 dupAcks + 3 for the original 3 dupAcks that triggered
+ // fast recovery), which means the sender should not send any more packets
+ // till we ack this one.
+ c.CheckNoPacketTimeout("More packets received than expected during recovery after partial ack for this cwnd.",
+ 50*time.Millisecond)
+
+ // Acknowledge all pending data to recover point.
+ c.SendAck(790, recover)
+
+ // At this point, the cwnd should reset to expected/2 and there are 10
+ // packets outstanding.
+ //
+ // NOTE: Technically netstack is incorrect in that we adjust the cwnd on
+ // the same segment that takes us out of recovery. But because of that
+ // the actual cwnd at exit of recovery will be expected/2 + 1 as we
+ // acked a cwnd worth of packets which will increase the cwnd further by
+ // 1 in congestion avoidance.
+ //
+ // In the first iteration, since there are already 10 packets outstanding,
+ // we expect to get expected/2 + 1 - 10 packets. Subsequent iterations
+ // will send us expected/2 + 1 + 1 (per iteration).
+ expected = expected/2 + 1 - 10
+ for i := 0; i < iterations; i++ {
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout(fmt.Sprintf("More packets received(after deflation) than expected %d for this cwnd.", expected), 50*time.Millisecond)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+
+		// In congestion avoidance, the packet trains increase by 1 in
+ // each iteration.
+ if i == 0 {
+ // After the first iteration we expect to get the full
+ // congestion window worth of packets in every
+ // iteration.
+ expected += 10
+ }
+ expected++
+ }
+}
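
The stat checks and packet counts in TestFastRecovery above follow NewReno-style fast-recovery bookkeeping: ssthresh is set to half the window, the window is inflated by one segment per duplicate ACK, and a partial ACK deflates it by the amount acked. The sketch below is illustrative only and tallies those numbers for an assumed starting window; it is not part of this change.

package main

import "fmt"

// Illustrative only: tallies the NewReno-style fast-recovery accounting the
// test above asserts. The starting window is an assumed example value.
func main() {
	expected := 64 // assumed cwnd, in packets, when the three dupacks arrive

	ssthresh := expected / 2 // window is halved on entering fast recovery
	cwnd := ssthresh + 3     // inflated by the three dupacks that triggered recovery
	cwnd += 7                // inflated by one more segment per extra dupack

	// The ten inflation segments are what the test receives as "extra"
	// packets; a partial ACK then deflates the window by the amount acked,
	// so the sender stalls until the recovery point is acknowledged.
	fmt.Printf("ssthresh=%d inflated cwnd=%d extra packets=%d\n",
		ssthresh, cwnd, cwnd-ssthresh)
}
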
+
+func TestExponentialIncreaseDuringSlowStart(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in one shot. Packets will only be written at the
+ // MTU size though.
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+
+ // Double the number of expected packets for the next iteration.
+ expected *= 2
+ }
+}
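
The doubling of "expected" above reflects slow start: every acknowledged segment grows the congestion window by one segment, so acknowledging a full window per round trip doubles it. A minimal sketch of that growth rule, with an assumed stand-in for tcp.InitialCwnd (illustrative only, not part of this change):

package main

import "fmt"

// Illustrative only: slow-start growth as the test models it. The initial
// window is an assumed stand-in for tcp.InitialCwnd.
func main() {
	cwnd := 10 // assumed initial window, in packets
	for rtt := 0; rtt < 7; rtt++ {
		fmt.Printf("rtt=%d cwnd=%d\n", rtt, cwnd)
		cwnd *= 2 // a full window of ACKs doubles the window
	}
}
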
+
+func TestCongestionAvoidance(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in one shot. Packets will only be written at the
+ // MTU size though.
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Do slow start for a few iterations.
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ expected = tcp.InitialCwnd << uint(i)
+ if i > 0 {
+ // Acknowledge all the data received so far if not on
+ // first iteration.
+ c.SendAck(790, bytesRead)
+ }
+
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd (slow start phase).", 50*time.Millisecond)
+ }
+
+ // Don't acknowledge the first packet of the last packet train. Let's
+ // wait for them to time out, which will trigger a restart of slow
+ // start, and initialization of ssthresh to cwnd/2.
+ rtxOffset := bytesRead - maxPayload*expected
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+
+ // This part is tricky: when the timeout happened, we had "expected"
+ // packets pending, cwnd reset to 1, and ssthresh set to expected/2.
+ // By acknowledging "expected" packets, the slow-start part will
+ // increase cwnd to expected/2 (which "consumes" expected/2-1 of the
+ // acknowledgements), then the congestion avoidance part will consume
+ // an extra expected/2 acks to take cwnd to expected/2 + 1. One ack
+ // remains in the "ack count" (which will cause cwnd to be incremented
+ // once it reaches cwnd acks).
+ //
+ // So we're straight into congestion avoidance with cwnd set to
+ // expected/2 + 1.
+ //
+ // Check that packets trains of cwnd packets are sent, and that cwnd is
+ // incremented by 1 after we acknowledge each packet.
+ expected = expected/2 + 1
+ for i := 0; i < iterations; i++ {
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd (congestion avoidance phase).", 50*time.Millisecond)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+
+		// In congestion avoidance, the packet trains increase by 1 in
+ // each iteration.
+ expected++
+ }
+}
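
The "tricky" arithmetic in the comment above (cwnd collapsing to 1, ssthresh becoming expected/2, and a full window of ACKs landing the sender at expected/2 + 1 with one ACK left over) can be walked through with a small Reno-style tally. The sketch below is illustrative only, with an assumed value standing in for "expected"; it is not part of this change.

package main

import "fmt"

// Illustrative only: a Reno-style walk through the comment above. After the
// retransmission timeout the window collapses to 1 and ssthresh becomes
// expected/2; acknowledging "expected" packets then grows the window by one
// per ACK in slow start and by one per full window in congestion avoidance.
func main() {
	expected := 64 // assumed number of packets outstanding at the timeout
	ssthresh := expected / 2
	cwnd := 1
	ackCount := 0

	for acked := 0; acked < expected; acked++ {
		if cwnd < ssthresh {
			cwnd++ // slow start: one segment per ACK
			continue
		}
		// Congestion avoidance: one segment per full window of ACKs.
		ackCount++
		if ackCount >= cwnd {
			ackCount -= cwnd
			cwnd++
		}
	}
	fmt.Printf("cwnd=%d (expected/2+1=%d), leftover acks=%d\n",
		cwnd, expected/2+1, ackCount)
}
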
+
+// cubicCwnd returns an estimate of a cubic window given the
+// originalCwnd, wMax, last congestion event time and sRTT.
+func cubicCwnd(origCwnd int, wMax int, congEventTime time.Time, sRTT time.Duration) int {
+ cwnd := float64(origCwnd)
+ // We wait 50ms between each iteration so sRTT as computed by cubic
+ // should be close to 50ms.
+ elapsed := (time.Since(congEventTime) + sRTT).Seconds()
+ k := math.Cbrt(float64(wMax) * 0.3 / 0.7)
+ wtRTT := 0.4*math.Pow(elapsed-k, 3) + float64(wMax)
+ cwnd += (wtRTT - cwnd) / cwnd
+ return int(cwnd)
+}
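
cubicCwnd approximates a cubic window curve of the form W(t) = C*(t - K)^3 + wMax, with C = 0.4 and K = cbrt(wMax * 0.3 / 0.7) as written in the helper, plus a per-call (wtRTT - cwnd)/cwnd adjustment. The sketch below reimplements the helper so it can run standalone with assumed inputs; it is illustrative only and is not netstack's cubic implementation.

package main

import (
	"fmt"
	"math"
	"time"
)

// cubicEstimate reimplements the cubicCwnd helper above so the sketch is
// self-contained; elapsed stands for time.Since(congEventTime) + sRTT.
func cubicEstimate(origCwnd, wMax int, elapsed time.Duration) int {
	cwnd := float64(origCwnd)
	t := elapsed.Seconds()
	k := math.Cbrt(float64(wMax) * 0.3 / 0.7)
	wtRTT := 0.4*math.Pow(t-k, 3) + float64(wMax)
	cwnd += (wtRTT - cwnd) / cwnd
	return int(cwnd)
}

func main() {
	wMax := 64                       // assumed window before the loss
	cwnd := int(float64(wMax) * 0.7) // post-loss window, matching the 0.7 factor in the test
	for i := 0; i < 5; i++ {
		elapsed := time.Duration((i+1)*50) * time.Millisecond // roughly one 50ms test iteration each
		cwnd = cubicEstimate(cwnd, wMax, elapsed)
		fmt.Printf("iteration=%d estimated cwnd=%d\n", i, cwnd)
	}
}
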
+
+func TestCubicCongestionAvoidance(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ enableCUBIC(t, c)
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in one shot. Packets will only be written at the
+ // MTU size though.
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Do slow start for a few iterations.
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ expected = tcp.InitialCwnd << uint(i)
+ if i > 0 {
+ // Acknowledge all the data received so far if not on
+ // first iteration.
+ c.SendAck(790, bytesRead)
+ }
+
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd (during slow-start phase).", 50*time.Millisecond)
+ }
+
+ // Don't acknowledge the first packet of the last packet train. Let's
+ // wait for them to time out, which will trigger a restart of slow
+ // start, and initialization of ssthresh to cwnd * 0.7.
+ rtxOffset := bytesRead - maxPayload*expected
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ // Acknowledge all pending data.
+ c.SendAck(790, bytesRead)
+
+	// Store away the time we sent the ACK. Assuming a 200ms RTO, we
+	// estimate that the sender will hit an RTO 200ms from now and go back
+	// into slow start.
+ packetDropTime := time.Now().Add(200 * time.Millisecond)
+
+	// This part is tricky: when the timeout happened, we had "expected"
+	// packets pending, cwnd reset to 1, and ssthresh set to expected * 0.7.
+	// By acknowledging "expected" packets, the slow-start part will grow
+	// cwnd up to ssthresh (expected * 0.7), essentially putting the
+	// connection straight into congestion avoidance.
+ wMax := expected
+ // Lower expected as per cubic spec after a congestion event.
+ expected = int(float64(expected) * 0.7)
+ cwnd := expected
+ for i := 0; i < iterations; i++ {
+ // Cubic grows window independent of ACKs. Cubic Window growth
+ // is a function of time elapsed since last congestion event.
+ // As a result the congestion window does not grow
+ // deterministically in response to ACKs.
+ //
+ // We need to roughly estimate what the cwnd of the sender is
+ // based on when we sent the dupacks.
+ cwnd := cubicCwnd(cwnd, wMax, packetDropTime, 50*time.Millisecond)
+
+ packetsExpected := cwnd
+ for j := 0; j < packetsExpected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+ t.Logf("expected packets received, next trying to receive any extra packets that may come")
+
+ // If our estimate was correct there should be no more pending packets.
+ // We attempt to read a packet a few times with a short sleep in between
+ // to ensure that we don't see the sender send any unexpected packets.
+ unexpectedPackets := 0
+ for {
+ gotPacket := c.ReceiveNonBlockingAndCheckPacket(data, bytesRead, maxPayload)
+ if !gotPacket {
+ break
+ }
+ bytesRead += maxPayload
+ unexpectedPackets++
+ time.Sleep(1 * time.Millisecond)
+ }
+ if unexpectedPackets != 0 {
+ t.Fatalf("received %d unexpected packets for iteration %d", unexpectedPackets, i)
+ }
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd(congestion avoidance)", 5*time.Millisecond)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+ }
+}
+
+func TestRetransmit(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in two shots. Packets will only be written at the
+ // MTU size though.
+ half := data[:len(data)/2]
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+ half = data[len(data)/2:]
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Do slow start for a few iterations.
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ expected = tcp.InitialCwnd << uint(i)
+ if i > 0 {
+ // Acknowledge all the data received so far if not on
+ // first iteration.
+ c.SendAck(790, bytesRead)
+ }
+
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+ }
+
+ // Wait for a timeout and retransmit.
+ rtxOffset := bytesRead - maxPayload*expected
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want)
+ }
+
+ // Acknowledge half of the pending data.
+ rtxOffset = bytesRead - expected*maxPayload/2
+ c.SendAck(790, rtxOffset)
+
+ // Receive the remaining data, making sure that acknowledged data is not
+ // retransmitted.
+ for offset := rtxOffset; offset < len(data); offset += maxPayload {
+ c.ReceiveAndCheckPacket(data, offset, maxPayload)
+ c.SendAck(790, offset+maxPayload)
+ }
+
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index fe037602b..7d8987219 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -168,8 +168,8 @@ func TestTCPResetsSentIncrement(t *testing.T) {
// Receive the SYN-ACK reply.
b := c.GetPacket()
- tcp := header.TCP(header.IPv4(b).Payload())
- c.IRS = seqnum.Value(tcp.SequenceNumber())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
ackHeaders := &context.Headers{
SrcPort: context.TestPort,
@@ -269,8 +269,8 @@ func TestConnectResetAfterClose(t *testing.T) {
time.Sleep(3 * time.Second)
for {
b := c.GetPacket()
- tcp := header.TCP(header.IPv4(b).Payload())
- if tcp.Flags() == header.TCPFlagAck|header.TCPFlagFin {
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ if tcpHdr.Flags() == header.TCPFlagAck|header.TCPFlagFin {
// This is a retransmit of the FIN, ignore it.
continue
}
@@ -553,9 +553,13 @@ func TestRstOnCloseWithUnreadData(t *testing.T) {
// We shouldn't consume a sequence number on RST.
checker.SeqNum(uint32(c.IRS)+1),
))
+ // The RST puts the endpoint into an error state.
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
- // This final should be ignored because an ACK on a reset doesn't
- // mean anything.
+ // This final ACK should be ignored because an ACK on a reset doesn't mean
+ // anything.
c.SendPacket(nil, &context.Headers{
SrcPort: context.TestPort,
DstPort: c.Port,
@@ -618,6 +622,10 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
checker.SeqNum(uint32(c.IRS)+1),
))
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
// Cause a RST to be generated by closing the read end now since we have
// unread data.
c.EP.Shutdown(tcpip.ShutdownRead)
@@ -630,6 +638,10 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
// We shouldn't consume a sequence number on RST.
checker.SeqNum(uint32(c.IRS)+1),
))
+ // The RST puts the endpoint into an error state.
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
// The ACK to the FIN should now be rejected since the connection has been
// closed by a RST.
@@ -1510,8 +1522,8 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
for bytesReceived != dataLen {
b := c.GetPacket()
numPackets++
- tcp := header.TCP(header.IPv4(b).Payload())
- payloadLen := len(tcp.Payload())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ payloadLen := len(tcpHdr.Payload())
checker.IPv4(t, b,
checker.TCP(
checker.DstPort(context.TestPort),
@@ -1522,7 +1534,7 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
)
pdata := data[bytesReceived : bytesReceived+payloadLen]
- if p := tcp.Payload(); !bytes.Equal(pdata, p) {
+ if p := tcpHdr.Payload(); !bytes.Equal(pdata, p) {
t.Fatalf("got data = %v, want = %v", p, pdata)
}
bytesReceived += payloadLen
@@ -1530,7 +1542,7 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
if c.TimeStampEnabled {
// If timestamp option is enabled, echo back the timestamp and increment
// the TSEcr value included in the packet and send that back as the TSVal.
- parsedOpts := tcp.ParsedOptions()
+ parsedOpts := tcpHdr.ParsedOptions()
tsOpt := [12]byte{header.TCPOptionNOP, header.TCPOptionNOP}
header.EncodeTSOption(parsedOpts.TSEcr+1, parsedOpts.TSVal, tsOpt[2:])
options = tsOpt[:]
@@ -1757,8 +1769,8 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
),
)
- tcp := header.TCP(header.IPv4(b).Payload())
- c.IRS = seqnum.Value(tcp.SequenceNumber())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
// Wait for retransmit.
time.Sleep(1 * time.Second)
@@ -1766,8 +1778,8 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagSyn),
- checker.SrcPort(tcp.SourcePort()),
- checker.SeqNum(tcp.SequenceNumber()),
+ checker.SrcPort(tcpHdr.SourcePort()),
+ checker.SeqNum(tcpHdr.SequenceNumber()),
checker.TCPSynOptions(header.TCPSynOptions{MSS: mss, WS: wndScale}),
),
)
@@ -1775,8 +1787,8 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
// Send SYN-ACK.
iss := seqnum.Value(789)
c.SendPacket(nil, &context.Headers{
- SrcPort: tcp.DestinationPort(),
- DstPort: tcp.SourcePort(),
+ SrcPort: tcpHdr.DestinationPort(),
+ DstPort: tcpHdr.SourcePort(),
Flags: header.TCPFlagSyn | header.TCPFlagAck,
SeqNum: iss,
AckNum: c.IRS.Add(1),
@@ -2336,491 +2348,6 @@ func TestFinWithPartialAck(t *testing.T) {
})
}
-func TestExponentialIncreaseDuringSlowStart(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in one shot. Packets will only be written at the
- // MTU size though.
- if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
-
- // Double the number of expected packets for the next iteration.
- expected *= 2
- }
-}
-
-func TestCongestionAvoidance(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in one shot. Packets will only be written at the
- // MTU size though.
- if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- // Do slow start for a few iterations.
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- expected = tcp.InitialCwnd << uint(i)
- if i > 0 {
- // Acknowledge all the data received so far if not on
- // first iteration.
- c.SendAck(790, bytesRead)
- }
-
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd (slow start phase).", 50*time.Millisecond)
- }
-
- // Don't acknowledge the first packet of the last packet train. Let's
- // wait for them to time out, which will trigger a restart of slow
- // start, and initialization of ssthresh to cwnd/2.
- rtxOffset := bytesRead - maxPayload*expected
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
-
- // This part is tricky: when the timeout happened, we had "expected"
- // packets pending, cwnd reset to 1, and ssthresh set to expected/2.
- // By acknowledging "expected" packets, the slow-start part will
- // increase cwnd to expected/2 (which "consumes" expected/2-1 of the
- // acknowledgements), then the congestion avoidance part will consume
- // an extra expected/2 acks to take cwnd to expected/2 + 1. One ack
- // remains in the "ack count" (which will cause cwnd to be incremented
- // once it reaches cwnd acks).
- //
- // So we're straight into congestion avoidance with cwnd set to
- // expected/2 + 1.
- //
- // Check that packets trains of cwnd packets are sent, and that cwnd is
- // incremented by 1 after we acknowledge each packet.
- expected = expected/2 + 1
- for i := 0; i < iterations; i++ {
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd (congestion avoidance phase).", 50*time.Millisecond)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
-
- // In cogestion avoidance, the packets trains increase by 1 in
- // each iteration.
- expected++
- }
-}
-
-// cubicCwnd returns an estimate of a cubic window given the
-// originalCwnd, wMax, last congestion event time and sRTT.
-func cubicCwnd(origCwnd int, wMax int, congEventTime time.Time, sRTT time.Duration) int {
- cwnd := float64(origCwnd)
- // We wait 50ms between each iteration so sRTT as computed by cubic
- // should be close to 50ms.
- elapsed := (time.Since(congEventTime) + sRTT).Seconds()
- k := math.Cbrt(float64(wMax) * 0.3 / 0.7)
- wtRTT := 0.4*math.Pow(elapsed-k, 3) + float64(wMax)
- cwnd += (wtRTT - cwnd) / cwnd
- return int(cwnd)
-}
-
-func TestCubicCongestionAvoidance(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- enableCUBIC(t, c)
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
-
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in one shot. Packets will only be written at the
- // MTU size though.
- if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- // Do slow start for a few iterations.
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- expected = tcp.InitialCwnd << uint(i)
- if i > 0 {
- // Acknowledge all the data received so far if not on
- // first iteration.
- c.SendAck(790, bytesRead)
- }
-
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd (during slow-start phase).", 50*time.Millisecond)
- }
-
- // Don't acknowledge the first packet of the last packet train. Let's
- // wait for them to time out, which will trigger a restart of slow
- // start, and initialization of ssthresh to cwnd * 0.7.
- rtxOffset := bytesRead - maxPayload*expected
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- // Acknowledge all pending data.
- c.SendAck(790, bytesRead)
-
- // Store away the time we sent the ACK and assuming a 200ms RTO
- // we estimate that the sender will have an RTO 200ms from now
- // and go back into slow start.
- packetDropTime := time.Now().Add(200 * time.Millisecond)
-
- // This part is tricky: when the timeout happened, we had "expected"
- // packets pending, cwnd reset to 1, and ssthresh set to expected * 0.7.
- // By acknowledging "expected" packets, the slow-start part will
- // increase cwnd to expected/2 essentially putting the connection
- // straight into congestion avoidance.
- wMax := expected
- // Lower expected as per cubic spec after a congestion event.
- expected = int(float64(expected) * 0.7)
- cwnd := expected
- for i := 0; i < iterations; i++ {
- // Cubic grows window independent of ACKs. Cubic Window growth
- // is a function of time elapsed since last congestion event.
- // As a result the congestion window does not grow
- // deterministically in response to ACKs.
- //
- // We need to roughly estimate what the cwnd of the sender is
- // based on when we sent the dupacks.
- cwnd := cubicCwnd(cwnd, wMax, packetDropTime, 50*time.Millisecond)
-
- packetsExpected := cwnd
- for j := 0; j < packetsExpected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
- t.Logf("expected packets received, next trying to receive any extra packets that may come")
-
- // If our estimate was correct there should be no more pending packets.
- // We attempt to read a packet a few times with a short sleep in between
- // to ensure that we don't see the sender send any unexpected packets.
- unexpectedPackets := 0
- for {
- gotPacket := c.ReceiveNonBlockingAndCheckPacket(data, bytesRead, maxPayload)
- if !gotPacket {
- break
- }
- bytesRead += maxPayload
- unexpectedPackets++
- time.Sleep(1 * time.Millisecond)
- }
- if unexpectedPackets != 0 {
- t.Fatalf("received %d unexpected packets for iteration %d", unexpectedPackets, i)
- }
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd(congestion avoidance)", 5*time.Millisecond)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
- }
-}
-
-func TestFastRecovery(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in one shot. Packets will only be written at the
- // MTU size though.
- if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- // Do slow start for a few iterations.
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- expected = tcp.InitialCwnd << uint(i)
- if i > 0 {
- // Acknowledge all the data received so far if not on
- // first iteration.
- c.SendAck(790, bytesRead)
- }
-
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
- }
-
- // Send 3 duplicate acks. This should force an immediate retransmit of
- // the pending packet and put the sender into fast recovery.
- rtxOffset := bytesRead - maxPayload*expected
- for i := 0; i < 3; i++ {
- c.SendAck(790, rtxOffset)
- }
-
- // Receive the retransmitted packet.
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want)
- }
-
- // Now send 7 mode duplicate acks. Each of these should cause a window
- // inflation by 1 and cause the sender to send an extra packet.
- for i := 0; i < 7; i++ {
- c.SendAck(790, rtxOffset)
- }
-
- recover := bytesRead
-
- // Ensure no new packets arrive.
- c.CheckNoPacketTimeout("More packets received than expected during recovery after dupacks for this cwnd.",
- 50*time.Millisecond)
-
- // Acknowledge half of the pending data.
- rtxOffset = bytesRead - expected*maxPayload/2
- c.SendAck(790, rtxOffset)
-
- // Receive the retransmit due to partial ack.
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
- t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
- t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
- }
-
- // Receive the 10 extra packets that should have been released due to
- // the congestion window inflation in recovery.
- for i := 0; i < 10; i++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // A partial ACK during recovery should reduce congestion window by the
- // number acked. Since we had "expected" packets outstanding before sending
- // partial ack and we acked expected/2 , the cwnd and outstanding should
- // be expected/2 + 10 (7 dupAcks + 3 for the original 3 dupacks that triggered
- // fast recovery). Which means the sender should not send any more packets
- // till we ack this one.
- c.CheckNoPacketTimeout("More packets received than expected during recovery after partial ack for this cwnd.",
- 50*time.Millisecond)
-
- // Acknowledge all pending data to recover point.
- c.SendAck(790, recover)
-
- // At this point, the cwnd should reset to expected/2 and there are 10
- // packets outstanding.
- //
- // NOTE: Technically netstack is incorrect in that we adjust the cwnd on
- // the same segment that takes us out of recovery. But because of that
- // the actual cwnd at exit of recovery will be expected/2 + 1 as we
- // acked a cwnd worth of packets which will increase the cwnd further by
- // 1 in congestion avoidance.
- //
- // Now in the first iteration since there are 10 packets outstanding.
- // We would expect to get expected/2 +1 - 10 packets. But subsequent
- // iterations will send us expected/2 + 1 + 1 (per iteration).
- expected = expected/2 + 1 - 10
- for i := 0; i < iterations; i++ {
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout(fmt.Sprintf("More packets received(after deflation) than expected %d for this cwnd.", expected), 50*time.Millisecond)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
-
- // In cogestion avoidance, the packets trains increase by 1 in
- // each iteration.
- if i == 0 {
- // After the first iteration we expect to get the full
- // congestion window worth of packets in every
- // iteration.
- expected += 10
- }
- expected++
- }
-}
-
-func TestRetransmit(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in two shots. Packets will only be written at the
- // MTU size though.
- half := data[:len(data)/2]
- if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
- half = data[len(data)/2:]
- if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- // Do slow start for a few iterations.
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- expected = tcp.InitialCwnd << uint(i)
- if i > 0 {
- // Acknowledge all the data received so far if not on
- // first iteration.
- c.SendAck(790, bytesRead)
- }
-
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
- }
-
- // Wait for a timeout and retransmit.
- rtxOffset := bytesRead - maxPayload*expected
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want)
- }
-
- // Acknowledge half of the pending data.
- rtxOffset = bytesRead - expected*maxPayload/2
- c.SendAck(790, rtxOffset)
-
- // Receive the remaining data, making sure that acknowledged data is not
- // retransmitted.
- for offset := rtxOffset; offset < len(data); offset += maxPayload {
- c.ReceiveAndCheckPacket(data, offset, maxPayload)
- c.SendAck(790, offset+maxPayload)
- }
-
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
-}
-
func TestUpdateListenBacklog(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
@@ -3008,8 +2535,8 @@ func TestReceivedSegmentQueuing(t *testing.T) {
checker.TCPFlags(header.TCPFlagAck),
),
)
- tcp := header.TCP(header.IPv4(b).Payload())
- ack := seqnum.Value(tcp.AckNumber())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ ack := seqnum.Value(tcpHdr.AckNumber())
if ack == last {
break
}
@@ -3053,6 +2580,10 @@ func TestReadAfterClosedState(t *testing.T) {
),
)
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
// Send some data and acknowledge the FIN.
data := []byte{1, 2, 3}
c.SendPacket(data, &context.Headers{
@@ -3074,9 +2605,15 @@ func TestReadAfterClosedState(t *testing.T) {
),
)
- // Give the stack the chance to transition to closed state.
+ // Give the stack the chance to transition to closed state. Note that since
+ // both the sender and receiver are now closed, we effectively skip the
+ // TIME-WAIT state.
time.Sleep(1 * time.Second)
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateClose; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
// Wait for receive to be notified.
select {
case <-ch:
@@ -3668,13 +3205,14 @@ func TestTCPEndpointProbe(t *testing.T) {
}
}
-func TestSetCongestionControl(t *testing.T) {
+func TestStackSetCongestionControl(t *testing.T) {
testCases := []struct {
- cc tcp.CongestionControlOption
- mustPass bool
+ cc tcpip.CongestionControlOption
+ err *tcpip.Error
}{
- {"reno", true},
- {"cubic", true},
+ {"reno", nil},
+ {"cubic", nil},
+ {"blahblah", tcpip.ErrNoSuchFile},
}
for _, tc := range testCases {
@@ -3684,62 +3222,135 @@ func TestSetCongestionControl(t *testing.T) {
s := c.Stack()
- if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != nil && tc.mustPass {
- t.Fatalf("s.SetTransportProtocolOption(%v, %v) = %v, want not-nil", tcp.ProtocolNumber, tc.cc, err)
+ var oldCC tcpip.CongestionControlOption
+ if err := s.TransportProtocolOption(tcp.ProtocolNumber, &oldCC); err != nil {
+ t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &oldCC, err)
}
- var cc tcp.CongestionControlOption
+ if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != tc.err {
+ t.Fatalf("s.SetTransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.cc, err, tc.err)
+ }
+
+ var cc tcpip.CongestionControlOption
if err := s.TransportProtocolOption(tcp.ProtocolNumber, &cc); err != nil {
t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &cc, err)
}
- if got, want := cc, tc.cc; got != want {
+
+ got, want := cc, oldCC
+ // If SetTransportProtocolOption is expected to succeed
+ // then the returned value for congestion control should
+ // match the one specified in the
+ // SetTransportProtocolOption call above, else it should
+ // be what it was before the call to
+ // SetTransportProtocolOption.
+ if tc.err == nil {
+ want = tc.cc
+ }
+ if got != want {
t.Fatalf("got congestion control: %v, want: %v", got, want)
}
})
}
}
-func TestAvailableCongestionControl(t *testing.T) {
+func TestStackAvailableCongestionControl(t *testing.T) {
c := context.New(t, 1500)
defer c.Cleanup()
s := c.Stack()
// Query permitted congestion control algorithms.
- var aCC tcp.AvailableCongestionControlOption
+ var aCC tcpip.AvailableCongestionControlOption
if err := s.TransportProtocolOption(tcp.ProtocolNumber, &aCC); err != nil {
t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &aCC, err)
}
- if got, want := aCC, tcp.AvailableCongestionControlOption("reno cubic"); got != want {
- t.Fatalf("got tcp.AvailableCongestionControlOption: %v, want: %v", got, want)
+ if got, want := aCC, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
+ t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
}
}
-func TestSetAvailableCongestionControl(t *testing.T) {
+func TestStackSetAvailableCongestionControl(t *testing.T) {
c := context.New(t, 1500)
defer c.Cleanup()
s := c.Stack()
// Setting AvailableCongestionControlOption should fail.
- aCC := tcp.AvailableCongestionControlOption("xyz")
+ aCC := tcpip.AvailableCongestionControlOption("xyz")
if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &aCC); err == nil {
t.Fatalf("s.TransportProtocolOption(%v, %v) = nil, want non-nil", tcp.ProtocolNumber, &aCC)
}
// Verify that we still get the expected list of congestion control options.
- var cc tcp.AvailableCongestionControlOption
+ var cc tcpip.AvailableCongestionControlOption
if err := s.TransportProtocolOption(tcp.ProtocolNumber, &cc); err != nil {
t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &cc, err)
}
- if got, want := cc, tcp.AvailableCongestionControlOption("reno cubic"); got != want {
- t.Fatalf("got tcp.AvailableCongestionControlOption: %v, want: %v", got, want)
+ if got, want := cc, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
+ t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
+ }
+}
+
+func TestEndpointSetCongestionControl(t *testing.T) {
+ testCases := []struct {
+ cc tcpip.CongestionControlOption
+ err *tcpip.Error
+ }{
+ {"reno", nil},
+ {"cubic", nil},
+ {"blahblah", tcpip.ErrNoSuchFile},
+ }
+
+ for _, connected := range []bool{false, true} {
+ for _, tc := range testCases {
+ t.Run(fmt.Sprintf("SetSockOpt(.., %v) w/ connected = %v", tc.cc, connected), func(t *testing.T) {
+ c := context.New(t, 1500)
+ defer c.Cleanup()
+
+ // Create TCP endpoint.
+ var err *tcpip.Error
+ c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+ if err != nil {
+ t.Fatalf("NewEndpoint failed: %v", err)
+ }
+
+ var oldCC tcpip.CongestionControlOption
+ if err := c.EP.GetSockOpt(&oldCC); err != nil {
+ t.Fatalf("c.EP.SockOpt(%v) = %v", &oldCC, err)
+ }
+
+ if connected {
+ c.Connect(789 /* iss */, 32768 /* rcvWnd */, nil)
+ }
+
+ if err := c.EP.SetSockOpt(tc.cc); err != tc.err {
+ t.Fatalf("c.EP.SetSockOpt(%v) = %v, want %v", tc.cc, err, tc.err)
+ }
+
+ var cc tcpip.CongestionControlOption
+ if err := c.EP.GetSockOpt(&cc); err != nil {
+ t.Fatalf("c.EP.SockOpt(%v) = %v", &cc, err)
+ }
+
+ got, want := cc, oldCC
+ // If SetSockOpt is expected to succeed then the
+ // returned value for congestion control should match
+ // the one specified in the SetSockOpt above, else it
+ // should be what it was before the call to SetSockOpt.
+ if tc.err == nil {
+ want = tc.cc
+ }
+ if got != want {
+ t.Fatalf("got congestion control: %v, want: %v", got, want)
+ }
+ })
+ }
}
}
func enableCUBIC(t *testing.T, c *context.Context) {
t.Helper()
- opt := tcp.CongestionControlOption("cubic")
+ opt := tcpip.CongestionControlOption("cubic")
if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil {
t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, %v = %v", opt, err)
}
@@ -3868,7 +3479,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
RcvWnd: 30000,
})
- // Receive the SYN-ACK reply.
+	// Receive the SYN-ACK reply.
b := c.GetPacket()
tcp := header.TCP(header.IPv4(b).Payload())
iss = seqnum.Value(tcp.SequenceNumber())
@@ -3932,12 +3543,18 @@ func TestListenBacklogFull(t *testing.T) {
time.Sleep(50 * time.Millisecond)
- // Now execute one more handshake. This should not be completed and
- // delivered on an Accept() call as the backlog is full at this point.
- irs, iss := executeHandshake(t, c, context.TestPort+uint16(listenBacklog), false /* synCookieInUse */)
+	// Now send one more SYN. The stack should not respond as the backlog
+ // is full at this point.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort + 2,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: seqnum.Value(789),
+ RcvWnd: 30000,
+ })
+ c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
- time.Sleep(50 * time.Millisecond)
- // Try to accept the connection.
+ // Try to accept the connections in the backlog.
we, ch := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&we, waiter.EventIn)
defer c.WQ.EventUnregister(&we)
@@ -3969,16 +3586,8 @@ func TestListenBacklogFull(t *testing.T) {
}
}
- // Now craft the ACK again and verify that the connection is now ready
- // to be accepted.
- c.SendPacket(nil, &context.Headers{
- SrcPort: context.TestPort + uint16(listenBacklog),
- DstPort: context.StackPort,
- Flags: header.TCPFlagAck,
- SeqNum: irs + 1,
- AckNum: iss + 1,
- RcvWnd: 30000,
- })
+ // Now a new handshake must succeed.
+ executeHandshake(t, c, context.TestPort+2, false /*synCookieInUse */)
newEP, _, err := c.EP.Accept()
if err == tcpip.ErrWouldBlock {
@@ -3994,6 +3603,7 @@ func TestListenBacklogFull(t *testing.T) {
t.Fatalf("Timed out waiting for accept")
}
}
+
// Now verify that the TCP socket is usable and in a connected state.
data := "Don't panic"
newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
@@ -4004,13 +3614,7 @@ func TestListenBacklogFull(t *testing.T) {
}
}
-func TestListenBacklogFullSynCookieInUse(t *testing.T) {
- saved := tcp.SynRcvdCountThreshold
- defer func() {
- tcp.SynRcvdCountThreshold = saved
- }()
- tcp.SynRcvdCountThreshold = 1
-
+func TestListenSynRcvdQueueFull(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
@@ -4029,48 +3633,72 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
// Test acceptance.
// Start listening.
listenBacklog := 1
- portOffset := uint16(0)
if err := c.EP.Listen(listenBacklog); err != nil {
t.Fatalf("Listen failed: %v", err)
}
- executeHandshake(t, c, context.TestPort+portOffset, false)
- portOffset++
- // Wait for this to be delivered to the accept queue.
- time.Sleep(50 * time.Millisecond)
+ // Send two SYN's the first one should get a SYN-ACK, the
+ // second one should not get any response and is dropped as
+ // the synRcvd count will be equal to backlog.
+ irs := seqnum.Value(789)
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: seqnum.Value(789),
+ RcvWnd: 30000,
+ })
- nonCookieIRS, nonCookieISS := executeHandshake(t, c, context.TestPort+portOffset, false)
+ // Receive the SYN-ACK reply.
+ b := c.GetPacket()
+ tcp := header.TCP(header.IPv4(b).Payload())
+ iss := seqnum.Value(tcp.SequenceNumber())
+ tcpCheckers := []checker.TransportChecker{
+ checker.SrcPort(context.StackPort),
+ checker.DstPort(context.TestPort),
+ checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+ checker.AckNum(uint32(irs) + 1),
+ }
+ checker.IPv4(t, b, checker.TCP(tcpCheckers...))
- // Since the backlog is full at this point this connection will not
- // transition out of handshake and ignore the ACK.
- //
- // At this point there should be 1 completed connection in the backlog
- // and one incomplete one pending for a final ACK and hence not ready to be
- // delivered to the endpoint.
- //
- // Now execute one more handshake. This should not be completed and
- // delivered on an Accept() call as the backlog is full at this point
- // and there is already 1 pending endpoint.
+	// Now send one more SYN. The stack should not respond as the backlog
+ // is full at this point.
//
- // This one should use a SYN cookie as the synRcvdCount is equal to the
- // SynRcvdCountThreshold.
- time.Sleep(50 * time.Millisecond)
- portOffset++
- irs, iss := executeHandshake(t, c, context.TestPort+portOffset, true)
+ // NOTE: we did not complete the handshake for the previous one so the
+ // accept backlog should be empty and there should be one connection in
+ // synRcvd state.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort + 1,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: seqnum.Value(889),
+ RcvWnd: 30000,
+ })
+ c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
- time.Sleep(50 * time.Millisecond)
+ // Now complete the previous connection and verify that there is a connection
+ // to accept.
+ // Send ACK.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: irs + 1,
+ AckNum: iss + 1,
+ RcvWnd: 30000,
+ })
- // Verify that there is only one acceptable connection at this point.
+ // Try to accept the connections in the backlog.
we, ch := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&we, waiter.EventIn)
defer c.WQ.EventUnregister(&we)
- _, _, err = c.EP.Accept()
+ newEP, _, err := c.EP.Accept()
if err == tcpip.ErrWouldBlock {
// Wait for connection to be established.
select {
case <-ch:
- _, _, err = c.EP.Accept()
+ newEP, _, err = c.EP.Accept()
if err != nil {
t.Fatalf("Accept failed: %v", err)
}
@@ -4080,27 +3708,68 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
}
}
- // Now verify that there are no more connections that can be accepted.
- _, _, err = c.EP.Accept()
- if err != tcpip.ErrWouldBlock {
- select {
- case <-ch:
- t.Fatalf("unexpected endpoint delivered on Accept: %+v", c.EP)
- case <-time.After(1 * time.Second):
- }
+ // Now verify that the TCP socket is usable and in a connected state.
+ data := "Don't panic"
+ newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
+ pkt := c.GetPacket()
+ tcp = header.TCP(header.IPv4(pkt).Payload())
+ if string(tcp.Payload()) != data {
+ t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data)
}
+}
- // Now send an ACK for the half completed connection
+func TestListenBacklogFullSynCookieInUse(t *testing.T) {
+ saved := tcp.SynRcvdCountThreshold
+ defer func() {
+ tcp.SynRcvdCountThreshold = saved
+ }()
+ tcp.SynRcvdCountThreshold = 1
+
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ // Create TCP endpoint.
+ var err *tcpip.Error
+ c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+ if err != nil {
+ t.Fatalf("NewEndpoint failed: %v", err)
+ }
+
+ // Bind to wildcard.
+ if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+ t.Fatalf("Bind failed: %v", err)
+ }
+
+ // Test acceptance.
+ // Start listening.
+ listenBacklog := 1
+ portOffset := uint16(0)
+ if err := c.EP.Listen(listenBacklog); err != nil {
+ t.Fatalf("Listen failed: %v", err)
+ }
+
+ executeHandshake(t, c, context.TestPort+portOffset, false)
+ portOffset++
+ // Wait for this to be delivered to the accept queue.
+ time.Sleep(50 * time.Millisecond)
+
+ // Send a SYN request.
+ irs := seqnum.Value(789)
c.SendPacket(nil, &context.Headers{
- SrcPort: context.TestPort + portOffset - 1,
+ SrcPort: context.TestPort,
DstPort: context.StackPort,
- Flags: header.TCPFlagAck,
- SeqNum: nonCookieIRS + 1,
- AckNum: nonCookieISS + 1,
+ Flags: header.TCPFlagSyn,
+ SeqNum: irs,
RcvWnd: 30000,
})
+ // The Syn should be dropped as the endpoint's backlog is full.
+ c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
+
+ // Verify that there is only one acceptable connection at this point.
+ we, ch := waiter.NewChannelEntry(nil)
+ c.WQ.EventRegister(&we, waiter.EventIn)
+ defer c.WQ.EventUnregister(&we)
- // Verify that the connection is now delivered to the backlog.
_, _, err = c.EP.Accept()
if err == tcpip.ErrWouldBlock {
// Wait for connection to be established.
@@ -4116,41 +3785,15 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
}
}
- // Finally send an ACK for the connection that used a cookie and verify that
- // it's also completed and delivered.
- c.SendPacket(nil, &context.Headers{
- SrcPort: context.TestPort + portOffset,
- DstPort: context.StackPort,
- Flags: header.TCPFlagAck,
- SeqNum: irs,
- AckNum: iss,
- RcvWnd: 30000,
- })
-
- time.Sleep(50 * time.Millisecond)
- newEP, _, err := c.EP.Accept()
- if err == tcpip.ErrWouldBlock {
- // Wait for connection to be established.
+ // Now verify that there are no more connections that can be accepted.
+ _, _, err = c.EP.Accept()
+ if err != tcpip.ErrWouldBlock {
select {
case <-ch:
- newEP, _, err = c.EP.Accept()
- if err != nil {
- t.Fatalf("Accept failed: %v", err)
- }
-
+ t.Fatalf("unexpected endpoint delivered on Accept: %+v", c.EP)
case <-time.After(1 * time.Second):
- t.Fatalf("Timed out waiting for accept")
}
}
-
- // Now verify that the TCP socket is usable and in a connected state.
- data := "Don't panic"
- newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
- b := c.GetPacket()
- tcp := header.TCP(header.IPv4(b).Payload())
- if string(tcp.Payload()) != data {
- t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data)
- }
}
func TestPassiveConnectionAttemptIncrement(t *testing.T) {
@@ -4165,9 +3808,15 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) {
if err := ep.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil {
t.Fatalf("Bind failed: %v", err)
}
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
if err := c.EP.Listen(1); err != nil {
t.Fatalf("Listen failed: %v", err)
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateListen; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
stats := c.Stack().Stats()
want := stats.TCP.PassiveConnectionOpenings.Value() + 1
@@ -4218,18 +3867,12 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
}
srcPort := uint16(context.TestPort)
- // Now attempt 3 handshakes, the first two will fill up the accept and the SYN-RCVD
- // queue for the endpoint.
+	// Now attempt a handshake; it will fill up the accept backlog.
executeHandshake(t, c, srcPort, false)
// Give time for the final ACK to be processed as otherwise the next handshake could
// get accepted before the previous one based on goroutine scheduling.
time.Sleep(50 * time.Millisecond)
- irs, iss := executeHandshake(t, c, srcPort+1, false)
-
- // Wait for a short while for the accepted connection to be delivered to
- // the channel before trying to send the 3rd SYN.
- time.Sleep(40 * time.Millisecond)
want := stats.TCP.ListenOverflowSynDrop.Value() + 1
@@ -4267,26 +3910,44 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
t.Fatalf("Timed out waiting for accept")
}
}
+}
- // Now complete the next connection in SYN-RCVD state as it should
- // have dropped the final ACK to the handshake due to accept queue
- // being full.
- c.SendPacket(nil, &context.Headers{
- SrcPort: srcPort + 1,
- DstPort: context.StackPort,
- Flags: header.TCPFlagAck,
- SeqNum: irs + 1,
- AckNum: iss + 1,
- RcvWnd: 30000,
- })
+func TestEndpointBindListenAcceptState(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+ wq := &waiter.Queue{}
+ ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+ if err != nil {
+ t.Fatalf("NewEndpoint failed: %v", err)
+ }
- // Now check that there is one more acceptable connections.
- _, _, err = c.EP.Accept()
+ if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+ t.Fatalf("Bind failed: %v", err)
+ }
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
+ if err := ep.Listen(10); err != nil {
+ t.Fatalf("Listen failed: %v", err)
+ }
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
+ c.PassiveConnectWithOptions(100, 5, header.TCPSynOptions{MSS: defaultIPv4MSS})
+
+ // Try to accept the connection.
+ we, ch := waiter.NewChannelEntry(nil)
+ wq.EventRegister(&we, waiter.EventIn)
+ defer wq.EventUnregister(&we)
+
+ aep, _, err := ep.Accept()
if err == tcpip.ErrWouldBlock {
// Wait for connection to be established.
select {
case <-ch:
- _, _, err = c.EP.Accept()
+ aep, _, err = ep.Accept()
if err != nil {
t.Fatalf("Accept failed: %v", err)
}
@@ -4295,19 +3956,23 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
t.Fatalf("Timed out waiting for accept")
}
}
+ if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+ // Listening endpoint remains in listen state.
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
- // Try and accept a 3rd one this should fail.
- _, _, err = c.EP.Accept()
- if err == tcpip.ErrWouldBlock {
- // Wait for connection to be established.
- select {
- case <-ch:
- ep, _, err = c.EP.Accept()
- if err == nil {
- t.Fatalf("Accept succeeded when it should have failed got: %+v", ep)
- }
-
- case <-time.After(1 * time.Second):
- }
+ ep.Close()
+ // Give worker goroutines time to receive the close notification.
+ time.Sleep(1 * time.Second)
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+ // Accepted endpoint remains open when the listen endpoint is closed.
+ if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
}
+
}
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 6e12413c6..a4d89e24d 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -520,32 +520,21 @@ func (c *Context) CreateConnected(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf
c.CreateConnectedWithRawOptions(iss, rcvWnd, epRcvBuf, nil)
}
-// CreateConnectedWithRawOptions creates a connected TCP endpoint and sends
-// the specified option bytes as the Option field in the initial SYN packet.
+// Connect performs the 3-way handshake for c.EP with the provided Initial
+// Sequence Number (iss) and receive window (rcvWnd), and any options, if
+// specified.
//
// It also sets the receive buffer for the endpoint to the specified
// value in epRcvBuf.
-func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf *tcpip.ReceiveBufferSizeOption, options []byte) {
- // Create TCP endpoint.
- var err *tcpip.Error
- c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
- if err != nil {
- c.t.Fatalf("NewEndpoint failed: %v", err)
- }
-
- if epRcvBuf != nil {
- if err := c.EP.SetSockOpt(*epRcvBuf); err != nil {
- c.t.Fatalf("SetSockOpt failed failed: %v", err)
- }
- }
-
+//
+// PreCondition: c.EP must already be created.
+func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte) {
// Start connection attempt.
waitEntry, notifyCh := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&waitEntry, waiter.EventOut)
defer c.WQ.EventUnregister(&waitEntry)
- err = c.EP.Connect(tcpip.FullAddress{Addr: TestAddr, Port: TestPort})
- if err != tcpip.ErrConnectStarted {
+ if err := c.EP.Connect(tcpip.FullAddress{Addr: TestAddr, Port: TestPort}); err != tcpip.ErrConnectStarted {
c.t.Fatalf("Unexpected return value from Connect: %v", err)
}
@@ -557,13 +546,16 @@ func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.
checker.TCPFlags(header.TCPFlagSyn),
),
)
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
- tcp := header.TCP(header.IPv4(b).Payload())
- c.IRS = seqnum.Value(tcp.SequenceNumber())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
c.SendPacket(nil, &Headers{
- SrcPort: tcp.DestinationPort(),
- DstPort: tcp.SourcePort(),
+ SrcPort: tcpHdr.DestinationPort(),
+ DstPort: tcpHdr.SourcePort(),
Flags: header.TCPFlagSyn | header.TCPFlagAck,
SeqNum: iss,
AckNum: c.IRS.Add(1),
@@ -584,15 +576,38 @@ func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.
// Wait for connection to be established.
select {
case <-notifyCh:
- err = c.EP.GetSockOpt(tcpip.ErrorOption{})
- if err != nil {
+ if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
c.t.Fatalf("Unexpected error when connecting: %v", err)
}
case <-time.After(1 * time.Second):
c.t.Fatalf("Timed out waiting for connection")
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
+ c.Port = tcpHdr.SourcePort()
+}
+
+// CreateConnectedWithRawOptions creates a connected TCP endpoint and sends
+// the specified option bytes as the Option field in the initial SYN packet.
+//
+// It also sets the receive buffer for the endpoint to the specified
+// value in epRcvBuf.
+func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf *tcpip.ReceiveBufferSizeOption, options []byte) {
+ // Create TCP endpoint.
+ var err *tcpip.Error
+ c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+ if err != nil {
+ c.t.Fatalf("NewEndpoint failed: %v", err)
+ }
- c.Port = tcp.SourcePort()
+ if epRcvBuf != nil {
+ if err := c.EP.SetSockOpt(*epRcvBuf); err != nil {
+ c.t.Fatalf("SetSockOpt failed failed: %v", err)
+ }
+ }
+ c.Connect(iss, rcvWnd, options)
}
// RawEndpoint is just a small wrapper around a TCP endpoint's state to make
@@ -690,6 +705,9 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
if err != nil {
c.t.Fatalf("c.s.NewEndpoint(tcp, ipv4...) = %v", err)
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateInitial; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
// Start connection attempt.
waitEntry, notifyCh := waiter.NewChannelEntry(nil)
@@ -719,6 +737,10 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
}),
),
)
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
tcpSeg := header.TCP(header.IPv4(b).Payload())
synOptions := header.ParseSynOptions(tcpSeg.Options(), false)
@@ -782,6 +804,9 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
case <-time.After(1 * time.Second):
c.t.Fatalf("Timed out waiting for connection")
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
// Store the source port in use by the endpoint.
c.Port = tcpSeg.SourcePort()
@@ -821,10 +846,16 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption
if err := ep.Bind(tcpip.FullAddress{Port: StackPort}); err != nil {
c.t.Fatalf("Bind failed: %v", err)
}
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+ c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
if err := ep.Listen(10); err != nil {
c.t.Fatalf("Listen failed: %v", err)
}
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+ c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
rep := c.PassiveConnectWithOptions(100, wndScale, synOptions)
@@ -847,6 +878,10 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption
c.t.Fatalf("Timed out waiting for accept")
}
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+ c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
return rep
}
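
The hunks above split endpoint creation out of CreateConnectedWithRawOptions: Connect now assumes c.EP already exists and only drives the 3-way handshake, asserting the SynSent and Established transitions along the way. A minimal sketch of calling it from a test in the tcp_test package (reusing that package's existing imports; t is the test's *testing.T, and the iss/rcvWnd values are illustrative rather than taken from this change):

    c := context.New(t, defaultMTU)
    defer c.Cleanup()

    // Connect's precondition is that c.EP already exists, so create it first.
    var err *tcpip.Error
    c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
    if err != nil {
        t.Fatalf("NewEndpoint failed: %v", err)
    }

    // Drive the handshake; on return the endpoint is Established and c.Port
    // holds the source port the endpoint bound to.
    c.Connect(789 /* iss */, 30000 /* rcvWnd */, nil /* options */)
    if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
        t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
    }
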
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 3d52a4f31..fa7278286 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1000,3 +1000,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
}
+
+// State implements socket.Socket.State.
+func (e *endpoint) State() uint32 {
+ // TODO(b/112063468): Translate internal state to values returned by Linux.
+ return 0
+}
diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go
index 0f155ec74..4ea684659 100644
--- a/pkg/urpc/urpc.go
+++ b/pkg/urpc/urpc.go
@@ -35,7 +35,7 @@ import (
)
// maxFiles determines the maximum file payload.
-const maxFiles = 16
+const maxFiles = 32
// ErrTooManyFiles is returned when too many file descriptors are mapped.
var ErrTooManyFiles = errors.New("too many files")
diff --git a/runsc/BUILD b/runsc/BUILD
index af8e928c5..8a57c597b 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,6 +1,4 @@
-package(
- licenses = ["notice"], # Apache 2.0
-)
+package(licenses = ["notice"]) # Apache 2.0
load("@io_bazel_rules_go//go:def.bzl", "go_binary")
load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar")
@@ -84,8 +82,9 @@ pkg_tar(
genrule(
name = "deb-version",
outs = ["version.txt"],
- cmd = "cat bazel-out/volatile-status.txt | grep VERSION | sed 's/^[^0-9]*//' >$@",
+ cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@",
stamp = 1,
+ tools = [":runsc"],
)
pkg_deb(
@@ -98,4 +97,7 @@ pkg_deb(
package = "runsc",
postinst = "debian/postinst.sh",
version_file = ":version.txt",
+ visibility = [
+ "//visibility:public",
+ ],
)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index df9907e52..744f852a1 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -16,6 +16,7 @@ go_library(
"limits.go",
"loader.go",
"network.go",
+ "pprof.go",
"strace.go",
],
importpath = "gvisor.googlesource.com/gvisor/runsc/boot",
@@ -30,6 +31,7 @@ go_library(
"//pkg/cpuid",
"//pkg/eventchannel",
"//pkg/log",
+ "//pkg/memutil",
"//pkg/rand",
"//pkg/sentry/arch",
"//pkg/sentry/arch:registers_go_proto",
@@ -51,7 +53,6 @@ go_library(
"//pkg/sentry/kernel/kdefs",
"//pkg/sentry/limits",
"//pkg/sentry/loader",
- "//pkg/sentry/memutil",
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
"//pkg/sentry/platform/kvm",
@@ -94,6 +95,7 @@ go_test(
size = "small",
srcs = [
"compat_test.go",
+ "fs_test.go",
"loader_test.go",
],
embed = [":boot"],
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 15f624f9b..6112b6c0a 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -221,6 +221,17 @@ type Config struct {
// user, and without chrooting the sandbox process. This can be
// necessary in test environments that have limited capabilities.
TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+ // NumNetworkChannels controls the number of AF_PACKET sockets that map
+ // to the same underlying network device. This allows netstack to better
+ // scale for high throughput use cases.
+ NumNetworkChannels int
+
+ // Rootless allows the sandbox to be started with a user that is not root.
+ // Defense in depth measures are weaker with rootless. Specifically, the
+ // sandbox and Gofer process run as root inside a user namespace with root
+ // mapped to the caller's user.
+ Rootless bool
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -244,6 +255,8 @@ func (c *Config) ToFlags() []string {
"--panic-signal=" + strconv.Itoa(c.PanicSignal),
"--profile=" + strconv.FormatBool(c.ProfileEnable),
"--net-raw=" + strconv.FormatBool(c.EnableRaw),
+ "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
+ "--rootless=" + strconv.FormatBool(c.Rootless),
}
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
// Only include if set since it is never to be used by users.
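
The two new Config fields are wired into ToFlags just above. A minimal, illustrative sketch of how they surface as flags when a caller imports runsc/boot and fills a Config by hand (real configurations are built from runsc's flag parsing; all other fields are left at their zero values here, so this is not a working sandbox configuration):

    package main

    import (
        "fmt"

        "gvisor.googlesource.com/gvisor/runsc/boot"
    )

    func main() {
        conf := &boot.Config{
            NumNetworkChannels: 4,    // four AF_PACKET channels per network device
            Rootless:           true, // allow starting the sandbox as a non-root user
        }
        // The returned slice includes "--num-network-channels=4" and
        // "--rootless=true" alongside the pre-existing flags.
        for _, f := range conf.ToFlags() {
            fmt.Println(f)
        }
    }
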
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 72ab9ef86..26765cc46 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -237,7 +237,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
}
- err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+ err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
if err != nil {
log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
return err
@@ -340,8 +340,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
- fds := &fdDispenser{fds: cm.l.goferFDs}
- renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds)
+ mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints)
+ renv, err := mntr.createRestoreEnvironment(cm.l.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
}
@@ -359,6 +359,17 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("file cannot be empty")
}
+ if cm.l.conf.ProfileEnable {
+ // initializePProf opens /proc/self/maps, so has to be
+ // called before installing seccomp filters.
+ initializePProf()
+ }
+
+ // Seccomp filters have to be applied before parsing the state file.
+ if err := cm.l.installSeccompFilters(); err != nil {
+ return err
+ }
+
// Load the state.
loadOpts := state.LoadOpts{Source: specFile}
if err := loadOpts.Load(k, networkStack); err != nil {
@@ -369,11 +380,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
k.Timekeeper().SetClocks(time.NewCalibratedClocks())
// Since we have a new kernel we also must make a new watchdog.
- watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+ dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
// Change the loader fields to reflect the changes made when restoring.
cm.l.k = k
- cm.l.watchdog = watchdog
+ cm.l.watchdog = dog
cm.l.rootProcArgs = kernel.CreateProcessArgs{}
cm.l.restore = true
@@ -420,16 +431,12 @@ type WaitPIDArgs struct {
// CID is the container ID.
CID string
-
- // ClearStatus determines whether the exit status of the process should
- // be cleared when WaitPID returns.
- ClearStatus bool
}
// WaitPID waits for the process with PID 'pid' in the sandbox.
func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
log.Debugf("containerManager.Wait")
- return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus)
+ return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
}
// SignalDeliveryMode enumerates different signal delivery modes.
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 4e428b49c..0811e10f4 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -28,11 +28,12 @@ import (
// createFDMap creates an FD map that contains stdin, stdout, and stderr. If
// console is true, then ioctl calls will be passed through to the host FD.
// Upon success, createFDMap dups then closes stdioFDs.
-func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
+func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
if len(stdioFDs) != 3 {
return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
+ k := kernel.KernelFromContext(ctx)
fdm := k.NewFDMap()
defer fdm.DecRef()
mounter := fs.FileOwnerFromContext(ctx)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 652da1cef..ef2dbfad2 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -246,6 +246,10 @@ var allowedSyscalls = seccomp.SyscallRules{
},
syscall.SYS_SETITIMER: {},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
+ // Used by fs/host to shutdown host sockets.
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+ // Used by unet to shutdown connections.
{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
},
syscall.SYS_SIGALTSTACK: {},
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 4b1557b9a..2fa0725d1 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -18,6 +18,7 @@ import (
"fmt"
"path"
"path/filepath"
+ "sort"
"strconv"
"strings"
"syscall"
@@ -29,9 +30,6 @@ import (
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
- "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
@@ -40,6 +38,8 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/syserror"
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
@@ -51,6 +51,9 @@ const (
// Device name for root mount.
rootDevice = "9pfs-/"
+ // MountPrefix is the annotation prefix for mount hints.
+ MountPrefix = "gvisor.dev/spec/mount"
+
// ChildContainersDir is the directory where child container root
// filesystems are mounted.
ChildContainersDir = "/__runsc_containers__"
@@ -65,67 +68,24 @@ const (
nonefs = "none"
)
-type fdDispenser struct {
- fds []int
-}
-
-func (f *fdDispenser) remove() int {
- if f.empty() {
- panic("fdDispenser out of fds")
- }
- rv := f.fds[0]
- f.fds = f.fds[1:]
- return rv
-}
-
-func (f *fdDispenser) empty() bool {
- return len(f.fds) == 0
-}
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+ // Upper layer uses the same flags as lower, but it must be read-write.
+ upperFlags := lowerFlags
+ upperFlags.ReadOnly = false
-func adjustDirentCache(k *kernel.Kernel) error {
- var hl syscall.Rlimit
- if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
- return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
- }
- if int64(hl.Cur) != syscall.RLIM_INFINITY {
- newSize := hl.Cur / 2
- if newSize < gofer.DefaultDirentCacheSize {
- log.Infof("Setting gofer dirent cache size to %d", newSize)
- gofer.DefaultDirentCacheSize = newSize
- k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
- }
+ tmpFS := mustFindFilesystem("tmpfs")
+ if !fs.IsDir(lower.StableAttr) {
+ // Create overlay on top of mount file, e.g. /etc/hostname.
+ msrc := fs.NewCachingMountSource(tmpFS, upperFlags)
+ return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
}
- return nil
-}
-// setupRootContainerFS creates a mount namespace containing the root filesystem
-// and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// 'setMountNS' is called after namespace is created. It must set the mount NS
-// to 'rootCtx'.
-func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error {
- mounts := compileMounts(spec)
-
- // Create a tmpfs mount where we create and mount a root filesystem for
- // each child container.
- mounts = append(mounts, specs.Mount{
- Type: tmpfs,
- Destination: ChildContainersDir,
- })
-
- fds := &fdDispenser{fds: goferFDs}
- rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts)
- if err != nil {
- return fmt.Errorf("creating root mount: %v", err)
- }
- mns, err := fs.NewMountNamespace(userCtx, rootInode)
+ // Create overlay on top of mount dir.
+ upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
if err != nil {
- return fmt.Errorf("creating root mount namespace: %v", err)
+ return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
}
- setMountNS(mns)
-
- root := mns.Root()
- defer root.DecRef()
- return mountSubmounts(rootCtx, conf, mns, root, mounts, fds)
+ return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
}
// compileMounts returns the supported mounts from the mount spec, adding any
@@ -184,186 +144,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
return mounts
}
-// createRootMount creates the root filesystem.
-func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
- // First construct the filesystem from the spec.Root.
- mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay}
-
- var (
- rootInode *fs.Inode
- err error
- )
-
- fd := fds.remove()
- log.Infof("Mounting root over 9P, ioFD: %d", fd)
- p9FS := mustFindFilesystem("9p")
- opts := p9MountOptions(fd, conf.FileAccess)
- rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
- if err != nil {
- return nil, fmt.Errorf("creating root mount point: %v", err)
- }
-
- // We need to overlay the root on top of a ramfs with stub directories
- // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
- // mounted even if they are not in the spec.
- submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
- rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
- if err != nil {
- return nil, fmt.Errorf("adding submount overlay: %v", err)
- }
-
- if conf.Overlay && !spec.Root.Readonly {
- log.Debugf("Adding overlay on top of root mount")
- // Overlay a tmpfs filesystem on top of the root.
- rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
- if err != nil {
- return nil, err
- }
- }
-
- log.Infof("Mounted %q to %q type root", spec.Root.Path, "/")
- return rootInode, nil
-}
-
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
- // Upper layer uses the same flags as lower, but it must be read-write.
- lowerFlags.ReadOnly = false
-
- tmpFS := mustFindFilesystem("tmpfs")
- if !fs.IsDir(lower.StableAttr) {
- // Create overlay on top of mount file, e.g. /etc/hostname.
- msrc := fs.NewCachingMountSource(tmpFS, lowerFlags)
- return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags)
- }
-
- // Create overlay on top of mount dir.
- upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil)
- if err != nil {
- return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
- }
- return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags)
-}
-
-// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
-// used for mounts.
-func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
- var (
- fsName string
- opts []string
- useOverlay bool
- err error
- )
-
- switch m.Type {
- case devpts, devtmpfs, proc, sysfs:
- fsName = m.Type
- case nonefs:
- fsName = sysfs
- case tmpfs:
- fsName = m.Type
-
- // tmpfs has some extra supported options that we must pass through.
- opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
-
- case bind:
- fd := fds.remove()
- fsName = "9p"
- // Non-root bind mounts are always shared.
- opts = p9MountOptions(fd, FileAccessShared)
- // If configured, add overlay to all writable mounts.
- useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
-
- default:
- // TODO(nlacasse): Support all the mount types and make this a
- // fatal error. Most applications will "just work" without
- // them, so this is a warning for now.
- // we do not support.
- log.Warningf("ignoring unknown filesystem type %q", m.Type)
- }
- return fsName, opts, useOverlay, err
-}
-
-func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error {
- for _, m := range mounts {
- if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil {
- return fmt.Errorf("mount submount %q: %v", m.Destination, err)
- }
- }
-
- if err := mountTmp(ctx, conf, mns, root, mounts); err != nil {
- return fmt.Errorf("mount submount %q: %v", "tmp", err)
- }
-
- if !fds.empty() {
- return fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
- }
- return nil
-}
-
-// mountSubmount mounts volumes inside the container's root. Because mounts may
-// be readonly, a lower ramfs overlay is added to create the mount point dir.
-// Another overlay is added with tmpfs on top if Config.Overlay is true.
-// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error {
- // Map mount type to filesystem name, and parse out the options that we are
- // capable of dealing with.
- fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
-
- // Return the error or nil that corresponds to the default case in getMountNameAndOptions.
- if err != nil {
- return err
- }
- if fsName == "" {
- return nil
- }
-
- // All filesystem names should have been mapped to something we know.
- filesystem := mustFindFilesystem(fsName)
-
- mf := mountFlags(m.Options)
- if useOverlay {
- // All writes go to upper, be paranoid and make lower readonly.
- mf.ReadOnly = true
- }
-
- inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
- if err != nil {
- return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
- }
-
- // If there are submounts, we need to overlay the mount on top of a
- // ramfs with stub directories for submount paths.
- submounts := subtargets(m.Destination, mounts)
- if len(submounts) > 0 {
- log.Infof("Adding submount overlay over %q", m.Destination)
- inode, err = addSubmountOverlay(ctx, inode, submounts)
- if err != nil {
- return fmt.Errorf("adding submount overlay: %v", err)
- }
- }
-
- if useOverlay {
- log.Debugf("Adding overlay on top of mount %q", m.Destination)
- inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
- if err != nil {
- return err
- }
- }
-
- maxTraversals := uint(0)
- dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
- if err != nil {
- return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
- }
- defer dirent.DecRef()
- if err := mns.Mount(ctx, dirent, inode); err != nil {
- return fmt.Errorf("mount %q error: %v", m.Destination, err)
- }
-
- log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
- return nil
-}
-
// p9MountOptions creates a slice of options for a p9 mount.
func p9MountOptions(fd int, fa FileAccessType) []string {
opts := []string{
@@ -416,82 +196,6 @@ func mountDevice(m specs.Mount) string {
return "none"
}
-// addRestoreMount adds a mount to the MountSources map used for restoring a
-// checkpointed container.
-func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
- fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
-
- // Return the error or nil that corresponds to the default case in getMountNameAndOptions.
- if err != nil {
- return err
- }
- // TODO(nlacasse): Fix this when we support all the mount types and
- // make this a fatal error.
- if fsName == "" {
- return nil
- }
-
- newMount := fs.MountArgs{
- Dev: mountDevice(m),
- Flags: mountFlags(m.Options),
- DataString: strings.Join(opts, ","),
- }
- if useOverlay {
- newMount.Flags.ReadOnly = true
- }
- renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
- log.Infof("Added mount at %q: %+v", fsName, newMount)
- return nil
-}
-
-// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
-// to the environment.
-func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
- renv := &fs.RestoreEnvironment{
- MountSources: make(map[string][]fs.MountArgs),
- }
-
- // Add root mount.
- fd := fds.remove()
- opts := p9MountOptions(fd, conf.FileAccess)
-
- mf := fs.MountSourceFlags{}
- if spec.Root.Readonly || conf.Overlay {
- mf.ReadOnly = true
- }
-
- rootMount := fs.MountArgs{
- Dev: rootDevice,
- Flags: mf,
- DataString: strings.Join(opts, ","),
- }
- renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
-
- // Add submounts.
- var tmpMounted bool
- for _, m := range compileMounts(spec) {
- if err := addRestoreMount(conf, renv, m, fds); err != nil {
- return nil, err
- }
- if filepath.Clean(m.Destination) == "/tmp" {
- tmpMounted = true
- }
- }
-
- // TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
- if !tmpMounted {
- tmpMount := specs.Mount{
- Type: tmpfs,
- Destination: "/tmp",
- }
- if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil {
- return nil, err
- }
- }
-
- return renv, nil
-}
-
func mountFlags(opts []string) fs.MountSourceFlags {
mf := fs.MountSourceFlags{}
for _, o := range opts {
@@ -546,22 +250,254 @@ func subtargets(root string, mnts []specs.Mount) []string {
return targets
}
-// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
-// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
-func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
- ctx := procArgs.NewContext(k)
-
- // Create the FD map, which will set stdin, stdout, and stderr. If console
- // is true, then ioctl calls will be passed through to the host fd.
- fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
+// setExecutablePath sets the procArgs.Filename by searching the PATH for an
+// executable matching the procArgs.Argv[0].
+func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
+ paths := fs.GetPath(procArgs.Envv)
+ exe := procArgs.Argv[0]
+ f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
if err != nil {
- return fmt.Errorf("importing fds: %v", err)
+ return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+ }
+ procArgs.Filename = f
+ return nil
+}
+
+func adjustDirentCache(k *kernel.Kernel) error {
+ var hl syscall.Rlimit
+ if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
+ return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
}
+ if int64(hl.Cur) != syscall.RLIM_INFINITY {
+ newSize := hl.Cur / 2
+ if newSize < gofer.DefaultDirentCacheSize {
+ log.Infof("Setting gofer dirent cache size to %d", newSize)
+ gofer.DefaultDirentCacheSize = newSize
+ k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
+ }
+ }
+ return nil
+}
- // CreateProcess takes a reference on FDMap if successful. We
- // won't need ours either way.
- procArgs.FDMap = fdm
+type fdDispenser struct {
+ fds []int
+}
+func (f *fdDispenser) remove() int {
+ if f.empty() {
+ panic("fdDispenser out of fds")
+ }
+ rv := f.fds[0]
+ f.fds = f.fds[1:]
+ return rv
+}
+
+func (f *fdDispenser) empty() bool {
+ return len(f.fds) == 0
+}
+
+type shareType int
+
+const (
+ invalid shareType = iota
+
+ // container shareType indicates that the mount is used by a single container.
+ container
+
+ // pod shareType indicates that the mount is used by more than one container
+ // inside the pod.
+ pod
+
+ // shared shareType indicates that the mount can also be shared with a process
+ // outside the pod, e.g. NFS.
+ shared
+)
+
+func parseShare(val string) (shareType, error) {
+ switch val {
+ case "container":
+ return container, nil
+ case "pod":
+ return pod, nil
+ case "shared":
+ return shared, nil
+ default:
+ return 0, fmt.Errorf("invalid share value %q", val)
+ }
+}
+
+func (s shareType) String() string {
+ switch s {
+ case invalid:
+ return "invalid"
+ case container:
+ return "container"
+ case pod:
+ return "pod"
+ case shared:
+ return "shared"
+ default:
+ return fmt.Sprintf("invalid share value %d", s)
+ }
+}
+
+// mountHint represents extra information about mounts that are provided via
+// annotations. They can override mount type, and provide sharing information
+// so that mounts can be correctly shared inside the pod.
+type mountHint struct {
+ name string
+ share shareType
+ mount specs.Mount
+
+ // root is the inode where the volume is mounted. For mounts with 'pod' share,
+ // the volume is mounted once and then bind mounted inside the containers.
+ root *fs.Inode
+}
+
+func (m *mountHint) setField(key, val string) error {
+ switch key {
+ case "source":
+ if len(val) == 0 {
+ return fmt.Errorf("source cannot be empty")
+ }
+ m.mount.Source = val
+ case "type":
+ return m.setType(val)
+ case "share":
+ share, err := parseShare(val)
+ if err != nil {
+ return err
+ }
+ m.share = share
+ case "options":
+ return m.setOptions(val)
+ default:
+ return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
+ }
+ return nil
+}
+
+func (m *mountHint) setType(val string) error {
+ switch val {
+ case "tmpfs", "bind":
+ m.mount.Type = val
+ default:
+ return fmt.Errorf("invalid type %q", val)
+ }
+ return nil
+}
+
+func (m *mountHint) setOptions(val string) error {
+ opts := strings.Split(val, ",")
+ if err := specutils.ValidateMountOptions(opts); err != nil {
+ return err
+ }
+ // Sort options so it can be compared with container mount options later on.
+ sort.Strings(opts)
+ m.mount.Options = opts
+ return nil
+}
+
+func (m *mountHint) isSupported() bool {
+ return m.mount.Type == tmpfs && m.share == pod
+}
+
+// podMountHints contains a collection of mountHints for the pod.
+type podMountHints struct {
+ mounts map[string]*mountHint
+}
+
+func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
+ mnts := make(map[string]*mountHint)
+ for k, v := range spec.Annotations {
+ // Look for 'gvisor.dev/spec/mount' annotations and parse them.
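+ // Annotation keys take the form MountPrefix/<name>/<field>, which splits
+ // into exactly five '/'-separated parts.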
+ if strings.HasPrefix(k, MountPrefix) {
+ parts := strings.Split(k, "/")
+ if len(parts) != 5 {
+ return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
+ }
+ name := parts[3]
+ if len(name) == 0 || path.Clean(name) != name {
+ return nil, fmt.Errorf("invalid mount name: %s", name)
+ }
+ mnt := mnts[name]
+ if mnt == nil {
+ mnt = &mountHint{name: name}
+ mnts[name] = mnt
+ }
+ if err := mnt.setField(parts[4], v); err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ // Validate all hints after done parsing.
+ for name, m := range mnts {
+ log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
+ if m.share == invalid {
+ return nil, fmt.Errorf("share field for %q has not been set", m.name)
+ }
+ if len(m.mount.Source) == 0 {
+ return nil, fmt.Errorf("source field for %q has not been set", m.name)
+ }
+ if len(m.mount.Type) == 0 {
+ return nil, fmt.Errorf("type field for %q has not been set", m.name)
+ }
+
+ // Check for duplicate mount sources.
+ for name2, m2 := range mnts {
+ if name != name2 && m.mount.Source == m2.mount.Source {
+ return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
+ }
+ }
+ }
+
+ return &podMountHints{mounts: mnts}, nil
+}
+
+func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
+ for _, m := range p.mounts {
+ if m.mount.Source == mount.Source {
+ return m
+ }
+ }
+ return nil
+}
+
+type containerMounter struct {
+ // cid is the container ID. May be set to empty for the root container.
+ cid string
+
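+ // root is the spec's root filesystem, used to create the container's
+ // root (9P) mount.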
+ root *specs.Root
+
+ // mounts is the set of submounts for the container. It's a copy from the spec
+ // that may be freely modified without affecting the original spec.
+ mounts []specs.Mount
+
+ // fds is the list of FDs to be dispensed for mounts that require it.
+ fds fdDispenser
+
+ k *kernel.Kernel
+
+ hints *podMountHints
+}
+
+func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+ return &containerMounter{
+ cid: cid,
+ root: spec.Root,
+ mounts: compileMounts(spec),
+ fds: fdDispenser{fds: goferFDs},
+ k: k,
+ hints: hints,
+ }
+}
+
+// setupFS is used to set up the file system for containers and amend
+// the procArgs accordingly. This is the main entry point for this rest of
+// functions in this file. procArgs are passed by reference and the FDMap field
+// is modified. It dups stdioFDs.
+func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
// Use root user to configure mounts. The current user might not have
// permission to do so.
rootProcArgs := kernel.CreateProcessArgs{
@@ -570,16 +506,19 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
Umask: 0022,
MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
}
- rootCtx := rootProcArgs.NewContext(k)
+ rootCtx := rootProcArgs.NewContext(c.k)
// If this is the root container, we also need to setup the root mount
// namespace.
- mns := k.RootMountNamespace()
+ mns := c.k.RootMountNamespace()
if mns == nil {
// Setup the root container.
- return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) {
- k.SetRootMountNamespace(mns)
- })
+ if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) {
+ c.k.SetRootMountNamespace(mns)
+ }); err != nil {
+ return err
+ }
+ return c.checkDispenser()
}
// Setup a child container.
@@ -593,18 +532,17 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
if err != nil {
return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
}
- if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil {
- return fmt.Errorf("create directory %q: %v", cid, err)
+ if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil {
+ return fmt.Errorf("create directory %q: %v", c.cid, err)
}
- containerRoot, err := contDir.Walk(ctx, globalRoot, cid)
+ containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid)
if err != nil {
- return fmt.Errorf("walk to %q failed: %v", cid, err)
+ return fmt.Errorf("walk to %q failed: %v", c.cid, err)
}
defer containerRoot.DecRef()
// Create the container's root filesystem mount.
- fds := &fdDispenser{fds: goferFDs}
- rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
+ rootInode, err := c.createRootMount(rootCtx, conf)
if err != nil {
return fmt.Errorf("creating filesystem for container: %v", err)
}
@@ -614,39 +552,32 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
return fmt.Errorf("mount container root: %v", err)
}
- // We have to re-walk to the dirent to find the mounted
- // directory. The old dirent is invalid at this point.
- containerRoot, err = contDir.Walk(ctx, globalRoot, cid)
+ // We have to re-walk to the dirent to find the mounted directory. The old
+ // dirent is invalid at this point.
+ containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid)
if err != nil {
- return fmt.Errorf("find container mount point %q: %v", cid, err)
+ return fmt.Errorf("find container mount point %q: %v", c.cid, err)
}
cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
defer cu.Clean()
- log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid))
+ log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid))
// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
procArgs.Root = containerRoot
// Mount all submounts.
- mounts := compileMounts(spec)
- if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil {
+ if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil {
return err
}
cu.Release()
- return nil
+ return c.checkDispenser()
}
-// setExecutablePath sets the procArgs.Filename by searching the PATH for an
-// executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
- paths := fs.GetPath(procArgs.Envv)
- exe := procArgs.Argv[0]
- f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
- if err != nil {
- return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+func (c *containerMounter) checkDispenser() error {
+ if !c.fds.empty() {
+ return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
}
- procArgs.Filename = f
return nil
}
@@ -715,17 +646,354 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error
return nil
}
+// setupRootContainer creates a mount namespace containing the root filesystem
+// and all mounts. 'rootCtx' is used to walk directories to find mount points.
+// 'setMountNS' is called after namespace is created. It must set the mount NS
+// to 'rootCtx'.
+func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
+ for _, hint := range c.hints.mounts {
+ log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+ inode, err := c.mountSharedMaster(rootCtx, conf, hint)
+ if err != nil {
+ return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+ }
+ hint.root = inode
+ }
+
+ // Create a tmpfs mount where we create and mount a root filesystem for
+ // each child container.
+ c.mounts = append(c.mounts, specs.Mount{
+ Type: tmpfs,
+ Destination: ChildContainersDir,
+ })
+
+ rootInode, err := c.createRootMount(rootCtx, conf)
+ if err != nil {
+ return fmt.Errorf("creating root mount: %v", err)
+ }
+ mns, err := fs.NewMountNamespace(userCtx, rootInode)
+ if err != nil {
+ return fmt.Errorf("creating root mount namespace: %v", err)
+ }
+ setMountNS(mns)
+
+ root := mns.Root()
+ defer root.DecRef()
+ return c.mountSubmounts(rootCtx, conf, mns, root)
+}
+
+// mountSharedMaster mounts the master of a volume that is shared among
+// containers in a pod. It returns the root mount's inode.
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
+ if err != nil {
+ return nil, err
+ }
+ if len(fsName) == 0 {
+ return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+ }
+
+ // Mount with revalidate because it's shared among containers.
+ opts = append(opts, "cache=revalidate")
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName)
+
+ mf := mountFlags(hint.mount.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of shared mount %q", hint.name)
+ inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return inode, nil
+}
+
+// createRootMount creates the root filesystem.
+func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+ // First construct the filesystem from the spec.Root.
+ mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
+
+ fd := c.fds.remove()
+ log.Infof("Mounting root over 9P, ioFD: %d", fd)
+ p9FS := mustFindFilesystem("9p")
+ opts := p9MountOptions(fd, conf.FileAccess)
+ rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating root mount point: %v", err)
+ }
+
+ // We need to overlay the root on top of a ramfs with stub directories
+ // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
+ // mounted even if they are not in the spec.
+ submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
+ rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+ if err != nil {
+ return nil, fmt.Errorf("adding submount overlay: %v", err)
+ }
+
+ if conf.Overlay && !c.root.Readonly {
+ log.Debugf("Adding overlay on top of root mount")
+ // Overlay a tmpfs filesystem on top of the root.
+ rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ log.Infof("Mounted %q to %q type root", c.root.Path, "/")
+ return rootInode, nil
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+ var (
+ fsName string
+ opts []string
+ useOverlay bool
+ err error
+ )
+
+ switch m.Type {
+ case devpts, devtmpfs, proc, sysfs:
+ fsName = m.Type
+ case nonefs:
+ fsName = sysfs
+ case tmpfs:
+ fsName = m.Type
+
+ // tmpfs has some extra supported options that we must pass through.
+ opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+
+ case bind:
+ fd := c.fds.remove()
+ fsName = "9p"
+ // Non-root bind mounts are always shared.
+ opts = p9MountOptions(fd, FileAccessShared)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+ default:
+ // TODO(nlacasse): Support all the mount types and make this a fatal error.
+ // Most applications will "just work" without them, so this is a warning
+ // for now.
+ log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ }
+ return fsName, opts, useOverlay, err
+}
+
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+ for _, m := range c.mounts {
+ if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+ if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+ }
+ } else {
+ if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+ return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ }
+ }
+ }
+
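+ // Mount the internal /tmp last: mountTmp skips it when the spec mounts
+ // /tmp explicitly or earlier submounts left the directory non-empty.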
+ if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+ return fmt.Errorf("mount submount %q: %v", "tmp", err)
+ }
+ return nil
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+ if err != nil {
+ return err
+ }
+ if fsName == "" {
+ // Filesystem is not supported (e.g. cgroup), just skip it.
+ return nil
+ }
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName)
+
+ mf := mountFlags(m.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+ }
+
+ // If there are submounts, we need to overlay the mount on top of a ramfs
+ // with stub directories for submount paths.
+ submounts := subtargets(m.Destination, c.mounts)
+ if len(submounts) > 0 {
+ log.Infof("Adding submount overlay over %q", m.Destination)
+ inode, err = addSubmountOverlay(ctx, inode, submounts)
+ if err != nil {
+ return fmt.Errorf("adding submount overlay: %v", err)
+ }
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of mount %q", m.Destination)
+ inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
+ if err != nil {
+ return err
+ }
+ }
+
+ maxTraversals := uint(0)
+ dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+ }
+ defer dirent.DecRef()
+ if err := mns.Mount(ctx, dirent, inode); err != nil {
+ return fmt.Errorf("mount %q error: %v", m.Destination, err)
+ }
+
+ log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+ return nil
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
+ // For now enforce that all options are the same. Once bind mount is properly
+ // supported, then we should ensure the master is less restrictive than the
+ // container, e.g. master can be 'rw' while container mounts as 'ro'.
+ if len(mount.Options) != len(source.mount.Options) {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+ }
+ sort.Strings(mount.Options)
+ for i, opt := range mount.Options {
+ if opt != source.mount.Options[i] {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+ }
+ }
+
+ maxTraversals := uint(0)
+ target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
+ }
+ defer target.DecRef()
+
+ if err := mns.Mount(ctx, target, source.root); err != nil {
+ return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
+ }
+
+ log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+ return nil
+}
+
+// addRestoreMount adds a mount to the MountSources map used for restoring a
+// checkpointed container.
+func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+ fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+ if err != nil {
+ return err
+ }
+ if fsName == "" {
+ // Filesystem is not supported (e.g. cgroup), just skip it.
+ return nil
+ }
+
+ newMount := fs.MountArgs{
+ Dev: mountDevice(m),
+ Flags: mountFlags(m.Options),
+ DataString: strings.Join(opts, ","),
+ }
+ if useOverlay {
+ newMount.Flags.ReadOnly = true
+ }
+ renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
+ log.Infof("Added mount at %q: %+v", fsName, newMount)
+ return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
+// the mounts to the environment.
+func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+ renv := &fs.RestoreEnvironment{
+ MountSources: make(map[string][]fs.MountArgs),
+ }
+
+ // Add root mount.
+ fd := c.fds.remove()
+ opts := p9MountOptions(fd, conf.FileAccess)
+
+ mf := fs.MountSourceFlags{}
+ if c.root.Readonly || conf.Overlay {
+ mf.ReadOnly = true
+ }
+
+ rootMount := fs.MountArgs{
+ Dev: rootDevice,
+ Flags: mf,
+ DataString: strings.Join(opts, ","),
+ }
+ renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+
+ // Add submounts.
+ var tmpMounted bool
+ for _, m := range c.mounts {
+ if err := c.addRestoreMount(conf, renv, m); err != nil {
+ return nil, err
+ }
+ if filepath.Clean(m.Destination) == "/tmp" {
+ tmpMounted = true
+ }
+ }
+
+ // TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+ if !tmpMounted {
+ tmpMount := specs.Mount{
+ Type: tmpfs,
+ Destination: "/tmp",
+ }
+ if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
+ return nil, err
+ }
+ }
+
+ return renv, nil
+}
+
// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
-// 1. /tmp is mounted explictly: we should not override user's wish
+// 1. /tmp is mounted explicitly: we should not override user's wish
// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
-func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error {
- for _, m := range mounts {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+ for _, m := range c.mounts {
if filepath.Clean(m.Destination) == "/tmp" {
log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
return nil
@@ -766,7 +1034,7 @@ func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *f
// another user. This is normally done for /tmp.
Options: []string{"mode=1777"},
}
- return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts)
+ return c.mountSubmount(ctx, conf, mns, root, tmpMount)
default:
return err
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
new file mode 100644
index 000000000..49ab34b33
--- /dev/null
+++ b/runsc/boot/fs_test.go
@@ -0,0 +1,193 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "path"
+ "reflect"
+ "strings"
+ "testing"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestPodMountHintsHappy(t *testing.T) {
+ spec := &specs.Spec{
+ Annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+
+ path.Join(MountPrefix, "mount2", "source"): "bar",
+ path.Join(MountPrefix, "mount2", "type"): "bind",
+ path.Join(MountPrefix, "mount2", "share"): "container",
+ path.Join(MountPrefix, "mount2", "options"): "rw,private",
+ },
+ }
+ podHints, err := newPodMountHints(spec)
+ if err != nil {
+ t.Errorf("newPodMountHints failed: %v", err)
+ }
+
+ // Check that fields were set correctly.
+ mount1 := podHints.mounts["mount1"]
+ if want := "mount1"; want != mount1.name {
+ t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name)
+ }
+ if want := "foo"; want != mount1.mount.Source {
+ t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source)
+ }
+ if want := "tmpfs"; want != mount1.mount.Type {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type)
+ }
+ if want := pod; want != mount1.share {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share)
+ }
+ if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options)
+ }
+
+ mount2 := podHints.mounts["mount2"]
+ if want := "mount2"; want != mount2.name {
+ t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name)
+ }
+ if want := "bar"; want != mount2.mount.Source {
+ t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source)
+ }
+ if want := "bind"; want != mount2.mount.Type {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type)
+ }
+ if want := container; want != mount2.share {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share)
+ }
+ if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options)
+ }
+}
+
+func TestPodMountHintsErrors(t *testing.T) {
+ for _, tst := range []struct {
+ name string
+ annotations map[string]string
+ error string
+ }{
+ {
+ name: "too short",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1"): "foo",
+ },
+ error: "invalid mount annotation",
+ },
+ {
+ name: "no name",
+ annotations: map[string]string{
+ MountPrefix + "//source": "foo",
+ },
+ error: "invalid mount name",
+ },
+ {
+ name: "missing source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "source field",
+ },
+ {
+ name: "missing type",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "type field",
+ },
+ {
+ name: "missing share",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ },
+ error: "share field",
+ },
+ {
+ name: "invalid field name",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "invalid"): "foo",
+ },
+ error: "invalid mount annotation",
+ },
+ {
+ name: "invalid source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "source cannot be empty",
+ },
+ {
+ name: "invalid type",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "invalid-type",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "invalid type",
+ },
+ {
+ name: "invalid share",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "invalid-share",
+ },
+ error: "invalid share",
+ },
+ {
+ name: "invalid options",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+ },
+ error: "unknown mount option",
+ },
+ {
+ name: "duplicate source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+
+ path.Join(MountPrefix, "mount2", "source"): "foo",
+ path.Join(MountPrefix, "mount2", "type"): "bind",
+ path.Join(MountPrefix, "mount2", "share"): "container",
+ },
+ error: "have the same mount source",
+ },
+ } {
+ t.Run(tst.name, func(t *testing.T) {
+ spec := &specs.Spec{Annotations: tst.annotations}
+ podHints, err := newPodMountHints(spec)
+ if err == nil || !strings.Contains(err.Error(), tst.error) {
+ t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err)
+ }
+ if podHints != nil {
+ t.Errorf("newPodMountHints must return nil on failure: %+v", podHints)
+ }
+ })
+ }
+}
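The annotation layout these tests exercise is MountPrefix/<mount-name>/{source,type,share,options}. Below is a minimal, self-contained sketch of writing such hints onto an OCI spec; the MountPrefix value and the addMountHint helper are assumptions for illustration, not definitions taken from this patch.

package main

import (
	"fmt"
	"path"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

// MountPrefix stands in for the annotation prefix used by runsc/boot; the
// exact value here is an assumption.
const MountPrefix = "gvisor.dev/spec/mount"

// addMountHint is a hypothetical helper that writes the four hint fields
// ("source", "type", "share", "options") checked by the tests above.
func addMountHint(spec *specs.Spec, name, source, fsType, share string, options []string) {
	if spec.Annotations == nil {
		spec.Annotations = map[string]string{}
	}
	spec.Annotations[path.Join(MountPrefix, name, "source")] = source
	spec.Annotations[path.Join(MountPrefix, name, "type")] = fsType
	spec.Annotations[path.Join(MountPrefix, name, "share")] = share
	if len(options) > 0 {
		spec.Annotations[path.Join(MountPrefix, name, "options")] = strings.Join(options, ",")
	}
}

func main() {
	spec := &specs.Spec{}
	addMountHint(spec, "shared-tmp", "/some/dir", "tmpfs", "pod", []string{"rw"})
	for k, v := range spec.Annotations {
		fmt.Printf("%s = %s\n", k, v)
	}
}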
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 6ac6b94dd..c1dea736f 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -29,6 +29,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/cpuid"
"gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/memutil"
"gvisor.googlesource.com/gvisor/pkg/rand"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/control"
@@ -37,7 +38,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/loader"
- "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
@@ -117,6 +117,10 @@ type Loader struct {
//
// processes is guarded by mu.
processes map[execID]*execProcess
+
+ // mountHints provides extra information about container mounts that
+ // apply to the entire pod.
+ mountHints *podMountHints
}
// execID uniquely identifies a sentry process that is executed in a container.
@@ -288,7 +292,7 @@ func New(args Args) (*Loader, error) {
}
// Create a watchdog.
- watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+ dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
procArgs, err := newProcess(args.ID, args.Spec, creds, k)
if err != nil {
@@ -299,18 +303,24 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing compat logs: %v", err)
}
+ mountHints, err := newPodMountHints(args.Spec)
+ if err != nil {
+ return nil, fmt.Errorf("creating pod mount hints: %v", err)
+ }
+
eid := execID{cid: args.ID}
l := &Loader{
k: k,
conf: args.Conf,
console: args.Console,
- watchdog: watchdog,
+ watchdog: dog,
spec: args.Spec,
goferFDs: args.GoferFDs,
stdioFDs: args.StdioFDs,
rootProcArgs: procArgs,
sandboxID: args.ID,
processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
}
// We don't care about child signals; some platforms can generate a
@@ -424,6 +434,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return nil, fmt.Errorf("error creating memfd: %v", err)
}
memfile := os.NewFile(uintptr(memfd), memfileName)
+ // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
+ // there are memory cgroups specified, because at this point we're already
+ // in a mount namespace in which the relevant cgroupfs is not visible.
mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
if err != nil {
memfile.Close()
@@ -432,7 +445,24 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return mf, nil
}
-// Run runs the root container..
+func (l *Loader) installSeccompFilters() error {
+ if l.conf.DisableSeccomp {
+ filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+ } else {
+ opts := filter.Options{
+ Platform: l.k.Platform,
+ HostNetwork: l.conf.Network == NetworkHost,
+ ProfileEnable: l.conf.ProfileEnable,
+ ControllerFD: l.ctrl.srv.FD(),
+ }
+ if err := filter.Install(opts); err != nil {
+ return fmt.Errorf("installing seccomp filters: %v", err)
+ }
+ }
+ return nil
+}
+
+// Run runs the root container.
func (l *Loader) Run() error {
err := l.run()
l.ctrl.manager.startResultChan <- err
@@ -467,36 +497,34 @@ func (l *Loader) run() error {
return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
}
- // Finally done with all configuration. Setup filters before user code
- // is loaded.
- if l.conf.DisableSeccomp {
- filter.Report("syscall filter is DISABLED. Running in less secure mode.")
- } else {
- opts := filter.Options{
- Platform: l.k.Platform,
- HostNetwork: l.conf.Network == NetworkHost,
- ProfileEnable: l.conf.ProfileEnable,
- ControllerFD: l.ctrl.srv.FD(),
- }
- if err := filter.Install(opts); err != nil {
- return fmt.Errorf("installing seccomp filters: %v", err)
- }
- }
-
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
if !l.restore {
- if err := setupContainerFS(
- &l.rootProcArgs,
- l.spec,
- l.conf,
- l.stdioFDs,
- l.goferFDs,
- l.console,
- l.rootProcArgs.Credentials,
- l.rootProcArgs.Limits,
- l.k,
- "" /* CID, which isn't needed for the root container */); err != nil {
+ if l.conf.ProfileEnable {
+ initializePProf()
+ }
+
+ // Finally done with all configuration. Setup filters before user code
+ // is loaded.
+ if err := l.installSeccompFilters(); err != nil {
+ return err
+ }
+
+ // Create the FD map, which will set stdin, stdout, and stderr. If console
+ // is true, then ioctl calls will be passed through to the host fd.
+ ctx := l.rootProcArgs.NewContext(l.k)
+ fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs)
+ if err != nil {
+ return fmt.Errorf("importing fds: %v", err)
+ }
+ // CreateProcess takes a reference on FDMap if successful. We won't need
+ // ours either way.
+ l.rootProcArgs.FDMap = fdm
+
+ // cid for root container can be empty. Only subcontainers need it to set
+ // the mount location.
+ mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
+ if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
return err
}
@@ -552,7 +580,7 @@ func (l *Loader) createContainer(cid string) error {
// startContainer starts a child container. It returns the thread group ID of
// the newly created process. Caller owns 'files' and may close them after
// this method returns.
-func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -596,6 +624,16 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
stdioFDs = append(stdioFDs, int(f.Fd()))
}
+ // Create the FD map, which will set stdin, stdout, and stderr.
+ ctx := procArgs.NewContext(l.k)
+ fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs)
+ if err != nil {
+ return fmt.Errorf("importing fds: %v", err)
+ }
+ // CreateProcess takes a reference on FDMap if successful. We won't need ours
+ // either way.
+ procArgs.FDMap = fdm
+
// Can't take ownership away from os.File. dup them to get a new FDs.
var goferFDs []int
for _, f := range files[3:] {
@@ -606,22 +644,12 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
goferFDs = append(goferFDs, fd)
}
- if err := setupContainerFS(
- &procArgs,
- spec,
- conf,
- stdioFDs,
- goferFDs,
- false,
- creds,
- procArgs.Limits,
- k,
- cid); err != nil {
+ mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
+ if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
return fmt.Errorf("configuring container FS: %v", err)
}
- ctx := procArgs.NewContext(l.k)
- mns := k.RootMountNamespace()
+ mns := l.k.RootMountNamespace()
if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
}
@@ -724,7 +752,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
return nil
}
-func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error {
+func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
if tgid <= 0 {
return fmt.Errorf("PID (%d) must be positive", tgid)
}
@@ -736,13 +764,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai
ws := l.wait(execTG)
*waitStatus = ws
- // Remove tg from the cache if caller requested it.
- if clearStatus {
- l.mu.Lock()
- delete(l.processes, eid)
- log.Debugf("updated processes (removal): %v", l.processes)
- l.mu.Unlock()
- }
+ l.mu.Lock()
+ delete(l.processes, eid)
+ log.Debugf("updated processes (removal): %v", l.processes)
+ l.mu.Unlock()
return nil
}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 4603f751d..2f2499811 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -397,14 +397,15 @@ func TestCreateMountNamespace(t *testing.T) {
}
defer cleanup()
- // setupRootContainerFS needs to find root from the context after the
+ // setupRootContainer needs to find root from the context after the
// namespace is created.
var mns *fs.MountNamespace
setMountNS := func(m *fs.MountNamespace) {
mns = m
ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
}
- if err := setupRootContainerFS(ctx, ctx, &tc.spec, conf, []int{sandEnd}, setMountNS); err != nil {
+ mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{})
+ if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
}
root := mns.Root()
@@ -609,8 +610,8 @@ func TestRestoreEnvironment(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
- fds := &fdDispenser{fds: tc.ioFDs}
- actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds)
+ mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{})
+ actualRenv, err := mntr.createRestoreEnvironment(conf)
if !tc.errorExpected && err != nil {
t.Fatalf("could not create restore environment for test:%s", tc.name)
} else if tc.errorExpected {
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 0a154d90b..d86803252 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -56,7 +56,11 @@ type FDBasedLink struct {
Addresses []net.IP
Routes []Route
GSOMaxSize uint32
- LinkAddress []byte
+ LinkAddress net.HardwareAddr
+
+ // NumChannels controls how many underlying FDs are to be used to
+ // create this endpoint.
+ NumChannels int
}
// LoopbackLink configures a loopback link.
@@ -68,8 +72,9 @@ type LoopbackLink struct {
// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes.
type CreateLinksAndRoutesArgs struct {
- // FilePayload contains the fds associated with the FDBasedLinks. The
- // two slices must have the same length.
+ // FilePayload contains the fds associated with the FDBasedLinks. The
+ // number of fds should match the sum of the NumChannels field of the
+ // FDBasedLink entries below.
urpc.FilePayload
LoopbackLinks []LoopbackLink
@@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
// CreateLinksAndRoutes creates links and routes in a network stack. It should
// only be called once.
func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
- if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
- return fmt.Errorf("FilePayload must be same length at FDBasedLinks")
+ wantFDs := 0
+ for _, l := range args.FDBasedLinks {
+ wantFDs += l.NumChannels
+ }
+ if got := len(args.FilePayload.Files); got != wantFDs {
+ return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs)
}
var nicID tcpip.NICID
@@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
}
- for i, link := range args.FDBasedLinks {
+ fdOffset := 0
+ for _, link := range args.FDBasedLinks {
nicID++
nicids[link.Name] = nicID
- // Copy the underlying FD.
- oldFD := args.FilePayload.Files[i].Fd()
- newFD, err := syscall.Dup(int(oldFD))
- if err != nil {
- return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ FDs := []int{}
+ for j := 0; j < link.NumChannels; j++ {
+ // Copy the underlying FD.
+ oldFD := args.FilePayload.Files[fdOffset].Fd()
+ newFD, err := syscall.Dup(int(oldFD))
+ if err != nil {
+ return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ }
+ FDs = append(FDs, newFD)
+ fdOffset++
}
mac := tcpip.LinkAddress(link.LinkAddress)
linkEP, err := fdbased.New(&fdbased.Options{
- FD: newFD,
+ FDs: FDs,
MTU: uint32(link.MTU),
EthernetHeader: true,
Address: mac,
@@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
return err
}
- log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)
+ log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
return err
}
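The new invariant above is that the number of donated files must equal the sum of NumChannels across all FD-based links. A hedged, self-contained sketch of that bookkeeping follows; fdBasedLink and createArgs are simplified stand-ins for the real structs, and the files are placeholders.

package main

import (
	"fmt"
	"os"
)

// fdBasedLink and createArgs mirror only the fields relevant to the FD count
// check in CreateLinksAndRoutes.
type fdBasedLink struct {
	Name        string
	NumChannels int
}

type createArgs struct {
	Files        []*os.File // donated FDs, e.g. carried by urpc.FilePayload
	FDBasedLinks []fdBasedLink
}

// validateFDCount applies the same invariant as the code above: the number
// of files must equal the sum of NumChannels over all links.
func validateFDCount(args createArgs) error {
	wantFDs := 0
	for _, l := range args.FDBasedLinks {
		wantFDs += l.NumChannels
	}
	if got := len(args.Files); got != wantFDs {
		return fmt.Errorf("got %d FDs, want %d (sum of NumChannels)", got, wantFDs)
	}
	return nil
}

func main() {
	args := createArgs{
		Files:        []*os.File{os.Stdin, os.Stdout}, // placeholders for packet socket FDs
		FDBasedLinks: []fdBasedLink{{Name: "eth0", NumChannels: 2}},
	}
	fmt.Println(validateFDCount(args)) // <nil>: 2 files for 2 channels
}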
diff --git a/pkg/sentry/memutil/memutil.go b/runsc/boot/pprof.go
index a4154c42a..463362f02 100644
--- a/pkg/sentry/memutil/memutil.go
+++ b/runsc/boot/pprof.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,5 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package memutil contains the utility functions for memory operations.
-package memutil
+package boot
+
+func initializePProf() {
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index b7551a5ab..df6af0ced 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -14,9 +14,11 @@ go_library(
"debug.go",
"delete.go",
"do.go",
+ "error.go",
"events.go",
"exec.go",
"gofer.go",
+ "help.go",
"kill.go",
"list.go",
"path.go",
@@ -28,6 +30,7 @@ go_library(
"spec.go",
"start.go",
"state.go",
+ "syscalls.go",
"wait.go",
],
importpath = "gvisor.googlesource.com/gvisor/runsc/cmd",
@@ -38,6 +41,7 @@ go_library(
"//pkg/log",
"//pkg/p9",
"//pkg/sentry/control",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/unet",
"//pkg/urpc",
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 3a547d4aa..e0a950e9c 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -130,6 +130,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Ensure that if there is a panic, all goroutine stacks are printed.
debug.SetTraceback("all")
+ conf := args[0].(*boot.Config)
+
if b.setUpRoot {
if err := setUpChroot(b.pidns); err != nil {
Fatalf("error setting up chroot: %v", err)
@@ -143,14 +145,16 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
args = append(args, arg)
}
}
- // Note that we've already read the spec from the spec FD, and
- // we will read it again after the exec call. This works
- // because the ReadSpecFromFile function seeks to the beginning
- // of the file before reading.
- if err := callSelfAsNobody(args); err != nil {
- Fatalf("%v", err)
+ if !conf.Rootless {
+ // Note that we've already read the spec from the spec FD, and
+ // we will read it again after the exec call. This works
+ // because the ReadSpecFromFile function seeks to the beginning
+ // of the file before reading.
+ if err := callSelfAsNobody(args); err != nil {
+ Fatalf("%v", err)
+ }
+ panic("callSelfAsNobody must never return success")
}
- panic("callSelfAsNobody must never return success")
}
}
@@ -163,9 +167,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
specutils.LogSpec(spec)
- conf := args[0].(*boot.Config)
- waitStatus := args[1].(*syscall.WaitStatus)
-
if b.applyCaps {
caps := spec.Process.Capabilities
if caps == nil {
@@ -251,6 +252,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
ws := l.WaitExit()
log.Infof("application exiting with %+v", ws)
+ waitStatus := args[1].(*syscall.WaitStatus)
*waitStatus = syscall.WaitStatus(ws.Status())
l.Destroy()
return subcommands.ExitSuccess
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index ee74d33d8..2825dfaa5 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -116,6 +116,6 @@ func TestCapabilities(t *testing.T) {
}
func TestMain(m *testing.M) {
- testutil.RunAsRoot()
+ specutils.MaybeRunAsRoot()
os.Exit(m.Run())
}
diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go
index a2fc377d1..5b4cc4a39 100644
--- a/runsc/cmd/cmd.go
+++ b/runsc/cmd/cmd.go
@@ -17,34 +17,15 @@ package cmd
import (
"fmt"
- "os"
"runtime"
"strconv"
"syscall"
- "github.com/google/subcommands"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
-// Errorf logs to stderr and returns subcommands.ExitFailure.
-func Errorf(s string, args ...interface{}) subcommands.ExitStatus {
- // If runsc is being invoked by docker or cri-o, then we might not have
- // access to stderr, so we log a serious-looking warning in addition to
- // writing to stderr.
- log.Warningf("FATAL ERROR: "+s, args...)
- fmt.Fprintf(os.Stderr, s+"\n", args...)
- // Return an error that is unlikely to be used by the application.
- return subcommands.ExitFailure
-}
-
-// Fatalf logs to stderr and exits with a failure status code.
-func Fatalf(s string, args ...interface{}) {
- Errorf(s, args...)
- os.Exit(128)
-}
-
// intFlags can be used with int flags that appear multiple times.
type intFlags []int
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index 629c198fd..e82e8c667 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -16,7 +16,6 @@ package cmd
import (
"context"
-
"flag"
"github.com/google/subcommands"
"gvisor.googlesource.com/gvisor/runsc/boot"
@@ -83,13 +82,17 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
id := f.Arg(0)
conf := args[0].(*boot.Config)
+ if conf.Rootless {
+ return Errorf("Rootless mode not supported with %q", c.Name())
+ }
+
bundleDir := c.bundleDir
if bundleDir == "" {
bundleDir = getwdOrDie()
}
spec, err := specutils.ReadSpec(bundleDir)
if err != nil {
- Fatalf("reading spec: %v", err)
+ return Errorf("reading spec: %v", err)
}
specutils.LogSpec(spec)
@@ -97,7 +100,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
// container unless the metadata specifies that it should be run in an
// existing container.
if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, c.userLog); err != nil {
- Fatalf("creating container: %v", err)
+ return Errorf("creating container: %v", err)
}
return subcommands.ExitSuccess
}
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 8ea59046c..3f6e46fce 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -39,10 +39,9 @@ import (
// Do implements subcommands.Command for the "do" command. It sets up a simple
// sandbox and executes the command inside it. See Usage() for more details.
type Do struct {
- root string
- cwd string
- ip string
- networkNamespace bool
+ root string
+ cwd string
+ ip string
}
// Name implements subcommands.Command.Name.
@@ -72,7 +71,6 @@ func (c *Do) SetFlags(f *flag.FlagSet) {
f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory")
f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox")
- f.BoolVar(&c.networkNamespace, "netns", true, "run in a new network namespace")
}
// Execute implements subcommands.Command.Execute.
@@ -85,15 +83,21 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
conf := args[0].(*boot.Config)
waitStatus := args[1].(*syscall.WaitStatus)
- // Map the entire host file system, but make it readonly with a writable
- // overlay on top (ignore --overlay option).
- conf.Overlay = true
+ if conf.Rootless {
+ if err := specutils.MaybeRunAsRoot(); err != nil {
+ return Errorf("Error executing inside namespace: %v", err)
+ }
+ // Execution will continue here if no more capabilities are needed...
+ }
hostname, err := os.Hostname()
if err != nil {
return Errorf("Error to retrieve hostname: %v", err)
}
+ // Map the entire host file system, but make it readonly with a writable
+ // overlay on top (ignore --overlay option).
+ conf.Overlay = true
absRoot, err := resolvePath(c.root)
if err != nil {
return Errorf("Error resolving root: %v", err)
@@ -119,11 +123,22 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
specutils.LogSpec(spec)
cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
- if !c.networkNamespace {
- if conf.Network != boot.NetworkHost {
- Fatalf("The current network namespace can be used only if --network=host is set", nil)
+ if conf.Network == boot.NetworkNone {
+ netns := specs.LinuxNamespace{
+ Type: specs.NetworkNamespace,
+ }
+ if spec.Linux != nil {
+ panic("spec.Linux is not nil")
}
- } else if conf.Network != boot.NetworkNone {
+ spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}}
+
+ } else if conf.Rootless {
+ if conf.Network == boot.NetworkSandbox {
+ fmt.Println("*** Rootless requires changing network type to host ***")
+ conf.Network = boot.NetworkHost
+ }
+
+ } else {
clean, err := c.setupNet(cid, spec)
if err != nil {
return Errorf("Error setting up network: %v", err)
diff --git a/runsc/cmd/error.go b/runsc/cmd/error.go
new file mode 100644
index 000000000..700b19f14
--- /dev/null
+++ b/runsc/cmd/error.go
@@ -0,0 +1,72 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "os"
+ "time"
+
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// ErrorLogger is where error messages should be written to. These messages are
+// consumed by containerd and show up to users of command line tools,
+// like docker/kubectl.
+var ErrorLogger io.Writer
+
+type jsonError struct {
+ Msg string `json:"msg"`
+ Level string `json:"level"`
+ Time time.Time `json:"time"`
+}
+
+// Errorf logs the error to the containerd log (--log), to stderr, and to the
+// debug logs. It returns subcommands.ExitFailure for convenience with
+// subcommands.Command.Execute() methods:
+// return Errorf("Danger! Danger!")
+//
+func Errorf(format string, args ...interface{}) subcommands.ExitStatus {
+ // If runsc is being invoked by docker or cri-o, then we might not have
+ // access to stderr, so we log a serious-looking warning in addition to
+ // writing to stderr.
+ log.Warningf("FATAL ERROR: "+format, args...)
+ fmt.Fprintf(os.Stderr, format+"\n", args...)
+
+ j := jsonError{
+ Msg: fmt.Sprintf(format, args...),
+ Level: "error",
+ Time: time.Now(),
+ }
+ b, err := json.Marshal(j)
+ if err != nil {
+ panic(err)
+ }
+ if ErrorLogger != nil {
+ ErrorLogger.Write(b)
+ }
+
+ return subcommands.ExitFailure
+}
+
+// Fatalf logs the same way as Errorf() does, plus *exits* the process.
+func Fatalf(format string, args ...interface{}) {
+ Errorf(format, args...)
+ // Exit with a code that is unlikely to be used by the application.
+ os.Exit(128)
+}
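As a small illustration of the error record Errorf emits to ErrorLogger, the standalone sketch below marshals the same jsonError shape; the message text is made up.

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// jsonError mirrors the struct defined in error.go above.
type jsonError struct {
	Msg   string    `json:"msg"`
	Level string    `json:"level"`
	Time  time.Time `json:"time"`
}

func main() {
	j := jsonError{
		Msg:   fmt.Sprintf("reading spec: %v", "file not found"),
		Level: "error",
		Time:  time.Now(),
	}
	b, err := json.Marshal(j)
	if err != nil {
		panic(err)
	}
	// Example output: {"msg":"reading spec: file not found","level":"error","time":"..."}
	fmt.Println(string(b))
}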
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 52fd7ac4b..0eeaaadba 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -40,8 +40,6 @@ import (
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
-const privateClearStatusFlag = "private-clear-status"
-
// Exec implements subcommands.Command for the "exec" command.
type Exec struct {
cwd string
@@ -51,7 +49,6 @@ type Exec struct {
extraKGIDs stringSlice
caps stringSlice
detach bool
- clearStatus bool
processPath string
pidFile string
internalPidFile string
@@ -103,10 +100,6 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to")
f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to")
f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
-
- // This flag clears the status of the exec'd process upon completion. It is
- // only used when we fork due to --detach being set on the parent.
- f.BoolVar(&ex.clearStatus, privateClearStatusFlag, true, "private flag, do not use")
}
// Execute implements subcommands.Command.Execute. It starts a process in an
@@ -150,13 +143,16 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// write the child's PID to the pid file. So when the container returns, the
// child process will also return and signal containerd.
if ex.detach {
- return ex.execAndWait(waitStatus)
+ return ex.execChildAndWait(waitStatus)
}
+ return ex.exec(c, e, waitStatus)
+}
+func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
// Start the new process and get it pid.
pid, err := c.Execute(e)
if err != nil {
- Fatalf("getting processes for container: %v", err)
+ return Errorf("executing processes for container: %v", err)
}
if e.StdioIsPty {
@@ -170,33 +166,37 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if ex.internalPidFile != "" {
pidStr := []byte(strconv.Itoa(int(pid)))
if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil {
- Fatalf("writing internal pid file %q: %v", ex.internalPidFile, err)
+ return Errorf("writing internal pid file %q: %v", ex.internalPidFile, err)
}
}
- // Generate the pid file after the internal pid file is generated, so that users
- // can safely assume that the internal pid file is ready after `runsc exec -d`
- // returns.
+ // Generate the pid file after the internal pid file is generated, so that
+ // users can safely assume that the internal pid file is ready after
+ // `runsc exec -d` returns.
if ex.pidFile != "" {
if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil {
- Fatalf("writing pid file: %v", err)
+ return Errorf("writing pid file: %v", err)
}
}
// Wait for the process to exit.
- ws, err := c.WaitPID(pid, ex.clearStatus)
+ ws, err := c.WaitPID(pid)
if err != nil {
- Fatalf("waiting on pid %d: %v", pid, err)
+ return Errorf("waiting on pid %d: %v", pid, err)
}
*waitStatus = ws
return subcommands.ExitSuccess
}
-func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
- binPath := specutils.ExePath
+func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
var args []string
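+ // Copy the original arguments for the child process, skipping the "detach" flag.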
+ for _, a := range os.Args[1:] {
+ if !strings.Contains(a, "detach") {
+ args = append(args, a)
+ }
+ }
- // The command needs to write a pid file so that execAndWait can tell
+ // The command needs to write a pid file so that execChildAndWait can tell
// when it has started. If no pid-file was provided, we should use a
// filename in a temp directory.
pidFile := ex.pidFile
@@ -210,19 +210,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
args = append(args, "--pid-file="+pidFile)
}
- // Add the rest of the args, excluding the "detach" flag.
- for _, a := range os.Args[1:] {
- if strings.Contains(a, "detach") {
- // Replace with the "private-clear-status" flag, which tells
- // the new process it's a detached child and shouldn't
- // clear the exit status of the sentry process.
- args = append(args, fmt.Sprintf("--%s=false", privateClearStatusFlag))
- } else {
- args = append(args, a)
- }
- }
-
- cmd := exec.Command(binPath, args...)
+ cmd := exec.Command(specutils.ExePath, args...)
cmd.Args[0] = "runsc-exec"
// Exec stdio defaults to current process stdio.
@@ -233,8 +221,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
// If the console control socket file is provided, then create a new
// pty master/slave pair and set the TTY on the sandbox process.
if ex.consoleSocket != "" {
- // Create a new TTY pair and send the master on the provided
- // socket.
+ // Create a new TTY pair and send the master on the provided socket.
tty, err := console.NewWithSocket(ex.consoleSocket)
if err != nil {
Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err)
@@ -256,7 +243,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
Fatalf("failure to start child exec process, err: %v", err)
}
- log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args)
+ log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args)
// Wait for PID file to ensure that child process has started. Otherwise,
// '--process' file is deleted as soon as this process returns and the child
@@ -278,7 +265,10 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
return false, nil
}
if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil {
- Fatalf("unexpected error waiting for PID file, err: %v", err)
+ // Don't log fatal error here, otherwise it will override the error logged
+ // by the child process that has failed to start.
+ log.Warningf("Unexpected error waiting for PID file, err: %v", err)
+ return subcommands.ExitFailure
}
*waitStatus = 0
diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go
new file mode 100644
index 000000000..ff4f901cb
--- /dev/null
+++ b/runsc/cmd/help.go
@@ -0,0 +1,126 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+
+ "flag"
+ "github.com/google/subcommands"
+)
+
+// NewHelp returns a help command for the given commander.
+func NewHelp(cdr *subcommands.Commander) *Help {
+ return &Help{
+ cdr: cdr,
+ }
+}
+
+// Help implements subcommands.Command for the "help" command. The 'help'
+// command prints help for commands registered to a Commander but also allows for
+// registering additional help commands that print other documentation.
+type Help struct {
+ cdr *subcommands.Commander
+ commands []subcommands.Command
+ help bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Help) Name() string {
+ return "help"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Help) Synopsis() string {
+ return "Print help documentation."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Help) Usage() string {
+ return `help [<subcommand>]:
+ With an argument, prints detailed information on the use of
+ the specified topic or subcommand. With no argument, prints a list of
+ all commands and a brief description of each.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (h *Help) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.Execute.
+func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ switch f.NArg() {
+ case 0:
+ fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name())
+ fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open
+Container Initiative (OCI) format. Applications run by runsc are run in an
+isolated gVisor sandbox that emulates a Linux environment.
+
+gVisor is a user-space kernel, written in Go, that implements a substantial
+portion of the Linux system call interface. It provides an additional layer
+of isolation between running applications and the host operating system.
+
+Functionality is provided by subcommands. For additional help on individual
+subcommands use "%s %s <subcommand>".
+
+`, h.cdr.Name(), h.Name())
+ h.cdr.VisitGroups(func(g *subcommands.CommandGroup) {
+ h.cdr.ExplainGroup(h.cdr.Output, g)
+ })
+
+ fmt.Fprintf(h.cdr.Output, "Additional help topics (Use \"%s %s <topic>\" to see help on the topic):\n", h.cdr.Name(), h.Name())
+ for _, cmd := range h.commands {
+ fmt.Fprintf(h.cdr.Output, "\t%-15s %s\n", cmd.Name(), cmd.Synopsis())
+ }
+ fmt.Fprintf(h.cdr.Output, "\nUse \"%s flags\" for a list of top-level flags\n", h.cdr.Name())
+ return subcommands.ExitSuccess
+ default:
+ // Look for commands registered to the commander and print help explanation if found.
+ found := false
+ h.cdr.VisitCommands(func(g *subcommands.CommandGroup, cmd subcommands.Command) {
+ if f.Arg(0) == cmd.Name() {
+ h.cdr.ExplainCommand(h.cdr.Output, cmd)
+ found = true
+ }
+ })
+ if found {
+ return subcommands.ExitSuccess
+ }
+
+ // Next check commands registered to the help command.
+ for _, cmd := range h.commands {
+ if f.Arg(0) == cmd.Name() {
+ fs := flag.NewFlagSet(f.Arg(0), flag.ContinueOnError)
+ fs.Usage = func() { h.cdr.ExplainCommand(h.cdr.Error, cmd) }
+ cmd.SetFlags(fs)
+ if fs.Parse(f.Args()[1:]) != nil {
+ return subcommands.ExitUsageError
+ }
+ return cmd.Execute(ctx, f, args...)
+ }
+ }
+
+ fmt.Fprintf(h.cdr.Error, "Subcommand %s not understood\n", f.Arg(0))
+ }
+
+ f.Usage()
+ return subcommands.ExitUsageError
+}
+
+// Register registers a new help command.
+func (h *Help) Register(cmd subcommands.Command) {
+ h.commands = append(h.commands, cmd)
+}
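A hedged sketch of how this Help command might be wired into a subcommands.Commander, with Syscalls attached as an extra help topic; the exact wiring in runsc's main.go may differ.

package main

import (
	"context"
	"flag"
	"os"

	"github.com/google/subcommands"
	"gvisor.googlesource.com/gvisor/runsc/cmd"
)

func main() {
	cdr := subcommands.NewCommander(flag.CommandLine, "runsc")

	// Register the help command and attach "syscalls" as an additional help
	// topic; topics registered this way appear under "Additional help topics".
	help := cmd.NewHelp(cdr)
	help.Register(new(cmd.Syscalls))
	cdr.Register(help, "")
	cdr.Register(subcommands.FlagsCommand(), "")

	flag.Parse()
	os.Exit(int(cdr.Execute(context.Background())))
}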
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
index 3ab2f5676..a78a0dce6 100644
--- a/runsc/cmd/restore.go
+++ b/runsc/cmd/restore.go
@@ -80,25 +80,29 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
conf := args[0].(*boot.Config)
waitStatus := args[1].(*syscall.WaitStatus)
+ if conf.Rootless {
+ return Errorf("Rootless mode not supported with %q", r.Name())
+ }
+
bundleDir := r.bundleDir
if bundleDir == "" {
bundleDir = getwdOrDie()
}
spec, err := specutils.ReadSpec(bundleDir)
if err != nil {
- Fatalf("reading spec: %v", err)
+ return Errorf("reading spec: %v", err)
}
specutils.LogSpec(spec)
if r.imagePath == "" {
- Fatalf("image-path flag must be provided")
+ return Errorf("image-path flag must be provided")
}
conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName)
ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach)
if err != nil {
- Fatalf("running container: %v", err)
+ return Errorf("running container: %v", err)
}
*waitStatus = ws
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index c228b4f93..abf602239 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -67,19 +67,23 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
conf := args[0].(*boot.Config)
waitStatus := args[1].(*syscall.WaitStatus)
+ if conf.Rootless {
+ return Errorf("Rootless mode not supported with %q", r.Name())
+ }
+
bundleDir := r.bundleDir
if bundleDir == "" {
bundleDir = getwdOrDie()
}
spec, err := specutils.ReadSpec(bundleDir)
if err != nil {
- Fatalf("reading spec: %v", err)
+ return Errorf("reading spec: %v", err)
}
specutils.LogSpec(spec)
ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach)
if err != nil {
- Fatalf("running container: %v", err)
+ return Errorf("running container: %v", err)
}
*waitStatus = ws
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index 657726251..31e8f42bb 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -16,7 +16,6 @@ package cmd
import (
"context"
-
"flag"
"github.com/google/subcommands"
"gvisor.googlesource.com/gvisor/runsc/boot"
diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go
new file mode 100644
index 000000000..9c8a66490
--- /dev/null
+++ b/runsc/cmd/syscalls.go
@@ -0,0 +1,347 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/csv"
+ "encoding/json"
+ "fmt"
+ "io"
+ "os"
+ "sort"
+ "strconv"
+ "text/tabwriter"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+)
+
+// Syscalls implements subcommands.Command for the "syscalls" command.
+type Syscalls struct {
+ output string
+ os string
+ arch string
+}
+
+// CompatibilityInfo is a map of system and architecture to compatibility doc.
+// Maps operating system to architecture to ArchInfo.
+type CompatibilityInfo map[string]map[string]ArchInfo
+
+// ArchInfo is the compatibility doc for an architecture.
+type ArchInfo struct {
+ // Syscalls maps syscall number for the architecture to the doc.
+ Syscalls map[uintptr]SyscallDoc `json:"syscalls"`
+}
+
+// SyscallDoc represents a single item of syscall documentation.
+type SyscallDoc struct {
+ Name string `json:"name"`
+ num uintptr
+
+ Support string `json:"support"`
+ Note string `json:"note,omitempty"`
+ URLs []string `json:"urls,omitempty"`
+}
+
+type outputFunc func(io.Writer, CompatibilityInfo) error
+
+var (
+ // The string name to use for printing compatibility for all OSes.
+ osAll = "all"
+
+ // The string name to use for printing compatibility for all architectures.
+ archAll = "all"
+
+ // A map of OS name to map of architecture name to syscall table.
+ syscallTableMap = make(map[string]map[string]*kernel.SyscallTable)
+
+ // A map of output type names to output functions.
+ outputMap = map[string]outputFunc{
+ "table": outputTable,
+ "json": outputJSON,
+ "csv": outputCSV,
+ }
+)
+
+// Name implements subcommands.Command.Name.
+func (*Syscalls) Name() string {
+ return "syscalls"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Syscalls) Synopsis() string {
+ return "Print compatibility information for syscalls."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Syscalls) Usage() string {
+ return `syscalls [options] - Print compatibility information for syscalls.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (s *Syscalls) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&s.output, "o", "table", "Output format (table, csv, json).")
+ f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)")
+ f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ out, ok := outputMap[s.output]
+ if !ok {
+ Fatalf("Unsupported output format %q", s.output)
+ }
+
+ // Build map of all supported architectures.
+ tables := kernel.SyscallTables()
+ for _, t := range tables {
+ osMap, ok := syscallTableMap[t.OS.String()]
+ if !ok {
+ osMap = make(map[string]*kernel.SyscallTable)
+ syscallTableMap[t.OS.String()] = osMap
+ }
+ osMap[t.Arch.String()] = t
+ }
+
+ // Build a map of the architectures we want to output.
+ info, err := getCompatibilityInfo(s.os, s.arch)
+ if err != nil {
+ Fatalf("%v", err)
+ }
+
+ if err := out(os.Stdout, info); err != nil {
+ Fatalf("Error writing output: %v", err)
+ }
+
+ return subcommands.ExitSuccess
+}
+
+// getCompatibilityInfo returns compatibility info for the given OS name and
+// architecture name. Supports the special name 'all' for OS and architecture that
+// specifies that all supported OSes or architectures should be included.
+func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, error) {
+ info := CompatibilityInfo(make(map[string]map[string]ArchInfo))
+ if osName == osAll {
+ // Special processing for the 'all' OS name.
+ for osName := range syscallTableMap {
+ info[osName] = make(map[string]ArchInfo)
+ // osName is a specific OS name.
+ if err := addToCompatibilityInfo(info, osName, archName); err != nil {
+ return info, err
+ }
+ }
+ } else {
+ // osName is a specific OS name.
+ info[osName] = make(map[string]ArchInfo)
+ if err := addToCompatibilityInfo(info, osName, archName); err != nil {
+ return info, err
+ }
+ }
+
+ return info, nil
+}
+
+// addToCompatibilityInfo adds ArchInfo for the given specific OS name and
+// architecture name. Supports the special architecture name 'all' to specify
+// that all supported architectures for the OS should be included.
+func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error {
+ if archName == archAll {
+ // Special processing for the 'all' architecture name.
+ for archName := range syscallTableMap[osName] {
+ archInfo, err := getArchInfo(osName, archName)
+ if err != nil {
+ return err
+ }
+ info[osName][archName] = archInfo
+ }
+ } else {
+ // archName is a specific architecture name.
+ archInfo, err := getArchInfo(osName, archName)
+ if err != nil {
+ return err
+ }
+ info[osName][archName] = archInfo
+ }
+
+ return nil
+}
+
+// getArchInfo returns compatibility info for a specific OS and architecture.
+func getArchInfo(osName string, archName string) (ArchInfo, error) {
+ info := ArchInfo{}
+ info.Syscalls = make(map[uintptr]SyscallDoc)
+
+ t, ok := syscallTableMap[osName][archName]
+ if !ok {
+ return info, fmt.Errorf("syscall table for %s/%s not found", osName, archName)
+ }
+
+ for num, sc := range t.Table {
+ info.Syscalls[num] = SyscallDoc{
+ Name: sc.Name,
+ num: num,
+ Support: sc.SupportLevel.String(),
+ Note: sc.Note,
+ URLs: sc.URLs,
+ }
+ }
+
+ return info, nil
+}
+
+// outputTable outputs the syscall info in tabular format.
+func outputTable(w io.Writer, info CompatibilityInfo) error {
+ tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
+
+ // Linux
+ for osName, osInfo := range info {
+ for archName, archInfo := range osInfo {
+ // Print the OS/arch
+ fmt.Fprintf(w, "%s/%s:\n\n", osName, archName)
+
+ // Sort the syscalls for output in the table.
+ sortedCalls := []SyscallDoc{}
+ for _, sc := range archInfo.Syscalls {
+ sortedCalls = append(sortedCalls, sc)
+ }
+ sort.Slice(sortedCalls, func(i, j int) bool {
+ return sortedCalls[i].num < sortedCalls[j].num
+ })
+
+ // Write the header
+ _, err := fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n",
+ "NUM",
+ "NAME",
+ "SUPPORT",
+ "NOTE",
+ )
+ if err != nil {
+ return err
+ }
+
+ // Write each syscall entry
+ for _, sc := range sortedCalls {
+ _, err = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n",
+ strconv.FormatInt(int64(sc.num), 10),
+ sc.Name,
+ sc.Support,
+ sc.Note,
+ )
+ if err != nil {
+ return err
+ }
+ // Add issue urls to note.
+ for _, url := range sc.URLs {
+ _, err = fmt.Fprintf(tw, "%s\t%s\t%s\tSee: %s\t\n",
+ "",
+ "",
+ "",
+ url,
+ )
+ if err != nil {
+ return err
+ }
+ }
+ }
+
+ err = tw.Flush()
+ if err != nil {
+ return err
+ }
+ }
+ }
+
+ return nil
+}
+
+// outputJSON outputs the syscall info in JSON format.
+func outputJSON(w io.Writer, info CompatibilityInfo) error {
+ e := json.NewEncoder(w)
+ e.SetIndent("", " ")
+ return e.Encode(info)
+}
+
+// numberedRow is a CSV row annotated with its syscall number (used for sorting).
+type numberedRow struct {
+ num uintptr
+ row []string
+}
+
+// outputCSV outputs the syscall info in CSV format.
+func outputCSV(w io.Writer, info CompatibilityInfo) error {
+ csvWriter := csv.NewWriter(w)
+
+ // Linux
+ for osName, osInfo := range info {
+ for archName, archInfo := range osInfo {
+ // Sort the syscalls for output in the table.
+ sortedCalls := []numberedRow{}
+ for _, sc := range archInfo.Syscalls {
+ // Add issue urls to note.
+ note := sc.Note
+ for _, url := range sc.URLs {
+ note = fmt.Sprintf("%s\nSee: %s", note, url)
+ }
+
+ sortedCalls = append(sortedCalls, numberedRow{
+ num: sc.num,
+ row: []string{
+ osName,
+ archName,
+ strconv.FormatInt(int64(sc.num), 10),
+ sc.Name,
+ sc.Support,
+ note,
+ },
+ })
+ }
+ sort.Slice(sortedCalls, func(i, j int) bool {
+ return sortedCalls[i].num < sortedCalls[j].num
+ })
+
+ // Write the header
+ err := csvWriter.Write([]string{
+ "OS",
+ "Arch",
+ "Num",
+ "Name",
+ "Support",
+ "Note",
+ })
+ if err != nil {
+ return err
+ }
+
+ // Write each syscall entry
+ for _, sc := range sortedCalls {
+ err = csvWriter.Write(sc.row)
+ if err != nil {
+ return err
+ }
+ }
+
+ csvWriter.Flush()
+ err = csvWriter.Error()
+ if err != nil {
+ return err
+ }
+ }
+ }
+
+ return nil
+}
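To show the JSON shape this data takes, here is a hedged sketch that hand-builds a one-entry CompatibilityInfo and marshals it with indentation, the same way outputJSON does; the syscall name and support string are illustrative only.

package main

import (
	"encoding/json"
	"fmt"

	"gvisor.googlesource.com/gvisor/runsc/cmd"
)

func main() {
	// One OS, one architecture, one syscall; only exported SyscallDoc fields
	// are set, matching the json tags defined above.
	info := cmd.CompatibilityInfo{
		"linux": {
			"amd64": cmd.ArchInfo{
				Syscalls: map[uintptr]cmd.SyscallDoc{
					0: {Name: "read", Support: "Full Support"},
				},
			},
		},
	}
	b, err := json.MarshalIndent(info, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
}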
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index a55a682f3..58fd01974 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -88,14 +88,14 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
waitStatus = ws
// Wait on a PID in the root PID namespace.
case wt.rootPID != unsetPID:
- ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */)
+ ws, err := c.WaitRootPID(int32(wt.rootPID))
if err != nil {
Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err)
}
waitStatus = ws
// Wait on a PID in the container's PID namespace.
case wt.pid != unsetPID:
- ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */)
+ ws, err := c.WaitPID(int32(wt.pid))
if err != nil {
Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err)
}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index b8af27c15..d016533e6 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -258,7 +258,7 @@ func TestJobControlSignalExec(t *testing.T) {
}
// Make sure the process indicates it was killed by a SIGKILL.
- ws, err := c.WaitPID(pid, true)
+ ws, err := c.WaitPID(pid)
if err != nil {
t.Errorf("waiting on container failed: %v", err)
}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 513085836..04b611b56 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -530,22 +530,22 @@ func (c *Container) Wait() (syscall.WaitStatus, error) {
// WaitRootPID waits for process 'pid' in the sandbox's PID namespace and
// returns its WaitStatus.
-func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID)
if !c.isSandboxRunning() {
return 0, fmt.Errorf("sandbox is not running")
}
- return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus)
+ return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
}
// WaitPID waits for process 'pid' in the container's PID namespace and returns
// its WaitStatus.
-func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
log.Debugf("Wait on PID %d in container %q", pid, c.ID)
if !c.isSandboxRunning() {
return 0, fmt.Errorf("sandbox is not running")
}
- return c.Sandbox.WaitPID(c.ID, pid, clearStatus)
+ return c.Sandbox.WaitPID(c.ID, pid)
}
// SignalContainer sends the signal to the container. If all is true and signal
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index dcd9910a0..867bf8187 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -36,6 +36,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/control"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
"gvisor.googlesource.com/gvisor/runsc/test/testutil"
)
@@ -1841,7 +1842,7 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus,
if err != nil {
return 0, fmt.Errorf("error executing: %v", err)
}
- ws, err := cont.WaitPID(pid, true /* clearStatus */)
+ ws, err := cont.WaitPID(pid)
if err != nil {
return 0, fmt.Errorf("error waiting: %v", err)
}
@@ -1853,7 +1854,7 @@ func TestMain(m *testing.M) {
if err := testutil.ConfigureExePath(); err != nil {
panic(err.Error())
}
- testutil.RunAsRoot()
+ specutils.MaybeRunAsRoot()
os.Exit(m.Run())
}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 39c4dc03d..d57a73d46 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -99,6 +99,36 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
return containers, cleanup, nil
}
+type execDesc struct {
+ c *Container
+ cmd []string
+ want int
+ desc string
+}
+
+func execMany(execs []execDesc) error {
+ for _, exec := range execs {
+ args := &control.ExecArgs{Argv: exec.cmd}
+ if ws, err := exec.c.executeSync(args); err != nil {
+ return fmt.Errorf("error executing %+v: %v", args, err)
+ } else if ws.ExitStatus() != exec.want {
+ return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want)
+ }
+ }
+ return nil
+}
+
+func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
+ for _, spec := range pod {
+ spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
+ spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
+ spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+ if len(mount.Options) > 0 {
+ spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+ }
+ }
+}
+
// TestMultiContainerSanity checks that it is possible to run 2 dead-simple
// containers in the same sandbox.
func TestMultiContainerSanity(t *testing.T) {
@@ -175,12 +205,12 @@ func TestMultiContainerWait(t *testing.T) {
go func(c *Container) {
defer wg.Done()
const pid = 2
- if ws, err := c.WaitPID(pid, true /* clearStatus */); err != nil {
+ if ws, err := c.WaitPID(pid); err != nil {
t.Errorf("failed to wait for PID %d: %v", pid, err)
} else if es := ws.ExitStatus(); es != 0 {
t.Errorf("PID %d exited with non-zero status %d", pid, es)
}
- if _, err := c.WaitPID(pid, true /* clearStatus */); err == nil {
+ if _, err := c.WaitPID(pid); err == nil {
t.Errorf("wait for stopped PID %d should fail", pid)
}
}(containers[1])
@@ -263,12 +293,12 @@ func TestExecWait(t *testing.T) {
}
// Get the exit status from the exec'd process.
- if ws, err := containers[0].WaitPID(pid, true /* clearStatus */); err != nil {
+ if ws, err := containers[0].WaitPID(pid); err != nil {
t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err)
} else if es := ws.ExitStatus(); es != 0 {
t.Fatalf("process %+v exited with non-zero status %d", args, es)
}
- if _, err := containers[0].WaitPID(pid, true /* clearStatus */); err == nil {
+ if _, err := containers[0].WaitPID(pid); err == nil {
t.Fatalf("wait for stopped process %+v should fail", args)
}
}
@@ -828,3 +858,272 @@ func TestMultiContainerGoferStop(t *testing.T) {
}
}
}
+
+// Test that pod shared mounts are properly mounted in 2 containers and that
+// changes from one container is reflected in the other.
+func TestMultiContainerSharedMount(t *testing.T) {
+ for _, conf := range configs(all...) {
+ t.Logf("Running test with conf: %+v", conf)
+
+ // Setup the containers.
+ sleep := []string{"sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: nil,
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+ createSharedMount(mnt0, "test-mount", podSpec...)
+
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ file0 := path.Join(mnt0.Destination, "abc")
+ file1 := path.Join(mnt1.Destination, "abc")
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
+ desc: "directory is mounted in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
+ desc: "directory is mounted in container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/touch", file0},
+ desc: "create file in container0",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-f", file0},
+ desc: "file appears in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-f", file1},
+ desc: "file appears in container1",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/bin/rm", file1},
+ desc: "file removed from container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "!", "-f", file0},
+ desc: "file removed from container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "!", "-f", file1},
+ desc: "file removed from container1",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/bin/mkdir", file1},
+ desc: "create directory in container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", file0},
+ desc: "dir appears in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", file1},
+ desc: "dir appears in container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/bin/rmdir", file0},
+ desc: "create directory in container0",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "!", "-d", file0},
+ desc: "dir removed from container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "!", "-d", file1},
+ desc: "dir removed from container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+ }
+}
+
+// Test that pod mounts are mounted as readonly when requested.
+func TestMultiContainerSharedMountReadonly(t *testing.T) {
+ for _, conf := range configs(all...) {
+ t.Logf("Running test with conf: %+v", conf)
+
+ // Setup the containers.
+ sleep := []string{"sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: []string{"ro"},
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+ createSharedMount(mnt0, "test-mount", podSpec...)
+
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ file0 := path.Join(mnt0.Destination, "abc")
+ file1 := path.Join(mnt1.Destination, "abc")
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
+ desc: "directory is mounted in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
+ desc: "directory is mounted in container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/touch", file0},
+ want: 1,
+ desc: "fails to write to container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/touch", file1},
+ want: 1,
+ desc: "fails to write to container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+ }
+}
+
+// Test that shared pod mounts continue to work after container is restarted.
+func TestMultiContainerSharedMountRestart(t *testing.T) {
+ for _, conf := range configs(all...) {
+ t.Logf("Running test with conf: %+v", conf)
+
+ // Setup the containers.
+ sleep := []string{"sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: nil,
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+ createSharedMount(mnt0, "test-mount", podSpec...)
+
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ file0 := path.Join(mnt0.Destination, "abc")
+ file1 := path.Join(mnt1.Destination, "abc")
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/touch", file0},
+ desc: "create file in container0",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-f", file0},
+ desc: "file appears in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-f", file1},
+ desc: "file appears in container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+
+ containers[1].Destroy()
+
+ bundleDir, err := testutil.SetupBundleDir(podSpec[1])
+ if err != nil {
+ t.Fatalf("error restarting container: %v", err)
+ }
+ defer os.RemoveAll(bundleDir)
+
+ containers[1], err = Create(ids[1], podSpec[1], conf, bundleDir, "", "", "")
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ if err := containers[1].Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ execs = []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-f", file0},
+ desc: "file is still in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-f", file1},
+ desc: "file is still in container1",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/bin/rm", file1},
+ desc: "file removed from container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "!", "-f", file0},
+ desc: "file removed from container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "!", "-f", file1},
+ desc: "file removed from container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+ }
+}
diff --git a/runsc/main.go b/runsc/main.go
index 11bc73f75..cfe3a78d0 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -48,11 +48,12 @@ var (
// system that are not covered by the runtime spec.
// Debugging flags.
- debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
- logPackets = flag.Bool("log-packets", false, "enable network packet logging")
- logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.")
- debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.")
- debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
+ debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+ logPackets = flag.Bool("log-packets", false, "enable network packet logging")
+ logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.")
+ debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.")
+ debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
+ alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr")
// Debugging flags: strace related
strace = flag.Bool("strace", false, "enable strace")
@@ -60,22 +61,27 @@ var (
straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
// Flags that control sandbox runtime behavior.
- platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
- network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
- gso = flag.Bool("gso", true, "enable generic segmenation offload")
- fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
- overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
- watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
- panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
- profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
- netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
-
+ platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+ network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+ gso = flag.Bool("gso", true, "enable generic segmentation offload")
+ fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+ overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+ watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
+ panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+ profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+ netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+ numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels (FDs) to use for network link endpoints.")
+ rootless = flag.Bool("rootless", false, "allow the sandbox to be started by a user that is not root. Sandbox and Gofer processes may run with the same privileges as the current user.")
+
+ // Test flags, not to be used outside tests, ever.
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
)
func main() {
// Help and flags commands are generated automatically.
- subcommands.Register(subcommands.HelpCommand(), "")
+ help := cmd.NewHelp(subcommands.DefaultCommander)
+ help.Register(new(cmd.Syscalls))
+ subcommands.Register(help, "")
subcommands.Register(subcommands.FlagsCommand(), "")
// Register user-facing runsc commands.
@@ -117,6 +123,22 @@ func main() {
os.Exit(0)
}
+ var errorLogger io.Writer
+ if *logFD > -1 {
+ errorLogger = os.NewFile(uintptr(*logFD), "error log file")
+
+ } else if *logFilename != "" {
+ // We must set O_APPEND and not O_TRUNC because Docker passes
+ // the same log file for all commands (and also parses these
+ // log files), so we can't destroy them on each command.
+ var err error
+ errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+ if err != nil {
+ cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+ }
+ }
+ cmd.ErrorLogger = errorLogger
+
platformType, err := boot.MakePlatformType(*platform)
if err != nil {
cmd.Fatalf("%v", err)
@@ -141,26 +163,33 @@ func main() {
cmd.Fatalf("%v", err)
}
+ if *numNetworkChannels <= 0 {
+ cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
+ }
+
// Create a new Config from the flags.
conf := &boot.Config{
- RootDir: *rootDir,
- Debug: *debug,
- LogFilename: *logFilename,
- LogFormat: *logFormat,
- DebugLog: *debugLog,
- DebugLogFormat: *debugLogFormat,
- FileAccess: fsAccess,
- Overlay: *overlay,
- Network: netType,
- GSO: *gso,
- LogPackets: *logPackets,
- Platform: platformType,
- Strace: *strace,
- StraceLogSize: *straceLogSize,
- WatchdogAction: wa,
- PanicSignal: *panicSignal,
- ProfileEnable: *profile,
- EnableRaw: *netRaw,
+ RootDir: *rootDir,
+ Debug: *debug,
+ LogFilename: *logFilename,
+ LogFormat: *logFormat,
+ DebugLog: *debugLog,
+ DebugLogFormat: *debugLogFormat,
+ FileAccess: fsAccess,
+ Overlay: *overlay,
+ Network: netType,
+ GSO: *gso,
+ LogPackets: *logPackets,
+ Platform: platformType,
+ Strace: *strace,
+ StraceLogSize: *straceLogSize,
+ WatchdogAction: wa,
+ PanicSignal: *panicSignal,
+ ProfileEnable: *profile,
+ EnableRaw: *netRaw,
+ NumNetworkChannels: *numNetworkChannels,
+ Rootless: *rootless,
+
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
}
if len(*straceSyscalls) != 0 {
@@ -174,24 +203,7 @@ func main() {
subcommand := flag.CommandLine.Arg(0)
- var logFile io.Writer = os.Stderr
- if *logFD > -1 {
- logFile = os.NewFile(uintptr(*logFD), "log file")
- } else if *logFilename != "" {
- // We must set O_APPEND and not O_TRUNC because Docker passes
- // the same log file for all commands (and also parses these
- // log files), so we can't destroy them on each command.
- f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
- if err != nil {
- cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
- }
- logFile = f
- } else if subcommand == "do" {
- logFile = ioutil.Discard
- }
-
- e := newEmitter(*logFormat, logFile)
-
+ var e log.Emitter
if *debugLogFD > -1 {
f := os.NewFile(uintptr(*debugLogFD), "debug log file")
@@ -201,28 +213,31 @@ func main() {
cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
}
- // If we are the boot process, then we own our stdio FDs and
- // can do what we want with them. Since Docker and Containerd
- // both eat boot's stderr, we dup our stderr to the provided
- // log FD so that panics will appear in the logs, rather than
- // just disappear.
+ // If we are the boot process, then we own our stdio FDs and can do what we
+ // want with them. Since Docker and Containerd both eat boot's stderr, we
+ // dup our stderr to the provided log FD so that panics will appear in the
+ // logs, rather than just disappear.
if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil {
cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err)
}
- if logFile == os.Stderr {
- // Suppress logging to stderr when debug log is enabled. Otherwise all
- // messages will be duplicated in the debug log (see Dup2() call above).
- e = newEmitter(*debugLogFormat, f)
- } else {
- e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)}
- }
+ e = newEmitter(*debugLogFormat, f)
+
} else if *debugLog != "" {
f, err := specutils.DebugLogFile(*debugLog, subcommand)
if err != nil {
cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
}
- e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)}
+ e = newEmitter(*debugLogFormat, f)
+
+ } else {
+ // Stderr is reserved for the application; just discard the logs if no debug
+ // log is specified.
+ e = newEmitter("text", ioutil.Discard)
+ }
+
+ if *alsoLogToStderr {
+ e = log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
}
log.SetTarget(e)
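
The emitter selection above ends with an optional fan-out: when --alsologtostderr is set, log.MultiEmitter wraps the chosen emitter so every record also reaches stderr. As a rough standard-library analogy (a sketch, not the gVisor log package itself), the same fan-out shape looks like this:

package main

import (
	"io"
	"log"
	"os"
)

func main() {
	// Open the debug log with O_APPEND, mirroring the reasoning above: the
	// same file may be reused across commands, so it must not be truncated.
	f, err := os.OpenFile("/tmp/debug.log", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		log.Fatalf("error opening log file: %v", err)
	}
	defer f.Close()

	// Fan out every record to both the file and stderr, analogous to
	// log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}.
	logger := log.New(io.MultiWriter(f, os.Stderr), "", log.LstdFlags)
	logger.Println("written to the debug log and to stderr")
}
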
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 0460d5f1a..e9e24fc58 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -68,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
// Build the path to the net namespace of the sandbox process.
// This is what we will copy.
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
- if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil {
+ if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil {
return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
}
case boot.NetworkHost:
@@ -138,7 +138,7 @@ func isRootNS() (bool, error) {
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error {
// Join the network namespace that we will be copying.
restore, err := joinNetNS(nsPath)
if err != nil {
@@ -202,25 +202,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
continue
}
- // Create the socket.
- const protocol = 0x0300 // htons(ETH_P_ALL)
- fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
- if err != nil {
- return fmt.Errorf("unable to create raw socket: %v", err)
- }
- deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
-
- // Bind to the appropriate device.
- ll := syscall.SockaddrLinklayer{
- Protocol: protocol,
- Ifindex: iface.Index,
- Hatype: 0, // No ARP type.
- Pkttype: syscall.PACKET_OTHERHOST,
- }
- if err := syscall.Bind(fd, &ll); err != nil {
- return fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
- }
-
// Scrape the routes before removing the address, since that
// will remove the routes as well.
routes, def, err := routesForIface(iface)
@@ -236,9 +217,10 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
link := boot.FDBasedLink{
- Name: iface.Name,
- MTU: iface.MTU,
- Routes: routes,
+ Name: iface.Name,
+ MTU: iface.MTU,
+ Routes: routes,
+ NumChannels: numNetworkChannels,
}
// Get the link for the interface.
@@ -246,32 +228,25 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
if err != nil {
return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
}
- link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr)
+ link.LinkAddress = ifaceLink.Attrs().HardwareAddr
- if enableGSO {
- gso, err := isGSOEnabled(fd, iface.Name)
+ log.Debugf("Setting up network channels")
+ // Create the socket for the device.
+ for i := 0; i < link.NumChannels; i++ {
+ log.Debugf("Creating Channel %d", i)
+ socketEntry, err := createSocket(iface, ifaceLink, enableGSO)
if err != nil {
- return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+ return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
}
- if gso {
- if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
- return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
- }
- link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize
+ if i == 0 {
+ link.GSOMaxSize = socketEntry.gsoMaxSize
} else {
- log.Infof("GSO not available in host.")
+ if link.GSOMaxSize != socketEntry.gsoMaxSize {
+ return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
+ link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
+ }
}
- }
-
- // Use SO_RCVBUFFORCE because on linux the receive buffer for an
- // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
- // defaults to a unusually low value of 208KB. This is too low
- // for gVisor to be able to receive packets at high throughputs
- // without incurring packet drops.
- const rcvBufSize = 4 << 20 // 4MB.
-
- if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
- return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
}
// Collect the addresses for the interface, enable forwarding,
@@ -285,7 +260,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
}
- args.FilePayload.Files = append(args.FilePayload.Files, deviceFile)
args.FDBasedLinks = append(args.FDBasedLinks, link)
}
@@ -296,6 +270,61 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
return nil
}
+type socketEntry struct {
+ deviceFile *os.File
+ gsoMaxSize uint32
+}
+
+// createSocket creates an underlying AF_PACKET socket, configures it for use by
+// the sentry, and returns an *os.File that wraps the underlying socket FD.
+func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
+ // Create the socket.
+ const protocol = 0x0300 // htons(ETH_P_ALL)
+ fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+ if err != nil {
+ return nil, fmt.Errorf("unable to create raw socket: %v", err)
+ }
+ deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+ // Bind to the appropriate device.
+ ll := syscall.SockaddrLinklayer{
+ Protocol: protocol,
+ Ifindex: iface.Index,
+ Hatype: 0, // No ARP type.
+ Pkttype: syscall.PACKET_OTHERHOST,
+ }
+ if err := syscall.Bind(fd, &ll); err != nil {
+ return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+ }
+
+ gsoMaxSize := uint32(0)
+ if enableGSO {
+ gso, err := isGSOEnabled(fd, iface.Name)
+ if err != nil {
+ return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+ }
+ if gso {
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+ return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+ }
+ gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
+ } else {
+ log.Infof("GSO not available in host.")
+ }
+ }
+
+ // Use SO_RCVBUFFORCE because on linux the receive buffer for an
+ // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
+ // defaults to an unusually low value of 208KB. This is too low
+ // for gVisor to be able to receive packets at high throughputs
+ // without incurring packet drops.
+ const rcvBufSize = 4 << 20 // 4MB.
+
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
+ return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ }
+ return &socketEntry{deviceFile, gsoMaxSize}, nil
+}
+
// loopbackLinks collects the links for a loopback interface.
func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
var links []boot.LoopbackLink
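
The protocol constant 0x0300 used in createSocket above is the byte-swapped ETH_P_ALL (0x0003); the "htons(ETH_P_ALL)" comment is terse, so here is a minimal sketch (assuming a little-endian host, which is what the sandbox targets) showing where the value comes from:

package main

import "fmt"

// htons swaps the two bytes of a 16-bit value; on little-endian hosts this is
// the host-to-network byte order conversion.
func htons(v uint16) uint16 {
	return v<<8 | v>>8
}

func main() {
	const ethPAll = 0x0003 // ETH_P_ALL from <linux/if_ether.h>
	fmt.Printf("%#04x\n", htons(ethPAll)) // prints 0x0300, the constant used above
}
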
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 47a66afb2..5ff6f879c 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -515,46 +515,64 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund
} else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
log.Infof("Sandbox will be started in new user namespace")
nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
+ cmd.Args = append(cmd.Args, "--setup-root")
- // Map nobody in the new namespace to nobody in the parent namespace.
- //
- // A sandbox process will construct an empty
- // root for itself, so it has to have the CAP_SYS_ADMIN
- // capability.
- //
- // FIXME(b/122554829): The current implementations of
- // os/exec doesn't allow to set ambient capabilities if
- // a process is started in a new user namespace. As a
- // workaround, we start the sandbox process with the 0
- // UID and then it constructs a chroot and sets UID to
- // nobody. https://github.com/golang/go/issues/2315
- const nobody = 65534
- cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
- {
- ContainerID: int(0),
- HostID: int(nobody - 1),
- Size: int(1),
- },
- {
- ContainerID: int(nobody),
- HostID: int(nobody),
- Size: int(1),
- },
- }
- cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
- {
- ContainerID: int(nobody),
- HostID: int(nobody),
- Size: int(1),
- },
+ if conf.Rootless {
+ log.Infof("Rootless mode: sandbox will run as root inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
+ cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: 0,
+ HostID: os.Getuid(),
+ Size: 1,
+ },
+ }
+ cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: 0,
+ HostID: os.Getgid(),
+ Size: 1,
+ },
+ }
+ cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
+
+ } else {
+ // Map nobody in the new namespace to nobody in the parent namespace.
+ //
+ // A sandbox process will construct an empty
+ // root for itself, so it has to have the CAP_SYS_ADMIN
+ // capability.
+ //
+ // FIXME(b/122554829): The current implementations of
+ // os/exec doesn't allow to set ambient capabilities if
+ // a process is started in a new user namespace. As a
+ // workaround, we start the sandbox process with the 0
+ // UID and then it constructs a chroot and sets UID to
+ // nobody. https://github.com/golang/go/issues/2315
+ const nobody = 65534
+ cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: 0,
+ HostID: nobody - 1,
+ Size: 1,
+ },
+ {
+ ContainerID: nobody,
+ HostID: nobody,
+ Size: 1,
+ },
+ }
+ cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: nobody,
+ HostID: nobody,
+ Size: 1,
+ },
+ }
+
+ // Set credentials to run as user and group nobody.
+ cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: nobody}
}
- // Set credentials to run as user and group nobody.
- cmd.SysProcAttr.Credential = &syscall.Credential{
- Uid: 0,
- Gid: nobody,
- }
- cmd.Args = append(cmd.Args, "--setup-root")
} else {
return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
}
@@ -649,7 +667,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
// WaitPID waits for process 'pid' in the container's sandbox and returns its
// WaitStatus.
-func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) {
log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
var ws syscall.WaitStatus
conn, err := s.sandboxConnect()
@@ -659,9 +677,8 @@ func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.Wait
defer conn.Close()
args := &boot.WaitPIDArgs{
- PID: pid,
- CID: cid,
- ClearStatus: clearStatus,
+ PID: pid,
+ CID: cid,
}
if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil {
return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err)
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 15476de6f..0456e4c4f 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -10,10 +10,7 @@ go_library(
"specutils.go",
],
importpath = "gvisor.googlesource.com/gvisor/runsc/specutils",
- visibility = [
- "//runsc:__subpackages__",
- "//test:__subpackages__",
- ],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
"//pkg/log",
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
index 1f3afb4e4..6e6902e9f 100644
--- a/runsc/specutils/fs.go
+++ b/runsc/specutils/fs.go
@@ -16,6 +16,7 @@ package specutils
import (
"fmt"
+ "math/bits"
"path"
"syscall"
@@ -105,22 +106,30 @@ func optionsToFlags(opts []string, source map[string]mapping) uint32 {
return rv
}
-// ValidateMount validates that spec mounts are correct.
+// validateMount validates that spec mounts are correct.
func validateMount(mnt *specs.Mount) error {
if !path.IsAbs(mnt.Destination) {
return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt)
}
-
if mnt.Type == "bind" {
- for _, o := range mnt.Options {
- if ContainsStr(invalidOptions, o) {
- return fmt.Errorf("mount option %q is not supported: %v", o, mnt)
- }
- _, ok1 := optionsMap[o]
- _, ok2 := propOptionsMap[o]
- if !ok1 && !ok2 {
- return fmt.Errorf("unknown mount option %q", o)
- }
+ return ValidateMountOptions(mnt.Options)
+ }
+ return nil
+}
+
+// ValidateMountOptions validates that mount options are correct.
+func ValidateMountOptions(opts []string) error {
+ for _, o := range opts {
+ if ContainsStr(invalidOptions, o) {
+ return fmt.Errorf("mount option %q is not supported", o)
+ }
+ _, ok1 := optionsMap[o]
+ _, ok2 := propOptionsMap[o]
+ if !ok1 && !ok2 {
+ return fmt.Errorf("unknown mount option %q", o)
+ }
+ if err := validatePropagation(o); err != nil {
+ return err
}
}
return nil
@@ -133,5 +142,14 @@ func validateRootfsPropagation(opt string) error {
if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 {
return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt)
}
+ return validatePropagation(opt)
+}
+
+func validatePropagation(opt string) error {
+ flags := PropOptionsToFlags([]string{opt})
+ exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE)
+ if bits.OnesCount32(exclusive) > 1 {
+ return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt)
+ }
return nil
}
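
validatePropagation above relies on each propagation option setting exactly one MS_* bit, so a population count greater than one means conflicting options were combined. A small worked example with the standard syscall constants (a sketch, not the specutils helper itself):

package main

import (
	"fmt"
	"math/bits"
	"syscall"
)

func main() {
	// "shared" and "slave" each contribute one propagation bit; combining them
	// yields a popcount of 2, which validatePropagation rejects as mutually
	// exclusive.
	conflicting := uint32(syscall.MS_SHARED | syscall.MS_SLAVE)
	fmt.Println(bits.OnesCount32(conflicting)) // 2 -> rejected

	ok := uint32(syscall.MS_PRIVATE)
	fmt.Println(bits.OnesCount32(ok)) // 1 -> accepted
}
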
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index 7d194335c..06c13d1ab 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -220,3 +220,55 @@ func HasCapabilities(cs ...capability.Cap) bool {
}
return true
}
+
+// MaybeRunAsRoot ensures the process runs with capabilities needed to create a
+// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
+// it will create a new user namespace and re-execute the process as root
+// inside the namespace with the same arguments and environment.
+//
+// This function returns immediately when no new capability is needed. If
+// another process is executed, the current process exits from here with the same
+// exit code as the child.
+func MaybeRunAsRoot() error {
+ if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
+ return nil
+ }
+
+ // Current process doesn't have required capabilities, create user namespace
+ // and run as root inside the namespace to acquire capabilities.
+ log.Infof("*** Re-running as root in new user namespace ***")
+
+ cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
+
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+ // Set current user/group as root inside the namespace. Since we may not
+ // have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
+ UidMappings: []syscall.SysProcIDMap{
+ {ContainerID: 0, HostID: os.Getuid(), Size: 1},
+ },
+ GidMappings: []syscall.SysProcIDMap{
+ {ContainerID: 0, HostID: os.Getgid(), Size: 1},
+ },
+ Credential: &syscall.Credential{Uid: 0, Gid: 0},
+ GidMappingsEnableSetgroups: false,
+ }
+
+ cmd.Env = os.Environ()
+ cmd.Stdin = os.Stdin
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ if err := cmd.Run(); err != nil {
+ if exit, ok := err.(*exec.ExitError); ok {
+ if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
+ os.Exit(ws.ExitStatus())
+ }
+ log.Warningf("No wait status provided, exiting with -1: %v", err)
+ os.Exit(-1)
+ }
+ return fmt.Errorf("re-executing self: %v", err)
+ }
+ // Child completed with success.
+ os.Exit(0)
+ panic("unreachable")
+}
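
MaybeRunAsRoot is meant to be called once, early, before any work that needs the listed capabilities. A hypothetical call site (the surrounding main is illustrative, not taken from the change):

package main

import (
	"fmt"
	"os"

	"gvisor.googlesource.com/gvisor/runsc/specutils"
)

func main() {
	// Either returns immediately (capabilities already present), exits with the
	// re-executed child's status, or reports a failure to re-exec.
	if err := specutils.MaybeRunAsRoot(); err != nil {
		fmt.Fprintf(os.Stderr, "error acquiring capabilities: %v\n", err)
		os.Exit(1)
	}
	// Past this point the process holds CAP_SYS_ADMIN, CAP_SYS_CHROOT,
	// CAP_SETUID and CAP_SETGID, either natively or as root inside a new user
	// namespace.
	fmt.Println("running with sandbox-creation capabilities")
}
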
diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD
index 0c4e4fa80..04ed885c6 100644
--- a/runsc/test/integration/BUILD
+++ b/runsc/test/integration/BUILD
@@ -8,6 +8,7 @@ go_test(
srcs = [
"exec_test.go",
"integration_test.go",
+ "regression_test.go",
],
embed = [":integration"],
tags = [
diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go
index 7af064d79..7c0e61ac3 100644
--- a/runsc/test/integration/exec_test.go
+++ b/runsc/test/integration/exec_test.go
@@ -29,6 +29,7 @@ package integration
import (
"fmt"
"strconv"
+ "strings"
"syscall"
"testing"
"time"
@@ -136,3 +137,25 @@ func TestExecJobControl(t *testing.T) {
t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
}
}
+
+// Test that failure to exec returns proper error message.
+func TestExecError(t *testing.T) {
+ if err := testutil.Pull("alpine"); err != nil {
+ t.Fatalf("docker pull failed: %v", err)
+ }
+ d := testutil.MakeDocker("exec-error-test")
+
+ // Start the container.
+ if err := d.Run("alpine", "sleep", "1000"); err != nil {
+ t.Fatalf("docker run failed: %v", err)
+ }
+ defer d.CleanUp()
+
+ _, err := d.Exec("no_can_find")
+ if err == nil {
+ t.Fatalf("docker exec didn't fail")
+ }
+ if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) {
+ t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want)
+ }
+}
diff --git a/runsc/test/integration/regression_test.go b/runsc/test/integration/regression_test.go
new file mode 100644
index 000000000..80bae9970
--- /dev/null
+++ b/runsc/test/integration/regression_test.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration
+
+import (
+ "strings"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/runsc/test/testutil"
+)
+
+// Test that UDS can be created using overlay when parent directory is in lower
+// layer only (b/134090485).
+//
+// Prerequisite: the directory where the socket file is created must not have
+// been opened for writing before bind(2) is called.
+func TestBindOverlay(t *testing.T) {
+ if err := testutil.Pull("ubuntu:trusty"); err != nil {
+ t.Fatal("docker pull failed:", err)
+ }
+ d := testutil.MakeDocker("bind-overlay-test")
+
+ cmd := "nc -l -U /var/run/sock& sleep 1 && echo foobar-asdf | nc -U /var/run/sock"
+ got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd)
+ if err != nil {
+ t.Fatal("docker run failed:", err)
+ }
+
+ if want := "foobar-asdf"; !strings.Contains(got, want) {
+ t.Fatalf("docker run output is missing %q: %s", want, got)
+ }
+ defer d.CleanUp()
+}
diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD
index ddec81444..eedf962a4 100644
--- a/runsc/test/testutil/BUILD
+++ b/runsc/test/testutil/BUILD
@@ -18,6 +18,5 @@ go_library(
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_kr_pty//:go_default_library",
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
- "@com_github_syndtr_gocapability//capability:go_default_library",
],
)
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index 9efb1ba8e..1bd5adc54 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -30,7 +30,6 @@ import (
"os/exec"
"os/signal"
"path/filepath"
- "runtime"
"strings"
"sync"
"sync/atomic"
@@ -39,7 +38,6 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "github.com/syndtr/gocapability/capability"
"gvisor.googlesource.com/gvisor/runsc/boot"
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
@@ -136,6 +134,7 @@ func TestConfig() *boot.Config {
Strace: true,
FileAccess: boot.FileAccessExclusive,
TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
+ NumNetworkChannels: 1,
}
}
@@ -283,54 +282,6 @@ func WaitForHTTP(port int, timeout time.Duration) error {
return Poll(cb, timeout)
}
-// RunAsRoot ensures the test runs with CAP_SYS_ADMIN and CAP_SYS_CHROOT. If
-// needed it will create a new user namespace and re-execute the test as root
-// inside of the namespace. This function returns when it's running as root. If
-// it needs to create another process, it will exit from there and not return.
-func RunAsRoot() {
- if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) {
- return
- }
-
- fmt.Println("*** Re-running test as root in new user namespace ***")
-
- // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
- // as root inside that namespace to get it.
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
- cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
- cmd.SysProcAttr = &syscall.SysProcAttr{
- Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
- // Set current user/group as root inside the namespace.
- UidMappings: []syscall.SysProcIDMap{
- {ContainerID: 0, HostID: os.Getuid(), Size: 1},
- },
- GidMappings: []syscall.SysProcIDMap{
- {ContainerID: 0, HostID: os.Getgid(), Size: 1},
- },
- GidMappingsEnableSetgroups: false,
- Credential: &syscall.Credential{
- Uid: 0,
- Gid: 0,
- },
- }
- cmd.Env = os.Environ()
- cmd.Stdin = os.Stdin
- cmd.Stdout = os.Stdout
- cmd.Stderr = os.Stderr
- if err := cmd.Run(); err != nil {
- if exit, ok := err.(*exec.ExitError); ok {
- if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
- os.Exit(ws.ExitStatus())
- }
- os.Exit(-1)
- }
- panic(fmt.Sprint("error running child process:", err.Error()))
- }
- os.Exit(0)
-}
-
// Reaper reaps child processes.
type Reaper struct {
// mu protects ch, which will be nil if the reaper is not running.
diff --git a/test/BUILD b/test/BUILD
index e99b4e501..8e1dc5228 100644
--- a/test/BUILD
+++ b/test/BUILD
@@ -1,8 +1,4 @@
-# gVisor is a general-purpose sandbox.
-
-package(licenses = ["notice"])
-
-exports_files(["LICENSE"])
+package(licenses = ["notice"]) # Apache 2.0
# We need to define a bazel platform and toolchain to specify dockerPrivileged
# and dockerRunAsRoot options, they are required to run tests on the RBE
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index c53742d14..731e2aa85 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -13,11 +13,17 @@ syscall_test(
test = "//test/syscalls/linux:accept_bind_test",
)
-syscall_test(test = "//test/syscalls/linux:access_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:access_test",
+)
syscall_test(test = "//test/syscalls/linux:affinity_test")
-syscall_test(test = "//test/syscalls/linux:aio_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:aio_test",
+)
syscall_test(
size = "medium",
@@ -30,6 +36,7 @@ syscall_test(test = "//test/syscalls/linux:bad_test")
syscall_test(
size = "large",
+ add_overlay = True,
test = "//test/syscalls/linux:bind_test",
)
@@ -37,17 +44,27 @@ syscall_test(test = "//test/syscalls/linux:brk_test")
syscall_test(test = "//test/syscalls/linux:socket_test")
-syscall_test(test = "//test/syscalls/linux:chdir_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:chdir_test",
+)
-syscall_test(test = "//test/syscalls/linux:chmod_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:chmod_test",
+)
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:chown_test",
use_tmpfs = True, # chown tests require gofer to be running as root.
)
-syscall_test(test = "//test/syscalls/linux:chroot_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:chroot_test",
+)
syscall_test(test = "//test/syscalls/linux:clock_getres_test")
@@ -60,11 +77,17 @@ syscall_test(test = "//test/syscalls/linux:clock_nanosleep_test")
syscall_test(test = "//test/syscalls/linux:concurrency_test")
-syscall_test(test = "//test/syscalls/linux:creat_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:creat_test",
+)
syscall_test(test = "//test/syscalls/linux:dev_test")
-syscall_test(test = "//test/syscalls/linux:dup_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:dup_test",
+)
syscall_test(test = "//test/syscalls/linux:epoll_test")
@@ -74,23 +97,34 @@ syscall_test(test = "//test/syscalls/linux:exceptions_test")
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:exec_test",
)
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:exec_binary_test",
)
syscall_test(test = "//test/syscalls/linux:exit_test")
-syscall_test(test = "//test/syscalls/linux:fadvise64_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:fadvise64_test",
+)
-syscall_test(test = "//test/syscalls/linux:fallocate_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:fallocate_test",
+)
syscall_test(test = "//test/syscalls/linux:fault_test")
-syscall_test(test = "//test/syscalls/linux:fchdir_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:fchdir_test",
+)
syscall_test(
size = "medium",
@@ -99,6 +133,7 @@ syscall_test(
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:flock_test",
)
@@ -108,7 +143,10 @@ syscall_test(test = "//test/syscalls/linux:fpsig_fork_test")
syscall_test(test = "//test/syscalls/linux:fpsig_nested_test")
-syscall_test(test = "//test/syscalls/linux:fsync_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:fsync_test",
+)
syscall_test(
size = "medium",
@@ -120,7 +158,10 @@ syscall_test(test = "//test/syscalls/linux:getcpu_host_test")
syscall_test(test = "//test/syscalls/linux:getcpu_test")
-syscall_test(test = "//test/syscalls/linux:getdents_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:getdents_test",
+)
syscall_test(test = "//test/syscalls/linux:getrandom_test")
@@ -128,11 +169,13 @@ syscall_test(test = "//test/syscalls/linux:getrusage_test")
syscall_test(
size = "medium",
+ add_overlay = False, # TODO(gvisor.dev/issue/317): enable when fixed.
test = "//test/syscalls/linux:inotify_test",
)
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:ioctl_test",
)
@@ -144,11 +187,15 @@ syscall_test(
syscall_test(test = "//test/syscalls/linux:kill_test")
syscall_test(
+ add_overlay = True,
test = "//test/syscalls/linux:link_test",
use_tmpfs = True, # gofer needs CAP_DAC_READ_SEARCH to use AT_EMPTY_PATH with linkat(2)
)
-syscall_test(test = "//test/syscalls/linux:lseek_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:lseek_test",
+)
syscall_test(test = "//test/syscalls/linux:madvise_test")
@@ -158,9 +205,13 @@ syscall_test(test = "//test/syscalls/linux:mempolicy_test")
syscall_test(test = "//test/syscalls/linux:mincore_test")
-syscall_test(test = "//test/syscalls/linux:mkdir_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:mkdir_test",
+)
syscall_test(
+ add_overlay = True,
test = "//test/syscalls/linux:mknod_test",
use_tmpfs = True, # mknod is not supported over gofer.
)
@@ -171,7 +222,10 @@ syscall_test(
test = "//test/syscalls/linux:mmap_test",
)
-syscall_test(test = "//test/syscalls/linux:mount_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:mount_test",
+)
syscall_test(
size = "medium",
@@ -185,9 +239,15 @@ syscall_test(
syscall_test(test = "//test/syscalls/linux:munmap_test")
-syscall_test(test = "//test/syscalls/linux:open_create_test")
+syscall_test(
+ add_overlay = False, # TODO(gvisor.dev/issue/316): enable when fixed.
+ test = "//test/syscalls/linux:open_create_test",
+)
-syscall_test(test = "//test/syscalls/linux:open_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:open_test",
+)
syscall_test(test = "//test/syscalls/linux:partial_bad_buffer_test")
@@ -195,6 +255,7 @@ syscall_test(test = "//test/syscalls/linux:pause_test")
syscall_test(
size = "large",
+ add_overlay = False, # TODO(gvisor.dev/issue/318): enable when fixed.
shard_count = 5,
test = "//test/syscalls/linux:pipe_test",
)
@@ -210,11 +271,20 @@ syscall_test(test = "//test/syscalls/linux:prctl_setuid_test")
syscall_test(test = "//test/syscalls/linux:prctl_test")
-syscall_test(test = "//test/syscalls/linux:pread64_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:pread64_test",
+)
-syscall_test(test = "//test/syscalls/linux:preadv_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:preadv_test",
+)
-syscall_test(test = "//test/syscalls/linux:preadv2_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:preadv2_test",
+)
syscall_test(test = "//test/syscalls/linux:priority_test")
@@ -239,13 +309,22 @@ syscall_test(
test = "//test/syscalls/linux:pty_test",
)
-syscall_test(test = "//test/syscalls/linux:pwritev2_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:pwritev2_test",
+)
-syscall_test(test = "//test/syscalls/linux:pwrite64_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:pwrite64_test",
+)
syscall_test(test = "//test/syscalls/linux:raw_socket_ipv4_test")
-syscall_test(test = "//test/syscalls/linux:read_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:read_test",
+)
syscall_test(
size = "medium",
@@ -254,11 +333,13 @@ syscall_test(
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:readv_test",
)
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:rename_test",
)
@@ -279,11 +360,20 @@ syscall_test(
test = "//test/syscalls/linux:semaphore_test",
)
-syscall_test(test = "//test/syscalls/linux:sendfile_socket_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sendfile_socket_test",
+)
-syscall_test(test = "//test/syscalls/linux:sendfile_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sendfile_test",
+)
-syscall_test(test = "//test/syscalls/linux:splice_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:splice_test",
+)
syscall_test(test = "//test/syscalls/linux:sigaction_test")
@@ -330,11 +420,13 @@ syscall_test(
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:socket_filesystem_non_blocking_test",
)
syscall_test(
size = "large",
+ add_overlay = True,
shard_count = 10,
test = "//test/syscalls/linux:socket_filesystem_test",
)
@@ -418,12 +510,6 @@ syscall_test(
)
syscall_test(
- size = "large",
- shard_count = 10,
- test = "//test/syscalls/linux:socket_unix_abstract_test",
-)
-
-syscall_test(
# NOTE(b/116636318): Large sendmsg may stall a long time.
size = "enormous",
test = "//test/syscalls/linux:socket_unix_dgram_local_test",
@@ -436,12 +522,7 @@ syscall_test(
syscall_test(
size = "large",
- shard_count = 10,
- test = "//test/syscalls/linux:socket_unix_filesystem_test",
-)
-
-syscall_test(
- size = "large",
+ add_overlay = True,
shard_count = 10,
test = "//test/syscalls/linux:socket_unix_pair_test",
)
@@ -484,19 +565,40 @@ syscall_test(
test = "//test/syscalls/linux:socket_unix_unbound_stream_test",
)
-syscall_test(test = "//test/syscalls/linux:statfs_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:statfs_test",
+)
-syscall_test(test = "//test/syscalls/linux:stat_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:stat_test",
+)
-syscall_test(test = "//test/syscalls/linux:stat_times_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:stat_times_test",
+)
-syscall_test(test = "//test/syscalls/linux:sticky_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sticky_test",
+)
-syscall_test(test = "//test/syscalls/linux:symlink_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:symlink_test",
+)
-syscall_test(test = "//test/syscalls/linux:sync_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sync_test",
+)
-syscall_test(test = "//test/syscalls/linux:sync_file_range_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sync_file_range_test",
+)
syscall_test(test = "//test/syscalls/linux:sysinfo_test")
@@ -520,7 +622,10 @@ syscall_test(test = "//test/syscalls/linux:time_test")
syscall_test(test = "//test/syscalls/linux:tkill_test")
-syscall_test(test = "//test/syscalls/linux:truncate_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:truncate_test",
+)
syscall_test(test = "//test/syscalls/linux:udp_bind_test")
@@ -534,7 +639,10 @@ syscall_test(test = "//test/syscalls/linux:uidgid_test")
syscall_test(test = "//test/syscalls/linux:uname_test")
-syscall_test(test = "//test/syscalls/linux:unlink_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:unlink_test",
+)
syscall_test(test = "//test/syscalls/linux:unshare_test")
@@ -556,15 +664,13 @@ syscall_test(
test = "//test/syscalls/linux:wait_test",
)
-syscall_test(test = "//test/syscalls/linux:write_test")
-
syscall_test(
- test = "//test/syscalls/linux:proc_net_unix_test",
- # Unix domain socket creation isn't supported on all file systems. The
- # sentry-internal tmpfs is known to support it.
- use_tmpfs = True,
+ add_overlay = True,
+ test = "//test/syscalls/linux:write_test",
)
+syscall_test(test = "//test/syscalls/linux:proc_net_unix_test")
+
go_binary(
name = "syscall_test_runner",
srcs = ["syscall_test_runner.go"],
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index cd74a769d..9f2fc9109 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -7,6 +7,7 @@ def syscall_test(
shard_count = 1,
size = "small",
use_tmpfs = False,
+ add_overlay = False,
tags = None,
parallel = True):
_syscall_test(
@@ -39,6 +40,18 @@ def syscall_test(
parallel = parallel,
)
+ if add_overlay:
+ _syscall_test(
+ test = test,
+ shard_count = shard_count,
+ size = size,
+ platform = "ptrace",
+ use_tmpfs = False, # overlay adds a writable tmpfs on top of root.
+ tags = tags,
+ parallel = parallel,
+ overlay = True,
+ )
+
if not use_tmpfs:
# Also test shared gofer access.
_syscall_test(
@@ -60,7 +73,8 @@ def _syscall_test(
use_tmpfs,
tags,
parallel,
- file_access = "exclusive"):
+ file_access = "exclusive",
+ overlay = False):
test_name = test.split(":")[1]
# Prepend "runsc" to non-native platform names.
@@ -69,6 +83,8 @@ def _syscall_test(
name = test_name + "_" + full_platform
if file_access == "shared":
name += "_shared"
+ if overlay:
+ name += "_overlay"
if tags == None:
tags = []
@@ -92,6 +108,7 @@ def _syscall_test(
"--platform=" + platform,
"--use-tmpfs=" + str(use_tmpfs),
"--file-access=" + file_access,
+ "--overlay=" + str(overlay),
]
if parallel:
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 8465e5ad0..9bafc6e4f 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -999,6 +999,7 @@ cc_binary(
linkstatic = 1,
deps = [
"//test/util:cleanup",
+ "//test/util:memory_util",
"//test/util:test_main",
"//test/util:test_util",
"//test/util:thread_util",
@@ -1317,6 +1318,7 @@ cc_binary(
linkstatic = 1,
deps = [
"//test/util:capability_util",
+ "//test/util:cleanup",
"//test/util:multiprocess_util",
"//test/util:posix_error",
"//test/util:test_util",
@@ -2095,6 +2097,7 @@ cc_binary(
deps = [
":socket_generic_test_cases",
":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
":socket_unix_test_cases",
":unix_domain_socket_test_util",
"//test/util:test_main",
@@ -2368,6 +2371,7 @@ cc_binary(
deps = [
":socket_generic_test_cases",
":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
":socket_unix_test_cases",
":unix_domain_socket_test_util",
"//test/util:test_main",
@@ -2490,6 +2494,26 @@ cc_library(
)
cc_library(
+ name = "socket_unix_cmsg_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_unix_cmsg.cc",
+ ],
+ hdrs = [
+ "socket_unix_cmsg.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_googletest//:gtest",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
name = "socket_stream_blocking_test_cases",
testonly = 1,
srcs = [
@@ -2614,22 +2638,6 @@ cc_binary(
)
cc_binary(
- name = "socket_unix_abstract_test",
- testonly = 1,
- srcs = [
- "socket_unix_abstract.cc",
- ],
- linkstatic = 1,
- deps = [
- ":socket_test_util",
- ":socket_unix_test_cases",
- ":unix_domain_socket_test_util",
- "//test/util:test_main",
- "//test/util:test_util",
- ],
-)
-
-cc_binary(
name = "socket_unix_unbound_dgram_test",
testonly = 1,
srcs = ["socket_unix_unbound_dgram.cc"],
@@ -2672,23 +2680,6 @@ cc_binary(
)
cc_binary(
- name = "socket_unix_filesystem_test",
- testonly = 1,
- srcs = [
- "socket_unix_filesystem.cc",
- ],
- linkstatic = 1,
- deps = [
- ":socket_test_util",
- ":socket_unix_test_cases",
- ":unix_domain_socket_test_util",
- "//test/util:test_main",
- "//test/util:test_util",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_binary(
name = "socket_blocking_local_test",
testonly = 1,
srcs = [
@@ -2765,6 +2756,7 @@ cc_binary(
linkstatic = 1,
deps = [
":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
":socket_unix_test_cases",
":unix_domain_socket_test_util",
"//test/util:test_main",
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index 56377feab..1122ea240 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -448,19 +448,7 @@ TEST_P(AllSocketPairTest, UnboundSenderAddr) {
RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0,
reinterpret_cast<sockaddr*>(&addr), &addr_len),
SyscallSucceedsWithValue(sizeof(i)));
- if (!IsRunningOnGvisor()) {
- // Linux returns a zero length for addresses from recvfrom(2) and
- // recvmsg(2). This differs from the behavior of getpeername(2) and
- // getsockname(2). For simplicity, we use the getpeername(2) and
- // getsockname(2) behavior for recvfrom(2) and recvmsg(2).
- EXPECT_EQ(addr_len, 0);
- return;
- }
- EXPECT_EQ(addr_len, 2);
- EXPECT_EQ(
- memcmp(&addr, sockets->second_addr(),
- std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
- 0);
+ EXPECT_EQ(addr_len, 0);
}
TEST_P(AllSocketPairTest, BoundSenderAddr) {
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
index 4ac4cb88f..9d5f47651 100644
--- a/test/syscalls/linux/mempolicy.cc
+++ b/test/syscalls/linux/mempolicy.cc
@@ -18,6 +18,7 @@
#include "gtest/gtest.h"
#include "absl/memory/memory.h"
#include "test/util/cleanup.h"
+#include "test/util/memory_util.h"
#include "test/util/test_util.h"
#include "test/util/thread_util.h"
@@ -34,7 +35,7 @@ namespace {
#define MPOL_PREFERRED 1
#define MPOL_BIND 2
#define MPOL_INTERLEAVE 3
-#define MPOL_MAX MPOL_INTERLEAVE
+#define MPOL_LOCAL 4
#define MPOL_F_NODE (1 << 0)
#define MPOL_F_ADDR (1 << 1)
#define MPOL_F_MEMS_ALLOWED (1 << 2)
@@ -44,11 +45,17 @@ namespace {
int get_mempolicy(int *policy, uint64_t *nmask, uint64_t maxnode, void *addr,
int flags) {
- return syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr, flags);
+ return syscall(SYS_get_mempolicy, policy, nmask, maxnode, addr, flags);
}
int set_mempolicy(int mode, uint64_t *nmask, uint64_t maxnode) {
- return syscall(__NR_set_mempolicy, mode, nmask, maxnode);
+ return syscall(SYS_set_mempolicy, mode, nmask, maxnode);
+}
+
+int mbind(void *addr, unsigned long len, int mode,
+ const unsigned long *nodemask, unsigned long maxnode,
+ unsigned flags) {
+ return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
}
// Creates a cleanup object that resets the calling thread's mempolicy to the
@@ -252,6 +259,30 @@ TEST(MempolicyTest, GetMempolicyNextInterleaveNode) {
EXPECT_EQ(0, mode);
}
+TEST(MempolicyTest, Mbind) {
+ // Temporarily set the thread policy to MPOL_PREFERRED.
+ const auto cleanup_thread_policy =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(MPOL_PREFERRED, nullptr, 0));
+
+ const auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS));
+
+ // vmas default to MPOL_DEFAULT irrespective of the thread policy (currently
+ // MPOL_PREFERRED).
+ int mode;
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, mapping.ptr(), MPOL_F_ADDR),
+ SyscallSucceeds());
+ EXPECT_EQ(mode, MPOL_DEFAULT);
+
+ // Set MPOL_PREFERRED for the vma and read it back.
+ ASSERT_THAT(
+ mbind(mapping.ptr(), mapping.len(), MPOL_PREFERRED, nullptr, 0, 0),
+ SyscallSucceeds());
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, mapping.ptr(), MPOL_F_ADDR),
+ SyscallSucceeds());
+ EXPECT_EQ(mode, MPOL_PREFERRED);
+}
+
} // namespace
} // namespace testing
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index bce351e08..67b93ecf5 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -55,7 +55,7 @@ class PipeTest : public ::testing::TestWithParam<PipeCreator> {
FileDescriptor wfd;
public:
- static void SetUpTestCase() {
+ static void SetUpTestSuite() {
// Tests intentionally generate SIGPIPE.
TEST_PCHECK(signal(SIGPIPE, SIG_IGN) != SIG_ERR);
}
@@ -82,7 +82,7 @@ class PipeTest : public ::testing::TestWithParam<PipeCreator> {
return s1;
}
- static void TearDownTestCase() {
+ static void TearDownTestSuite() {
TEST_PCHECK(signal(SIGPIPE, SIG_DFL) != SIG_ERR);
}
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc
index bce42dc74..bd1779557 100644
--- a/test/syscalls/linux/prctl.cc
+++ b/test/syscalls/linux/prctl.cc
@@ -17,10 +17,12 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
+
#include <string>
#include "gtest/gtest.h"
#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
#include "test/util/multiprocess_util.h"
#include "test/util/posix_error.h"
#include "test/util/test_util.h"
@@ -35,6 +37,16 @@ namespace testing {
namespace {
+#ifndef SUID_DUMP_DISABLE
+#define SUID_DUMP_DISABLE 0
+#endif /* SUID_DUMP_DISABLE */
+#ifndef SUID_DUMP_USER
+#define SUID_DUMP_USER 1
+#endif /* SUID_DUMP_USER */
+#ifndef SUID_DUMP_ROOT
+#define SUID_DUMP_ROOT 2
+#endif /* SUID_DUMP_ROOT */
+
TEST(PrctlTest, NameInitialized) {
const size_t name_length = 20;
char name[name_length] = {};
@@ -178,6 +190,28 @@ TEST(PrctlTest, InvalidPrSetMM) {
ASSERT_THAT(prctl(PR_SET_MM, 0, 0, 0, 0), SyscallFailsWithErrno(EPERM));
}
+// Sanity check that dumpability is remembered.
+TEST(PrctlTest, SetGetDumpability) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_DISABLE), SyscallSucceeds());
+ EXPECT_THAT(prctl(PR_GET_DUMPABLE),
+ SyscallSucceedsWithValue(SUID_DUMP_DISABLE));
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_USER), SyscallSucceeds());
+ EXPECT_THAT(prctl(PR_GET_DUMPABLE), SyscallSucceedsWithValue(SUID_DUMP_USER));
+}
+
+// SUID_DUMP_ROOT cannot be set via PR_SET_DUMPABLE.
+TEST(PrctlTest, RootDumpability) {
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_ROOT),
+ SyscallFailsWithErrno(EINVAL));
+}
+
} // namespace
} // namespace testing
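
The dumpability tests above round-trip PR_SET_DUMPABLE/PR_GET_DUMPABLE; the same check can be expressed directly against the raw prctl(2) syscall. A minimal sketch (constants defined locally, as the test does, and assuming a Linux host):

package main

import (
	"fmt"
	"syscall"
)

// From <linux/prctl.h>; defined locally in case they are not exported.
const (
	prGetDumpable = 3 // PR_GET_DUMPABLE
	prSetDumpable = 4 // PR_SET_DUMPABLE
	suidDumpUser  = 1 // SUID_DUMP_USER
)

func main() {
	if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, prSetDumpable, suidDumpUser, 0); errno != 0 {
		fmt.Println("PR_SET_DUMPABLE failed:", errno)
		return
	}
	got, _, errno := syscall.Syscall(syscall.SYS_PRCTL, prGetDumpable, 0, 0)
	if errno != 0 {
		fmt.Println("PR_GET_DUMPABLE failed:", errno)
		return
	}
	fmt.Println("dumpable =", got) // expect 1, i.e. SUID_DUMP_USER is remembered
}
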
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index ede6fb860..924b98e3a 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -69,9 +69,11 @@
// way to get it tested on both gVisor, PTrace and Linux.
using ::testing::AllOf;
+using ::testing::AnyOf;
using ::testing::ContainerEq;
using ::testing::Contains;
using ::testing::ContainsRegex;
+using ::testing::Eq;
using ::testing::Gt;
using ::testing::HasSubstr;
using ::testing::IsSupersetOf;
@@ -86,6 +88,16 @@ namespace gvisor {
namespace testing {
namespace {
+#ifndef SUID_DUMP_DISABLE
+#define SUID_DUMP_DISABLE 0
+#endif /* SUID_DUMP_DISABLE */
+#ifndef SUID_DUMP_USER
+#define SUID_DUMP_USER 1
+#endif /* SUID_DUMP_USER */
+#ifndef SUID_DUMP_ROOT
+#define SUID_DUMP_ROOT 2
+#endif /* SUID_DUMP_ROOT */
+
// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
// because "it isn't needed", even though Linux can return it via F_GETFL.
constexpr int kOLargeFile = 00100000;
@@ -1896,6 +1908,51 @@ void CheckDuplicatesRecursively(std::string path) {
TEST(Proc, NoDuplicates) { CheckDuplicatesRecursively("/proc"); }
+// Most /proc/PID files are owned by the task user with SUID_DUMP_USER.
+TEST(ProcPid, UserDumpableOwner) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_USER), SyscallSucceeds());
+
+ // This applies to the task directory itself and files inside.
+ struct stat st;
+ ASSERT_THAT(stat("/proc/self/", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+
+ ASSERT_THAT(stat("/proc/self/stat", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+}
+
+// /proc/PID files are owned by root with SUID_DUMP_DISABLE.
+TEST(ProcPid, RootDumpableOwner) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_DISABLE), SyscallSucceeds());
+
+ // This *does not* apply to the task directory itself (or other 0555
+ // directories), but it does apply to files inside.
+ struct stat st;
+ ASSERT_THAT(stat("/proc/self/", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+
+ // This file is owned by root. Also allow nobody in case this test is running
+ // in a userns without root mapped.
+ ASSERT_THAT(stat("/proc/self/stat", &st), SyscallSucceeds());
+ EXPECT_THAT(st.st_uid, AnyOf(Eq(0), Eq(65534)));
+ EXPECT_THAT(st.st_gid, AnyOf(Eq(0), Eq(65534)));
+}
+
} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 6d745f728..82d325c17 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -34,6 +34,16 @@ using absl::StrFormat;
constexpr char kProcNetUnixHeader[] =
"Num RefCount Protocol Flags Type St Inode Path";
+// Possible values of the "st" field in a /proc/net/unix entry. Source: Linux
+// kernel, include/uapi/linux/net.h.
+enum {
+ SS_FREE = 0, // Not allocated
+ SS_UNCONNECTED, // Unconnected to any socket
+ SS_CONNECTING, // In process of connecting
+ SS_CONNECTED, // Connected to socket
+ SS_DISCONNECTING // In process of disconnecting
+};
+
// UnixEntry represents a single entry from /proc/net/unix.
struct UnixEntry {
uintptr_t addr;
@@ -71,7 +81,12 @@ PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() {
bool skipped_header = false;
std::vector<UnixEntry> entries;
std::vector<std::string> lines = absl::StrSplit(content, absl::ByAnyChar("\n"));
+ std::cerr << "<contents of /proc/net/unix>" << std::endl;
for (std::string line : lines) {
+ // Emit the proc entry to the test output to provide context for the test
+ // results.
+ std::cerr << line << std::endl;
+
if (!skipped_header) {
EXPECT_EQ(line, kProcNetUnixHeader);
skipped_header = true;
@@ -139,6 +154,7 @@ PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() {
entries.push_back(entry);
}
+ std::cerr << "<end of /proc/net/unix>" << std::endl;
return entries;
}
@@ -241,6 +257,168 @@ TEST(ProcNetUnix, SocketPair) {
EXPECT_EQ(entries.size(), 2);
}
+TEST(ProcNetUnix, StreamSocketStateUnconnectedOnBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, StreamSocketStateUnconnectedOnListen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry listen_entry;
+ ASSERT_TRUE(
+ FindByPath(entries, &listen_entry, ExtractPath(sockets->first_addr())));
+ EXPECT_EQ(listen_entry.state, SS_UNCONNECTED);
+ // The bind and listen entries should refer to the same socket.
+ EXPECT_EQ(listen_entry.inode, bind_entry.inode);
+}
+
+TEST(ProcNetUnix, StreamSocketStateConnectedOnAccept) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+ const std::string address = ExtractPath(sockets->first_addr());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry listen_entry;
+ ASSERT_TRUE(
+ FindByPath(entries, &listen_entry, ExtractPath(sockets->first_addr())));
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int clientfd;
+ ASSERT_THAT(clientfd = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+
+ // Find the entry for the accepted socket. UDS proc entries don't have a
+ // remote address, so we distinguish the accepted socket from the listen
+ // socket by checking for a different inode.
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry accept_entry;
+ ASSERT_TRUE(FindBy(
+ entries, &accept_entry, [address, listen_entry](const UnixEntry& e) {
+ return e.path == address && e.inode != listen_entry.inode;
+ }));
+ EXPECT_EQ(accept_entry.state, SS_CONNECTED);
+ // Listen entry should still be in SS_UNCONNECTED state.
+ ASSERT_TRUE(FindBy(entries, &listen_entry,
+ [&sockets, listen_entry](const UnixEntry& e) {
+ return e.path == ExtractPath(sockets->first_addr()) &&
+ e.inode == listen_entry.inode;
+ }));
+ EXPECT_EQ(listen_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, DgramSocketStateDisconnectingOnBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_DGRAM).Create());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // On gVisor, the only two UDS sockets on the system are the ones we just
+ // created, and we rely on this to locate the test socket entries in the
+ // remainder of the test. On a generic Linux system, we have no easy way to
+ // locate the corresponding entries, as they don't have an address yet.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ for (auto e : entries) {
+ ASSERT_EQ(e.state, SS_DISCONNECTING);
+ }
+ }
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, DgramSocketStateConnectingOnConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_DGRAM).Create());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // On gVisor, the only two UDS sockets on the system are the ones we just
+ // created, and we rely on this to locate the test socket entries in the
+ // remainder of the test. On a generic Linux system, we have no easy way to
+ // locate the corresponding entries, as they don't have an address yet.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ for (auto e : entries) {
+ ASSERT_EQ(e.state, SS_DISCONNECTING);
+ }
+ }
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // Once again, we have no easy way to identify the connecting socket as it has
+ // no listed address. We can only identify the entry as the "non-bind socket
+ // entry" on gVisor, where we're guaranteed to have only the two entries we
+ // create during this test.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ UnixEntry connect_entry;
+ ASSERT_TRUE(
+ FindBy(entries, &connect_entry, [bind_entry](const UnixEntry& e) {
+ return e.inode != bind_entry.inode;
+ }));
+ EXPECT_EQ(connect_entry.state, SS_CONNECTING);
+ }
+}
+
} // namespace
} // namespace testing
} // namespace gvisor
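
As an aside, the following standalone sketch shows how the "St" column of a /proc/net/unix entry maps onto the SS_* values introduced above. The sample line, the whitespace split, and the hex parse are illustrative assumptions; they are not the parsing logic used by ProcNetUnixEntries().

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // Hypothetical entry; real reference counts, inodes and paths will differ.
  // Columns: Num RefCount Protocol Flags Type St Inode Path
  const std::string line =
      "0000000000000000: 00000002 00000000 00010000 0001 01 12345 @test-socket";

  // Split on whitespace and pull out the sixth column ("St", printed in hex).
  std::istringstream in(line);
  std::vector<std::string> fields;
  for (std::string f; in >> f;) fields.push_back(f);
  const int state = std::stoi(fields[5], nullptr, 16);

  // Interpret it with the SS_* values from include/uapi/linux/net.h.
  switch (state) {
    case 1: std::cout << "SS_UNCONNECTED (bound or listening)\n"; break;
    case 2: std::cout << "SS_CONNECTING\n"; break;
    case 3: std::cout << "SS_CONNECTED (e.g. an accepted stream socket)\n"; break;
    default: std::cout << "state " << state << "\n"; break;
  }
  return 0;
}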
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 66adda515..1c56540bc 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -33,9 +33,69 @@ namespace gvisor {
namespace testing {
namespace {
+class SendFileTest : public ::testing::TestWithParam<int> {
+ protected:
+ PosixErrorOr<std::tuple<int, int>> Sockets() {
+ // Bind a server socket.
+ int family = GetParam();
+ struct sockaddr server_addr = {};
+ switch (family) {
+ case AF_INET: {
+ struct sockaddr_in *server_addr_in =
+ reinterpret_cast<struct sockaddr_in *>(&server_addr);
+ server_addr_in->sin_family = family;
+ server_addr_in->sin_addr.s_addr = INADDR_ANY;
+ break;
+ }
+ case AF_UNIX: {
+ struct sockaddr_un *server_addr_un =
+ reinterpret_cast<struct sockaddr_un *>(&server_addr);
+ server_addr_un->sun_family = family;
+ server_addr_un->sun_path[0] = '\0';
+ break;
+ }
+ default:
+ return PosixError(EINVAL);
+ }
+ int server = socket(family, SOCK_STREAM, 0);
+ if (bind(server, &server_addr, sizeof(server_addr)) < 0) {
+ return PosixError(errno);
+ }
+ if (listen(server, 1) < 0) {
+ close(server);
+ return PosixError(errno);
+ }
+
+ // Fetch the address; both are anonymous.
+ socklen_t length = sizeof(server_addr);
+ if (getsockname(server, &server_addr, &length) < 0) {
+ close(server);
+ return PosixError(errno);
+ }
+
+ // Connect the client.
+ int client = socket(family, SOCK_STREAM, 0);
+ if (connect(client, &server_addr, length) < 0) {
+ close(server);
+ close(client);
+ return PosixError(errno);
+ }
+
+ // Accept on the server.
+ int server_client = accept(server, nullptr, 0);
+ if (server_client < 0) {
+ close(server);
+ close(client);
+ return PosixError(errno);
+ }
+ close(server);
+ return std::make_tuple(client, server_client);
+ }
+};
+
// Sends large file to exercise the path that read and writes data multiple
// times, esp. when more data is read than can be written.
-TEST(SendFileTest, SendMultiple) {
+TEST_P(SendFileTest, SendMultiple) {
std::vector<char> data(5 * 1024 * 1024);
RandomizeBuffer(data.data(), data.size());
@@ -45,34 +105,20 @@ TEST(SendFileTest, SendMultiple) {
TempPath::kDefaultFileMode));
const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
- // Use a socket for target file to make the write window small.
- const FileDescriptor server(socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
- ASSERT_THAT(server.get(), SyscallSucceeds());
-
- struct sockaddr_in server_addr = {};
- server_addr.sin_family = AF_INET;
- server_addr.sin_addr.s_addr = INADDR_ANY;
- ASSERT_THAT(
- bind(server.get(), reinterpret_cast<struct sockaddr *>(&server_addr),
- sizeof(server_addr)),
- SyscallSucceeds());
- ASSERT_THAT(listen(server.get(), 1), SyscallSucceeds());
+ // Create sockets.
+ std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
+ const FileDescriptor server(std::get<0>(fds));
+ FileDescriptor client(std::get<1>(fds)); // non-const, reset is used.
// Thread that reads data from socket and dumps to a file.
- ScopedThread th([&server, &out_file, &server_addr] {
- socklen_t addrlen = sizeof(server_addr);
- const FileDescriptor fd(RetryEINTR(accept)(
- server.get(), reinterpret_cast<struct sockaddr *>(&server_addr),
- &addrlen));
- ASSERT_THAT(fd.get(), SyscallSucceeds());
-
+ ScopedThread th([&] {
FileDescriptor outf =
ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
// Read until socket is closed.
char buf[10240];
for (int cnt = 0;; cnt++) {
- int r = RetryEINTR(read)(fd.get(), buf, sizeof(buf));
+ int r = RetryEINTR(read)(server.get(), buf, sizeof(buf));
// We cannot afford to save on every read() call.
if (cnt % 1000 == 0) {
ASSERT_THAT(r, SyscallSucceeds());
@@ -99,25 +145,6 @@ TEST(SendFileTest, SendMultiple) {
const FileDescriptor inf =
ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
- FileDescriptor outf(socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
- ASSERT_THAT(outf.get(), SyscallSucceeds());
-
- // Get the port bound by the listening socket.
- socklen_t addrlen = sizeof(server_addr);
- ASSERT_THAT(getsockname(server.get(),
- reinterpret_cast<sockaddr *>(&server_addr), &addrlen),
- SyscallSucceeds());
-
- struct sockaddr_in addr = {};
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = inet_addr("127.0.0.1");
- addr.sin_port = server_addr.sin_port;
- std::cout << "Connecting on port=" << server_addr.sin_port;
- ASSERT_THAT(
- RetryEINTR(connect)(
- outf.get(), reinterpret_cast<struct sockaddr *>(&addr), sizeof(addr)),
- SyscallSucceeds());
-
int cnt = 0;
for (size_t sent = 0; sent < data.size(); cnt++) {
const size_t remain = data.size() - sent;
@@ -125,7 +152,7 @@ TEST(SendFileTest, SendMultiple) {
<< ", remain=" << remain;
// Send data and verify that sendfile returns the correct value.
- int res = sendfile(outf.get(), inf.get(), nullptr, remain);
+ int res = sendfile(client.get(), inf.get(), nullptr, remain);
// We cannot afford to save on every sendfile() call.
if (cnt % 120 == 0) {
MaybeSave();
@@ -142,17 +169,74 @@ TEST(SendFileTest, SendMultiple) {
}
// Close socket to stop thread.
- outf.reset();
+ client.reset();
th.Join();
// Verify that the output file has the correct data.
- outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
std::vector<char> actual(data.size(), '\0');
ASSERT_THAT(RetryEINTR(read)(outf.get(), actual.data(), actual.size()),
SyscallSucceedsWithValue(actual.size()));
ASSERT_EQ(memcmp(data.data(), actual.data(), data.size()), 0);
}
+TEST_P(SendFileTest, Shutdown) {
+ // Create a socket.
+ std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
+ const FileDescriptor client(std::get<0>(fds));
+ FileDescriptor server(std::get<1>(fds)); // non-const, released below.
+
+ // If this is a TCP socket, then turn off linger.
+ if (GetParam() == AF_INET) {
+ struct linger sl;
+ sl.l_onoff = 1;
+ sl.l_linger = 0;
+ ASSERT_THAT(
+ setsockopt(server.get(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+ SyscallSucceeds());
+ }
+
+ // Create a 1m file with random data.
+ std::vector<char> data(1024 * 1024);
+ RandomizeBuffer(data.data(), data.size());
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::string_view(data.data(), data.size()),
+ TempPath::kDefaultFileMode));
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Read some data, then shut down the socket. We don't actually care about
+ // checking the contents (other tests do that), so we just re-use the same
+ // buffer as above.
+ ScopedThread t([&]() {
+ int done = 0;
+ while (done < data.size()) {
+ int n = read(server.get(), data.data(), data.size());
+ ASSERT_THAT(n, SyscallSucceeds());
+ done += n;
+ }
+ // Close the server side socket.
+ ASSERT_THAT(close(server.release()), SyscallSucceeds());
+ });
+
+ // Continuously stream from the file to the socket. Note we do not assert
+ // that a specific amount of data has been written at any time, just that some
+ // data is written. Eventually, we should get a connection reset error.
+ while (1) {
+ off_t offset = 0; // Always read from the start.
+ int n = sendfile(client.get(), inf.get(), &offset, data.size());
+ EXPECT_THAT(n, AnyOf(SyscallFailsWithErrno(ECONNRESET),
+ SyscallFailsWithErrno(EPIPE), SyscallSucceeds()));
+ if (n <= 0) {
+ break;
+ }
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(AddressFamily, SendFileTest,
+ ::testing::Values(AF_UNIX, AF_INET));
+
} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc
index 2faf678f7..715d87b76 100644
--- a/test/syscalls/linux/socket_abstract.cc
+++ b/test/syscalls/linux/socket_abstract.cc
@@ -17,6 +17,7 @@
#include "test/syscalls/linux/socket_generic.h"
#include "test/syscalls/linux/socket_test_util.h"
#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
#include "test/syscalls/linux/unix_domain_socket_test_util.h"
#include "test/util/test_util.h"
@@ -31,11 +32,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, AllSocketPairTest,
+ AbstractUnixSockets, AllSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixSocketPairTest,
+ AbstractUnixSockets, UnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ AbstractUnixSockets, UnixSocketPairCmsgTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc
index f7cb72df4..74e262959 100644
--- a/test/syscalls/linux/socket_filesystem.cc
+++ b/test/syscalls/linux/socket_filesystem.cc
@@ -17,6 +17,7 @@
#include "test/syscalls/linux/socket_generic.h"
#include "test/syscalls/linux/socket_test_util.h"
#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
#include "test/syscalls/linux/unix_domain_socket_test_util.h"
#include "test/util/test_util.h"
@@ -31,11 +32,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, AllSocketPairTest,
+ FilesystemUnixSockets, AllSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixSocketPairTest,
+ FilesystemUnixSockets, UnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ FilesystemUnixSockets, UnixSocketPairCmsgTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index b216d14cb..df31d25b5 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -14,6 +14,7 @@
#include <arpa/inet.h>
#include <netinet/in.h>
+#include <poll.h>
#include <string.h>
#include <sys/socket.h>
@@ -144,6 +145,66 @@ TEST_P(SocketInetLoopbackTest, TCP) {
ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
}
+TEST_P(SocketInetLoopbackTest, TCPbacklog) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), 2), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ int i = 0;
+ while (1) {
+ int ret;
+
+ // Connect to the listening socket.
+ const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ret = connect(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret != 0) {
+ EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+ struct pollfd pfd = {
+ .fd = conn_fd.get(),
+ .events = POLLOUT,
+ };
+ ret = poll(&pfd, 1, 3000);
+ if (ret == 0) break;
+ EXPECT_THAT(ret, SyscallSucceedsWithValue(1));
+ }
+ EXPECT_THAT(RetryEINTR(send)(conn_fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+ ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
+ i++;
+ }
+
+ for (; i != 0; i--) {
+ // Accept the connection.
+ //
+ // We have to assign a name to the accepted socket, as unnamed temporary
+ // objects are destroyed upon full evaluation of the expression they appear
+ // in, potentially causing the connecting socket to fail to shut down
+ // properly.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ }
+}
+
INSTANTIATE_TEST_SUITE_P(
All, SocketInetLoopbackTest,
::testing::Values(
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc
index d7fc20aad..d7fc9715b 100644
--- a/test/syscalls/linux/socket_ip_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc
@@ -39,7 +39,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingSocketPairTest,
+ BlockingIPSockets, BlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 5b198f49d..0b76280a7 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -592,5 +592,109 @@ TEST_P(TCPSocketPairTest, MsgTruncMsgPeek) {
EXPECT_EQ(0, memcmp(received_data2, sent_data, sizeof(sent_data)));
}
+TEST_P(TCPSocketPairTest, SetCongestionControlSucceedsForSupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ // Netstack only supports reno & cubic so we only test these two values here.
+ {
+ const char kSetCC[kTcpCaNameMax] = "reno";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+ {
+ const char kSetCC[kTcpCaNameMax] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetGetTCPCongestionShortReadBuffer) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ {
+ // Verify that getsockopt/setsockopt work with buffers smaller than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[sizeof(kSetCC)];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(got_cc)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetGetTCPCongestionLargeReadBuffer) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ {
+ // Verify that getsockopt works with buffers larger than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax + 5];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ // Linux copies the minimum of kTcpCaNameMax and the length of the passed-in
+ // buffer, and sets optlen to the number of bytes actually copied,
+ // irrespective of the actual length of the congestion control name.
+ EXPECT_EQ(kTcpCaNameMax, optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char old_cc[kTcpCaNameMax];
+ socklen_t optlen = sizeof(old_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &old_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+
+ const char kSetCC[] = "invalid_ca_cc";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallFailsWithErrno(ENOENT));
+
+ char got_cc[kTcpCaNameMax];
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(old_cc)));
+}
+
} // namespace testing
} // namespace gvisor
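
To make the truncation behaviour that SetGetTCPCongestionLargeReadBuffer relies on easier to see in isolation, here is a minimal sketch that reads TCP_CONGESTION into an oversized buffer. The hard-coded limit of 16 mirrors the kTcpCaNameMax constant used above and is an assumption about Linux's TCP_CA_NAME_MAX; error handling is reduced to perror.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  const socklen_t kTcpCaNameMax = 16;  // Assumed value of TCP_CA_NAME_MAX.

  int fd = socket(AF_INET, SOCK_STREAM, 0);
  if (fd < 0) { perror("socket"); return 1; }

  // Ask for the current congestion control algorithm with a buffer that is
  // deliberately larger than the name limit.
  char cc[64] = {};
  socklen_t optlen = sizeof(cc);
  if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, &optlen) < 0) {
    perror("getsockopt(TCP_CONGESTION)");
  } else {
    // optlen comes back clamped to at most the name limit, not sizeof(cc).
    printf("cc=%s optlen=%u (expected <= %u)\n", cc, optlen, kTcpCaNameMax);
  }
  close(fd);
  return 0;
}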
diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
index 2c6ae17bf..0dc274e2d 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
@@ -35,7 +35,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, TCPSocketPairTest,
+ AllTCPSockets, TCPSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
index d1ea8ef12..cd3ad97d0 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
@@ -35,7 +35,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingStreamSocketPairTest,
+ BlockingTCPSockets, BlockingStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
index 96c1b3b3d..1acdecc17 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
@@ -34,7 +34,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingTCPSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
index 251817a9f..de63f79d9 100644
--- a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
@@ -69,7 +69,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllTCPSockets, TcpUdpSocketPairTest,
+ AllIPSockets, TcpUdpSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace
diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc
index fc124e9ef..1df74a348 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback.cc
@@ -33,15 +33,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, AllSocketPairTest,
+ AllUDPSockets, AllSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonStreamSocketPairTest,
+ AllUDPSockets, NonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- UDPSockets, UDPSocketPairTest,
+ AllUDPSockets, UDPSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
index 1c3d1c0ad..1e259efa7 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingNonStreamSocketPairTest,
+ BlockingUDPSockets, BlockingNonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
index 7554b08d5..74cbd326d 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingUDPSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
index 040bb176e..92f03e045 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
@@ -28,7 +28,7 @@ std::vector<SocketKind> GetSockets() {
AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK}));
}
-INSTANTIATE_TEST_SUITE_P(IPv4TCPSockets,
+INSTANTIATE_TEST_SUITE_P(IPv4TCPUnboundSockets,
IPv4TCPUnboundExternalNetworkingSocketTest,
::testing::ValuesIn(GetSockets()));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 53dcd58cd..6b92e05aa 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -559,5 +559,134 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
}
+// Check that two sockets can join the same multicast group at the same time,
+// and both will receive data on it.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastToTwo) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ std::unique_ptr<FileDescriptor> receivers[2] = {
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket())};
+
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ auto receiver_addr = V4Any();
+ int bound_port = 0;
+ for (auto& receiver : receivers) {
+ ASSERT_THAT(setsockopt(receiver->get(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ // Bind the receiver to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ // On the first iteration, save the port we are bound to. On the second
+ // iteration, verify the port is the same as the one from the first
+ // iteration. In other words, both sockets listen on the same port.
+ if (bound_port == 0) {
+ bound_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ } else {
+ EXPECT_EQ(bound_port,
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port);
+ }
+
+ // Register to receive multicast packets.
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &group, sizeof(group)),
+ SyscallSucceeds());
+ }
+
+ // Send a multicast packet to the group and verify both receivers get it.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port = bound_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+ for (auto& receiver : receivers) {
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+ }
+}
+
+// Check that when receiving a looped-back multicast packet, its source address
+// is not a multicast address.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ IpMulticastLoopbackFromAddr) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ int receiver_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Connect to the multicast address. This binds us to the outgoing interface
+ // and allows us to get its IP (to be compared against the src-IP on the
+ // receiver side).
+ auto sendto_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port = receiver_port;
+ ASSERT_THAT(RetryEINTR(connect)(
+ sender->get(), reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceeds());
+ TestAddress sender_addr("");
+ ASSERT_THAT(
+ getsockname(sender->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ &sender_addr.addr_len),
+ SyscallSucceeds());
+ ASSERT_EQ(sizeof(struct sockaddr_in), sender_addr.addr_len);
+ sockaddr_in* sender_addr_in =
+ reinterpret_cast<sockaddr_in*>(&sender_addr.addr);
+
+ // Send a multicast packet.
+ char send_buf[4] = {};
+ ASSERT_THAT(RetryEINTR(send)(sender->get(), send_buf, sizeof(send_buf), 0),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Receive a multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ TestAddress src_addr("");
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(receiver->get(), recv_buf, sizeof(recv_buf), 0,
+ reinterpret_cast<sockaddr*>(&src_addr.addr),
+ &src_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ ASSERT_EQ(sizeof(struct sockaddr_in), src_addr.addr_len);
+ sockaddr_in* src_addr_in = reinterpret_cast<sockaddr_in*>(&src_addr.addr);
+
+ // Verify that the received source IP:port matches the sender one.
+ EXPECT_EQ(sender_addr_in->sin_port, src_addr_in->sin_port);
+ EXPECT_EQ(sender_addr_in->sin_addr.s_addr, src_addr_in->sin_addr.s_addr);
+}
+
} // namespace testing
} // namespace gvisor
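
The two multicast tests above both depend on the same receiver setup, so a compact standalone sketch of it may help: enable SO_REUSEPORT so a second socket can share the port, bind to the IPv4 any-address, then join the group with IP_ADD_MEMBERSHIP. The group address 224.0.2.1 and the kernel-chosen port are illustrative assumptions; the tests use their own kMulticastAddress constant and the V4Any()/V4Multicast() helpers.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  int fd = socket(AF_INET, SOCK_DGRAM, 0);
  if (fd < 0) { perror("socket"); return 1; }

  // Allow another receiver to bind the same address and port.
  int on = 1;
  if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)) < 0)
    perror("setsockopt(SO_REUSEPORT)");

  // Bind to the IPv4 any-address; port 0 lets the kernel pick one.
  struct sockaddr_in addr = {};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_ANY);
  addr.sin_port = htons(0);
  if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) perror("bind");

  // Join the (illustrative) multicast group on the default interface.
  struct ip_mreq group = {};
  group.imr_multiaddr.s_addr = inet_addr("224.0.2.1");
  group.imr_interface.s_addr = htonl(INADDR_ANY);
  if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)) < 0)
    perror("setsockopt(IP_ADD_MEMBERSHIP)");

  close(fd);
  return 0;
}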
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
index ffbb8e6eb..9d4e1ab97 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
@@ -28,7 +28,7 @@ std::vector<SocketKind> GetSockets() {
AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK}));
}
-INSTANTIATE_TEST_SUITE_P(IPv4UDPSockets,
+INSTANTIATE_TEST_SUITE_P(IPv4UDPUnboundSockets,
IPv4UDPUnboundExternalNetworkingSocketTest,
::testing::ValuesIn(GetSockets()));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 95cf8d2a3..875f0391f 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -32,6 +32,9 @@
#include "test/util/test_util.h"
#include "test/util/thread_util.h"
+// This file contains tests specific to Unix domain sockets. It does not contain
+// tests for UDS control messages. Those belong in socket_unix_cmsg.cc.
+//
// This file is a generic socket test file. It must be built with another file
// that provides the test types.
@@ -40,1430 +43,6 @@ namespace testing {
namespace {
-TEST_P(UnixSocketPairTest, BasicFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, BasicTwoFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair1 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair2 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- int sent_fds[] = {pair1->second_fd(), pair2->second_fd()};
-
- ASSERT_NO_FATAL_FAILURE(
- SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int received_fds[] = {-1, -1};
-
- ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 2,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, BasicThreeFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair1 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair2 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair3 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
-
- ASSERT_NO_FATAL_FAILURE(
- SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int received_fds[] = {-1, -1, -1};
-
- ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 3,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[2], pair3->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, BadFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- int sent_fd = -1;
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(sent_fd))];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- cmsg->cmsg_len = CMSG_LEN(sizeof(sent_fd));
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_RIGHTS;
- memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
-
- struct iovec iov;
- iov.iov_base = sent_data;
- iov.iov_len = sizeof(sent_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
- SyscallFailsWithErrno(EBADF));
-}
-
-// BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass.
-// The difference is that when calling recvmsg, no space for FDs is provided,
-// only space for the cmsg header.
-TEST_P(UnixSocketPairTest, BasicFDPassNoSpace) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
-
- struct msghdr msg = {};
- std::vector<char> control(CMSG_SPACE(0));
- msg.msg_control = &control[0];
- msg.msg_controllen = control.size();
-
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_controllen, 0);
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-// BasicFDPassNoSpaceMsgCtrunc sends an FD, but does not provide any space to
-// receive it. It then verifies that the MSG_CTRUNC flag is set in the msghdr.
-TEST_P(UnixSocketPairTest, BasicFDPassNoSpaceMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- std::vector<char> control(CMSG_SPACE(0));
- msg.msg_control = &control[0];
- msg.msg_controllen = control.size();
-
- char received_data[sizeof(sent_data)];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_controllen, 0);
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-}
-
-// BasicFDPassNullControlMsgCtrunc sends an FD and sets contradictory values for
-// msg_controllen and msg_control. msg_controllen is set to the correct size to
-// accomidate the FD, but msg_control is set to NULL. In this case, msg_control
-// should override msg_controllen.
-TEST_P(UnixSocketPairTest, BasicFDPassNullControlMsgCtrunc) {
- // FIXME(gvisor.dev/issue/207): Fix handling of NULL msg_control.
- SKIP_IF(IsRunningOnGvisor());
-
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- msg.msg_controllen = CMSG_SPACE(1);
-
- char received_data[sizeof(sent_data)];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_controllen, 0);
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-}
-
-// BasicFDPassNotEnoughSpaceMsgCtrunc sends an FD, but does not provide enough
-// space to receive it. It then verifies that the MSG_CTRUNC flag is set in the
-// msghdr.
-TEST_P(UnixSocketPairTest, BasicFDPassNotEnoughSpaceMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- std::vector<char> control(CMSG_SPACE(0) + 1);
- msg.msg_control = &control[0];
- msg.msg_controllen = control.size();
-
- char received_data[sizeof(sent_data)];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_controllen, 0);
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-}
-
-// BasicThreeFDPassTruncationMsgCtrunc sends three FDs, but only provides enough
-// space to receive two of them. It then verifies that the MSG_CTRUNC flag is
-// set in the msghdr.
-TEST_P(UnixSocketPairTest, BasicThreeFDPassTruncationMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair1 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair2 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair3 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
-
- ASSERT_NO_FATAL_FAILURE(
- SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- std::vector<char> control(CMSG_SPACE(2 * sizeof(int)));
- msg.msg_control = &control[0];
- msg.msg_controllen = control.size();
-
- char received_data[sizeof(sent_data)];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(2 * sizeof(int)));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-}
-
-// BasicFDPassUnalignedRecv starts off by sending a single FD just like
-// BasicFDPass. The difference is that when calling recvmsg, the length of the
-// receive data is only aligned on a 4 byte boundry instead of the normal 8.
-TEST_P(UnixSocketPairTest, BasicFDPassUnalignedRecv) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvSingleFDUnaligned(
- sockets->second_fd(), &fd, received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-// BasicFDPassUnalignedRecvNoMsgTrunc sends one FD and only provides enough
-// space to receive just it. (Normally the minimum amount of space one would
-// provide would be enough space for two FDs.) It then verifies that the
-// MSG_CTRUNC flag is not set in the msghdr.
-TEST_P(UnixSocketPairTest, BasicFDPassUnalignedRecvNoMsgTrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_flags, 0);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-}
-
-// BasicTwoFDPassUnalignedRecvTruncationMsgTrunc sends two FDs, but only
-// provides enough space to receive one of them. It then verifies that the
-// MSG_CTRUNC flag is set in the msghdr.
-TEST_P(UnixSocketPairTest, BasicTwoFDPassUnalignedRecvTruncationMsgTrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- int sent_fds[] = {pair->first_fd(), pair->second_fd()};
-
- ASSERT_NO_FATAL_FAILURE(
- SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- // CMSG_SPACE rounds up to two FDs, we only want one.
- char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-}
-
-TEST_P(UnixSocketPairTest, ConcurrentBasicFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- int sockfd1 = sockets->first_fd();
- auto recv_func = [sockfd1, sent_data]() {
- char received_data[20];
- int fd = -1;
- RecvSingleFD(sockfd1, &fd, received_data, sizeof(received_data));
- ASSERT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
- char buf[20];
- ASSERT_THAT(ReadFd(fd, buf, sizeof(buf)),
- SyscallSucceedsWithValue(sizeof(buf)));
- ASSERT_THAT(WriteFd(fd, buf, sizeof(buf)),
- SyscallSucceedsWithValue(sizeof(buf)));
- };
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->second_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- ScopedThread t(recv_func);
-
- RandomizeBuffer(sent_data, sizeof(sent_data));
- ASSERT_THAT(WriteFd(pair->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- char received_data[20];
- ASSERT_THAT(ReadFd(pair->first_fd(), received_data, sizeof(received_data)),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- t.Join();
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-// FDPassNoRecv checks that the control message can be safely ignored by using
-// read(2) instead of recvmsg(2).
-TEST_P(UnixSocketPairTest, FDPassNoRecv) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- // Read while ignoring the passed FD.
- char received_data[20];
- ASSERT_THAT(
- ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- // Check that the socket still works for reads and writes.
- ASSERT_NO_FATAL_FAILURE(
- TransferTest(sockets->first_fd(), sockets->second_fd()));
-}
-
-// FDPassInterspersed1 checks that sent control messages cannot be read before
-// their associated data has been read.
-TEST_P(UnixSocketPairTest, FDPassInterspersed1) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char written_data[20];
- RandomizeBuffer(written_data, sizeof(written_data));
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
- SyscallSucceedsWithValue(sizeof(written_data)));
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- // Check that we don't get a control message, but do get the data.
- char received_data[20];
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data));
- EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
-}
-
-// FDPassInterspersed2 checks that sent control messages cannot be read after
-// their assocated data has been read while ignoring the control message by
-// using read(2) instead of recvmsg(2).
-TEST_P(UnixSocketPairTest, FDPassInterspersed2) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char written_data[20];
- RandomizeBuffer(written_data, sizeof(written_data));
- ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
- SyscallSucceedsWithValue(sizeof(written_data)));
-
- char received_data[20];
- ASSERT_THAT(
- ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
- EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
-}
-
-TEST_P(UnixSocketPairTest, FDPassNotCoalesced) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data1[20];
- RandomizeBuffer(sent_data1, sizeof(sent_data1));
-
- auto pair1 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(),
- sent_data1, sizeof(sent_data1)));
-
- char sent_data2[20];
- RandomizeBuffer(sent_data2, sizeof(sent_data2));
-
- auto pair2 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(),
- sent_data2, sizeof(sent_data2)));
-
- char received_data1[sizeof(sent_data1) + sizeof(sent_data2)];
- int received_fd1 = -1;
-
- RecvSingleFD(sockets->second_fd(), &received_fd1, received_data1,
- sizeof(received_data1), sizeof(sent_data1));
-
- EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1)));
- TransferTest(pair1->first_fd(), pair1->second_fd());
-
- char received_data2[sizeof(sent_data1) + sizeof(sent_data2)];
- int received_fd2 = -1;
-
- RecvSingleFD(sockets->second_fd(), &received_fd2, received_data2,
- sizeof(received_data2), sizeof(sent_data2));
-
- EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2)));
- TransferTest(pair2->first_fd(), pair2->second_fd());
-}
-
-TEST_P(UnixSocketPairTest, FDPassPeek) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char peek_data[20];
- int peek_fd = -1;
- PeekSingleFD(sockets->second_fd(), &peek_fd, peek_data, sizeof(peek_data));
- EXPECT_EQ(0, memcmp(sent_data, peek_data, sizeof(sent_data)));
- TransferTest(peek_fd, pair->first_fd());
- EXPECT_THAT(close(peek_fd), SyscallSucceeds());
-
- char received_data[20];
- int received_fd = -1;
- RecvSingleFD(sockets->second_fd(), &received_fd, received_data,
- sizeof(received_data));
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
- TransferTest(received_fd, pair->first_fd());
- EXPECT_THAT(close(received_fd), SyscallSucceeds());
-}
-
-TEST_P(UnixSocketPairTest, BasicCredPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
- EXPECT_EQ(sent_creds.pid, received_creds.pid);
- EXPECT_EQ(sent_creds.uid, received_creds.uid);
- EXPECT_EQ(sent_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredRecvEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds {
- 0, 65534, 65534
- };
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsAfterSoPassCredRecvEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- SetSoPassCred(sockets->second_fd());
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- char received_data[20];
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->first_fd());
-
- char received_data[20];
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsAfterSoPassCredSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- SetSoPassCred(sockets->first_fd());
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- char received_data[20];
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredRecvEndAfterSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- SetSoPassCred(sockets->first_fd());
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredRecvEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
-
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds {
- 0, 65534, 65534
- };
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, WriteAfterSoPassCredRecvEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- SetSoPassCred(sockets->second_fd());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- char received_data[20];
-
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- SetSoPassCred(sockets->first_fd());
-
- char received_data[20];
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, WriteAfterSoPassCredSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- SetSoPassCred(sockets->first_fd());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- char received_data[20];
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredRecvEndAfterSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- SetSoPassCred(sockets->first_fd());
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
-
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, CredPassTruncated) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(0) + sizeof(pid_t)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-
- pid_t pid = 0;
- memcpy(&pid, CMSG_DATA(cmsg), sizeof(pid));
- EXPECT_EQ(pid, sent_creds.pid);
-}
-
-// CredPassNoMsgCtrunc passes a full set of credentials. It then verifies that
-// receiving the full set does not result in MSG_CTRUNC being set in the msghdr.
-TEST_P(UnixSocketPairTest, CredPassNoMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(struct ucred))];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- // The control message should not be truncated.
- EXPECT_EQ(msg.msg_flags, 0);
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct ucred)));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-}
-
-// CredPassNoSpaceMsgCtrunc passes a full set of credentials. It then receives
-// the data without providing space for any credentials and verifies that
-// MSG_CTRUNC is set in the msghdr.
-TEST_P(UnixSocketPairTest, CredPassNoSpaceMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(0)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- // The control message should be truncated.
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-}
-
-// CredPassTruncatedMsgCtrunc passes a full set of credentials. It then receives
-// the data while providing enough space for only the first field of the
-// credentials and verifies that MSG_CTRUNC is set in the msghdr.
-TEST_P(UnixSocketPairTest, CredPassTruncatedMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(0) + sizeof(pid_t)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- // The control message should be truncated.
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-}
-
-TEST_P(UnixSocketPairTest, SoPassCred) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- int opt;
- socklen_t optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-
- SetSoPassCred(sockets->first_fd());
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_TRUE(opt);
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-
- int zero = 0;
- EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &zero,
- sizeof(zero)),
- SyscallSucceeds());
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-}
-
-TEST_P(UnixSocketPairTest, NoDataCredPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct msghdr msg = {};
-
- struct iovec iov;
- iov.iov_base = sent_data;
- iov.iov_len = sizeof(sent_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- char control[CMSG_SPACE(0)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_CREDENTIALS;
- cmsg->cmsg_len = CMSG_LEN(0);
-
- ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
- SyscallFailsWithErrno(EINVAL));
-}
-
-TEST_P(UnixSocketPairTest, NoPassCred) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- char received_data[20];
-
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, CredAndFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendCredsAndFD(sockets->first_fd(), sent_creds,
- pair->second_fd(), sent_data,
- sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
- &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- EXPECT_EQ(sent_creds.pid, received_creds.pid);
- EXPECT_EQ(sent_creds.uid, received_creds.uid);
- EXPECT_EQ(sent_creds.gid, received_creds.gid);
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, FDPassBeforeSoPassCred) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
- &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds {
- 0, 65534, 65534
- };
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, FDPassAfterSoPassCred) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- SetSoPassCred(sockets->second_fd());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
- struct ucred received_creds;
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
- &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, CloexecDroppedWhenFDPassed) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair = ASSERT_NO_ERRNO_AND_VALUE(
- UnixDomainSocketPair(SOCK_SEQPACKET | SOCK_CLOEXEC).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UnixSocketPairTest, CloexecRecvFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(int))];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- struct iovec iov;
- char received_data[20];
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CMSG_CLOEXEC),
- SyscallSucceedsWithValue(sizeof(received_data)));
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
- ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-
- int fd = -1;
- memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
-
- EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
-}
-
-TEST_P(UnixSocketPairTest, FDPassAfterSoPassCredWithoutCredSpace) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- SetSoPassCred(sockets->second_fd());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_LEN(0)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[20];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-}
-
-// This test will validate that MSG_CTRUNC as an input flag to recvmsg will
-// not appear as an output flag on the control message when truncation doesn't
-// happen.
-TEST_P(UnixSocketPairTest, MsgCtruncInputIsNoop) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(int)) /* we're passing a single fd */];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- struct iovec iov;
- char received_data[20];
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CTRUNC),
- SyscallSucceedsWithValue(sizeof(received_data)));
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
- ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-
- // Now we should verify that MSG_CTRUNC wasn't set as an output flag.
- EXPECT_EQ(msg.msg_flags & MSG_CTRUNC, 0);
-}
-
-TEST_P(UnixSocketPairTest, FDPassAfterSoPassCredWithoutCredHeaderSpace) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- SetSoPassCred(sockets->second_fd());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_LEN(0) / 2];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[20];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
- EXPECT_EQ(msg.msg_controllen, 0);
-}
-
TEST_P(UnixSocketPairTest, InvalidGetSockOpt) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
int opt;
@@ -1519,6 +98,14 @@ TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) {
TEST_P(UnixSocketPairTest, TIOCINQSucceeds) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ if (IsRunningOnGvisor()) {
+ // TODO(gvisor.dev/issue/273): Inherited host UDS don't support TIOCINQ.
+ // Skip the test.
+ int size = -1;
+ int ret = ioctl(sockets->first_fd(), TIOCINQ, &size);
+ SKIP_IF(ret == -1 && errno == ENOTTY);
+ }
+
int size = -1;
EXPECT_THAT(ioctl(sockets->first_fd(), TIOCINQ, &size), SyscallSucceeds());
EXPECT_EQ(size, 0);
@@ -1544,6 +131,14 @@ TEST_P(UnixSocketPairTest, TIOCINQSucceeds) {
TEST_P(UnixSocketPairTest, TIOCOUTQSucceeds) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ if (IsRunningOnGvisor()) {
+ // TODO(gvisor.dev/issue/273): Inherited host UDS don't support TIOCOUTQ.
+ // Skip the test.
+ int size = -1;
+ int ret = ioctl(sockets->second_fd(), TIOCOUTQ, &size);
+ SKIP_IF(ret == -1 && errno == ENOTTY);
+ }
+
int size = -1;
EXPECT_THAT(ioctl(sockets->second_fd(), TIOCOUTQ, &size), SyscallSucceeds());
EXPECT_EQ(size, 0);
@@ -1580,19 +175,70 @@ TEST_P(UnixSocketPairTest, NetdeviceIoctlsSucceed) {
}
}
-TEST_P(UnixSocketPairTest, SocketShutdown) {
+TEST_P(UnixSocketPairTest, Shutdown) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
- char buf[20];
+
const std::string data = "abc";
- ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), 3),
- SyscallSucceedsWithValue(3));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RDWR), SyscallSucceeds());
ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RDWR), SyscallSucceeds());
// Shutting down a socket does not clear the buffer.
- ASSERT_THAT(ReadFd(sockets->second_fd(), buf, 3),
- SyscallSucceedsWithValue(3));
- EXPECT_EQ(data, absl::string_view(buf, 3));
+ char buf[3];
+ ASSERT_THAT(ReadFd(sockets->second_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
+}
+
+TEST_P(UnixSocketPairTest, ShutdownRead) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RD), SyscallSucceeds());
+
+  // When the socket is shut down for read, read behavior varies between
+ // different socket types. This is covered by the various ReadOneSideClosed
+ // test cases.
+
+ // ... and the peer cannot write.
+ const std::string data = "abc";
+ EXPECT_THAT(WriteFd(sockets->second_fd(), data.c_str(), data.size()),
+ SyscallFailsWithErrno(EPIPE));
+
+ // ... but the socket can still write.
+ ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
+ // ... and the peer can still read.
+ char buf[3];
+ EXPECT_THAT(ReadFd(sockets->second_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
+}
+
+TEST_P(UnixSocketPairTest, ShutdownWrite) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_WR), SyscallSucceeds());
+
+  // When the socket is shut down for write, it cannot write.
+ const std::string data = "abc";
+ EXPECT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallFailsWithErrno(EPIPE));
+
+ // ... and the peer read behavior varies between different socket types. This
+ // is covered by the various ReadOneSideClosed test cases.
+
+ // ... but the peer can still write.
+ char buf[3];
+ ASSERT_THAT(WriteFd(sockets->second_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
+ // ... and the socket can still read.
+ EXPECT_THAT(ReadFd(sockets->first_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
}
TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) {
diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
index 9de0f6dfe..be31ab2a7 100644
--- a/test/syscalls/linux/socket_unix_abstract_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingAbstractUnixSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc
index 320915b0f..1994139e6 100644
--- a/test/syscalls/linux/socket_unix_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_blocking_local.cc
@@ -37,7 +37,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingSocketPairTest,
+ NonBlockingUnixDomainSockets, BlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc
new file mode 100644
index 000000000..b0ab26847
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_cmsg.cc
@@ -0,0 +1,1473 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_unix_cmsg.h"
+
+#include <errno.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+// This file contains tests for control messages in Unix domain sockets.
+//
+// This file is a generic socket test file. It must be built with another file
+// that provides the test types.
+
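+// A separate driver file supplies the socket pair kinds and instantiates the
+// suite. A minimal sketch of such a driver, mirroring the pattern used by
+// socket_unix_abstract_nonblock.cc above (the file name below is hypothetical;
+// UnixDomainSocketPair and IncludeReversals come from the test utilities):
+//
+//   // socket_unix_cmsg_seqpacket.cc (illustrative only)
+//   std::vector<SocketPairKind> GetSocketPairs() {
+//     return {UnixDomainSocketPair(SOCK_SEQPACKET)};
+//   }
+//
+//   INSTANTIATE_TEST_SUITE_P(
+//       AllUnixDomainSockets, UnixSocketPairCmsgTest,
+//       ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+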
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(UnixSocketPairCmsgTest, BasicFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BasicTwoFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int received_fds[] = {-1, -1};
+
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 2,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BasicThreeFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair3 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int received_fds[] = {-1, -1, -1};
+
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 3,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[2], pair3->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BadFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ int sent_fd = -1;
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(sent_fd))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_len = CMSG_LEN(sizeof(sent_fd));
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
+
+ struct iovec iov;
+ iov.iov_base = sent_data;
+ iov.iov_len = sizeof(sent_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EBADF));
+}
+
+// BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass.
+// The difference is that when calling recvmsg, no space for FDs is provided,
+// only space for the cmsg header.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNoSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+// BasicFDPassNoSpaceMsgCtrunc sends an FD, but does not provide any space to
+// receive it. It then verifies that the MSG_CTRUNC flag is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNoSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicFDPassNullControlMsgCtrunc sends an FD and sets contradictory values for
+// msg_controllen and msg_control. msg_controllen is set to the correct size to
+// accommodate the FD, but msg_control is set to NULL. In this case, msg_control
+// should override msg_controllen.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNullControlMsgCtrunc) {
+ // FIXME(gvisor.dev/issue/207): Fix handling of NULL msg_control.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ msg.msg_controllen = CMSG_SPACE(1);
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicFDPassNotEnoughSpaceMsgCtrunc sends an FD, but does not provide enough
+// space to receive it. It then verifies that the MSG_CTRUNC flag is set in the
+// msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNotEnoughSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0) + 1);
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicThreeFDPassTruncationMsgCtrunc sends three FDs, but only provides enough
+// space to receive two of them. It then verifies that the MSG_CTRUNC flag is
+// set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicThreeFDPassTruncationMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair3 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(2 * sizeof(int)));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(2 * sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+// BasicFDPassUnalignedRecv starts off by sending a single FD just like
+// BasicFDPass. The difference is that when calling recvmsg, the length of the
+// receive data is only aligned on a 4-byte boundary instead of the normal 8.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassUnalignedRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFDUnaligned(
+ sockets->second_fd(), &fd, received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+// BasicFDPassUnalignedRecvNoMsgTrunc sends one FD and only provides enough
+// space to receive just it. (Normally the minimum amount of space one would
+// provide would be enough space for two FDs.) It then verifies that the
+// MSG_CTRUNC flag is not set in the msghdr.
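+//
+// The "unaligned" size comes from CMSG_SPACE rounding. On a typical LP64 Linux
+// target (an assumption; the exact numbers are ABI-dependent):
+//
+//   sizeof(struct cmsghdr)                 == 16
+//   CMSG_SPACE(sizeof(int))                == 16 + CMSG_ALIGN(4) == 24  // fits two ints
+//   CMSG_SPACE(sizeof(int)) - sizeof(int)  == 20                        // header + one int
+//
+// so the control buffer declared below holds the passed FD and nothing more.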
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassUnalignedRecvNoMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, 0);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+// BasicTwoFDPassUnalignedRecvTruncationMsgTrunc sends two FDs, but only
+// provides enough space to receive one of them. It then verifies that the
+// MSG_CTRUNC flag is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicTwoFDPassUnalignedRecvTruncationMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair->first_fd(), pair->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ // CMSG_SPACE rounds up to two FDs, we only want one.
+ char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+TEST_P(UnixSocketPairCmsgTest, ConcurrentBasicFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ int sockfd1 = sockets->first_fd();
+ auto recv_func = [sockfd1, sent_data]() {
+ char received_data[20];
+ int fd = -1;
+ RecvSingleFD(sockfd1, &fd, received_data, sizeof(received_data));
+ ASSERT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ char buf[20];
+ ASSERT_THAT(ReadFd(fd, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(WriteFd(fd, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ };
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->second_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ ScopedThread t(recv_func);
+
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(WriteFd(pair->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_THAT(ReadFd(pair->first_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ t.Join();
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+// FDPassNoRecv checks that the control message can be safely ignored by using
+// read(2) instead of recvmsg(2).
+TEST_P(UnixSocketPairCmsgTest, FDPassNoRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ // Read while ignoring the passed FD.
+ char received_data[20];
+ ASSERT_THAT(
+ ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // Check that the socket still works for reads and writes.
+ ASSERT_NO_FATAL_FAILURE(
+ TransferTest(sockets->first_fd(), sockets->second_fd()));
+}
+
+// FDPassInterspersed1 checks that sent control messages cannot be read before
+// their associated data has been read.
+TEST_P(UnixSocketPairCmsgTest, FDPassInterspersed1) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char written_data[20];
+ RandomizeBuffer(written_data, sizeof(written_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
+ SyscallSucceedsWithValue(sizeof(written_data)));
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ // Check that we don't get a control message, but do get the data.
+ char received_data[20];
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data));
+ EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
+}
+
+// FDPassInterspersed2 checks that sent control messages cannot be read after
+// their associated data has been read while ignoring the control message by
+// using read(2) instead of recvmsg(2).
+TEST_P(UnixSocketPairCmsgTest, FDPassInterspersed2) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char written_data[20];
+ RandomizeBuffer(written_data, sizeof(written_data));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
+ SyscallSucceedsWithValue(sizeof(written_data)));
+
+ char received_data[20];
+ ASSERT_THAT(
+ ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassNotCoalesced) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(),
+ sent_data1, sizeof(sent_data1)));
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(),
+ sent_data2, sizeof(sent_data2)));
+
+ char received_data1[sizeof(sent_data1) + sizeof(sent_data2)];
+ int received_fd1 = -1;
+
+ RecvSingleFD(sockets->second_fd(), &received_fd1, received_data1,
+ sizeof(received_data1), sizeof(sent_data1));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1)));
+ TransferTest(pair1->first_fd(), pair1->second_fd());
+
+ char received_data2[sizeof(sent_data1) + sizeof(sent_data2)];
+ int received_fd2 = -1;
+
+ RecvSingleFD(sockets->second_fd(), &received_fd2, received_data2,
+ sizeof(received_data2), sizeof(sent_data2));
+
+ EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2)));
+ TransferTest(pair2->first_fd(), pair2->second_fd());
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassPeek) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char peek_data[20];
+ int peek_fd = -1;
+ PeekSingleFD(sockets->second_fd(), &peek_fd, peek_data, sizeof(peek_data));
+ EXPECT_EQ(0, memcmp(sent_data, peek_data, sizeof(sent_data)));
+ TransferTest(peek_fd, pair->first_fd());
+ EXPECT_THAT(close(peek_fd), SyscallSucceeds());
+
+ char received_data[20];
+ int received_fd = -1;
+ RecvSingleFD(sockets->second_fd(), &received_fd, received_data,
+ sizeof(received_data));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ TransferTest(received_fd, pair->first_fd());
+ EXPECT_THAT(close(received_fd), SyscallSucceeds());
+}
+
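+// The next group of tests exercises SCM_CREDENTIALS. On Linux the receiver
+// only sees credentials once SO_PASSCRED is enabled, and an unprivileged
+// sender may only claim its own pid/uid/gid. The SendCreds/RecvCreds helpers
+// come from unix_domain_socket_test_util.h; a sender-side sketch of the cmsg
+// they presumably build (illustrative only, not the actual helper):
+//
+//   struct msghdr msg = {};
+//   struct iovec iov = {data, data_size};
+//   msg.msg_iov = &iov;
+//   msg.msg_iovlen = 1;
+//
+//   char control[CMSG_SPACE(sizeof(struct ucred))];
+//   msg.msg_control = control;
+//   msg.msg_controllen = sizeof(control);
+//
+//   struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+//   cmsg->cmsg_level = SOL_SOCKET;
+//   cmsg->cmsg_type = SCM_CREDENTIALS;
+//   cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+//   memcpy(CMSG_DATA(cmsg), &creds, sizeof(creds));
+//
+//   RetryEINTR(sendmsg)(sock, &msg, 0);
+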
+TEST_P(UnixSocketPairCmsgTest, BasicCredPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ EXPECT_EQ(sent_creds.pid, received_creds.pid);
+ EXPECT_EQ(sent_creds.uid, received_creds.uid);
+ EXPECT_EQ(sent_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsBeforeSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
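+  // No credentials were attached before SO_PASSCRED was enabled, so Linux
+  // reports pid 0 and the overflow uid/gid (65534 by default, i.e. nobody).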
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsAfterSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsBeforeSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->first_fd());
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsAfterSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest,
+ SendNullCredsBeforeSoPassCredRecvEndAfterSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteAfterSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->first_fd());
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteAfterSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->first_fd());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredRecvEndAfterSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, CredPassTruncated) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
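+  // Provide room for the cmsg header plus only the pid field, so the uid and
+  // gid of the passed credentials are truncated away.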
+ char control[CMSG_SPACE(0) + sizeof(pid_t)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+
+ pid_t pid = 0;
+ memcpy(&pid, CMSG_DATA(cmsg), sizeof(pid));
+ EXPECT_EQ(pid, sent_creds.pid);
+}
+
+// CredPassNoMsgCtrunc passes a full set of credentials. It then verifies that
+// receiving the full set does not result in MSG_CTRUNC being set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassNoMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(struct ucred))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should not be truncated.
+ EXPECT_EQ(msg.msg_flags, 0);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct ucred)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// CredPassNoSpaceMsgCtrunc passes a full set of credentials. It then receives
+// the data without providing space for any credentials and verifies that
+// MSG_CTRUNC is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassNoSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should be truncated.
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// CredPassTruncatedMsgCtrunc passes a full set of credentials. It then receives
+// the data while providing enough space for only the first field of the
+// credentials and verifies that MSG_CTRUNC is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassTruncatedMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(0) + sizeof(pid_t)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should be truncated.
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int opt;
+ socklen_t optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ SetSoPassCred(sockets->first_fd());
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_TRUE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ int zero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &zero,
+ sizeof(zero)),
+ SyscallSucceeds());
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+}
+
+TEST_P(UnixSocketPairCmsgTest, NoDataCredPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct msghdr msg = {};
+
+ struct iovec iov;
+ iov.iov_base = sent_data;
+ iov.iov_len = sizeof(sent_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ char control[CMSG_SPACE(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_CREDENTIALS;
+ cmsg->cmsg_len = CMSG_LEN(0);
+
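+  // An SCM_CREDENTIALS control message must carry a full struct ucred; a
+  // zero-length payload is rejected with EINVAL.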
+ ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UnixSocketPairCmsgTest, NoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CredAndFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendCredsAndFD(sockets->first_fd(), sent_creds,
+ pair->second_fd(), sent_data,
+ sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(sent_creds.pid, received_creds.pid);
+ EXPECT_EQ(sent_creds.uid, received_creds.uid);
+ EXPECT_EQ(sent_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassBeforeSoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CloexecDroppedWhenFDPassed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair = ASSERT_NO_ERRNO_AND_VALUE(
+ UnixDomainSocketPair(SOCK_SEQPACKET | SOCK_CLOEXEC).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data)));
+
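+  // FD_CLOEXEC is a property of the descriptor, not the file, so it is not
+  // carried across SCM_RIGHTS; the received descriptor has the flag clear.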
+ EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CloexecRecvFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ char received_data[20];
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
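+  // MSG_CMSG_CLOEXEC asks recvmsg to set FD_CLOEXEC on any descriptors it
+  // installs from SCM_RIGHTS control messages.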
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CMSG_CLOEXEC),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ int fd = -1;
+ memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
+
+ EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCredWithoutCredSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
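+  // Provide room for only a bare cmsg header. With SO_PASSCRED enabled, the
+  // SCM_CREDENTIALS message still arrives, truncated to an empty payload.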
+ char control[CMSG_LEN(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[20];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// This test validates that passing MSG_CTRUNC as an input flag to recvmsg
+// does not cause it to appear as an output flag in msg_flags when no
+// truncation occurs.
+TEST_P(UnixSocketPairCmsgTest, MsgCtruncInputIsNoop) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) /* we're passing a single fd */];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ char received_data[20];
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CTRUNC),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+  // Verify that MSG_CTRUNC was not set as an output flag.
+ EXPECT_EQ(msg.msg_flags & MSG_CTRUNC, 0);
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCredWithoutCredHeaderSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
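+  // The control buffer cannot hold even a cmsg header, so no control message
+  // is returned and msg_controllen comes back as zero.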
+ char control[CMSG_LEN(0) / 2];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[20];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ EXPECT_EQ(msg.msg_controllen, 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_abstract.cc b/test/syscalls/linux/socket_unix_cmsg.h
index 8241bf997..431606903 100644
--- a/test/syscalls/linux/socket_unix_abstract.cc
+++ b/test/syscalls/linux/socket_unix_cmsg.h
@@ -12,26 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <vector>
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/socket_unix.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
namespace gvisor {
namespace testing {
-std::vector<SocketPairKind> GetSocketPairs() {
- return ApplyVec<SocketPairKind>(
- AbstractBoundUnixDomainSocketPair,
- AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
- List<int>{0, SOCK_NONBLOCK}));
-}
-
-INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixSocketPairTest,
- ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+// Test fixture for control-message tests that apply to pairs of connected
+// unix sockets.
+using UnixSocketPairCmsgTest = SocketPairTest;
} // namespace testing
} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc
index 4ba2c80ae..8c5a473bd 100644
--- a/test/syscalls/linux/socket_unix_dgram_local.cc
+++ b/test/syscalls/linux/socket_unix_dgram_local.cc
@@ -41,15 +41,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, DgramUnixSocketPairTest,
+ DgramUnixSockets, DgramUnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixNonStreamSocketPairTest,
+ DgramUnixSockets, UnixNonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonStreamSocketPairTest,
+ DgramUnixSockets, NonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
index 9fe86cee8..707052af8 100644
--- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
+++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
@@ -44,7 +44,7 @@ TEST_P(NonBlockingDgramUnixSocketPairTest, ReadOneSideClosed) {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingDgramUnixSocketPairTest,
+ NonBlockingDgramUnixSockets, NonBlockingDgramUnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(std::vector<SocketPairKind>{
UnixDomainSocketPair(SOCK_DGRAM | SOCK_NONBLOCK),
FilesystemBoundUnixDomainSocketPair(SOCK_DGRAM | SOCK_NONBLOCK),
diff --git a/test/syscalls/linux/socket_unix_filesystem.cc b/test/syscalls/linux/socket_unix_filesystem.cc
deleted file mode 100644
index 5dbe67773..000000000
--- a/test/syscalls/linux/socket_unix_filesystem.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/socket_unix.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
-
-namespace gvisor {
-namespace testing {
-
-std::vector<SocketPairKind> GetSocketPairs() {
- return ApplyVec<SocketPairKind>(
- FilesystemBoundUnixDomainSocketPair,
- AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
- List<int>{0, SOCK_NONBLOCK}));
-}
-
-INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixSocketPairTest,
- ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
-
-} // namespace testing
-} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
index 137db53c4..8ba7af971 100644
--- a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingFilesystemUnixSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
index 98cf1fe8a..da762cd83 100644
--- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
@@ -34,7 +34,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingNonStreamSocketPairTest,
+ BlockingNonStreamUnixSockets, BlockingNonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc
index bacfc11e4..411fb4518 100644
--- a/test/syscalls/linux/socket_unix_pair.cc
+++ b/test/syscalls/linux/socket_unix_pair.cc
@@ -16,6 +16,7 @@
#include "test/syscalls/linux/socket_test_util.h"
#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
#include "test/syscalls/linux/unix_domain_socket_test_util.h"
#include "test/util/test_util.h"
@@ -33,5 +34,9 @@ INSTANTIATE_TEST_SUITE_P(
AllUnixDomainSockets, UnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnixSocketPairCmsgTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc
index 583506f08..3135d325f 100644
--- a/test/syscalls/linux/socket_unix_pair_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingUnixSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc
index b903a9e8f..dff75a532 100644
--- a/test/syscalls/linux/socket_unix_seqpacket_local.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc
@@ -41,15 +41,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonStreamSocketPairTest,
+ SeqpacketUnixSockets, NonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, SeqpacketUnixSocketPairTest,
+ SeqpacketUnixSockets, SeqpacketUnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixNonStreamSocketPairTest,
+ SeqpacketUnixSockets, UnixNonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
index ce0f1e50d..fa0a9d367 100644
--- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
@@ -32,7 +32,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingStreamSocketPairTest,
+ BlockingStreamUnixSockets, BlockingStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc
index 6b840189c..65eef1a81 100644
--- a/test/syscalls/linux/socket_unix_stream_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_local.cc
@@ -39,7 +39,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, StreamSocketPairTest,
+ StreamUnixSockets, StreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
index ebec4e0ec..ec777c59f 100644
--- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
@@ -31,7 +31,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingStreamSocketPairTest,
+ NonBlockingStreamUnixSockets, NonBlockingStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc
index 2ddc5c11f..52aef891f 100644
--- a/test/syscalls/linux/socket_unix_unbound_dgram.cc
+++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc
@@ -13,7 +13,9 @@
// limitations under the License.
#include <stdio.h>
+#include <sys/socket.h>
#include <sys/un.h>
+
#include "gtest/gtest.h"
#include "gtest/gtest.h"
#include "test/syscalls/linux/socket_test_util.h"
@@ -142,6 +144,28 @@ TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnect) {
SyscallSucceedsWithValue(sizeof(data)));
}
+TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnectPassCreds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ SetSoPassCred(sockets->first_fd());
+ char data = 'a';
+ ASSERT_THAT(
+ RetryEINTR(sendto)(sockets->second_fd(), &data, sizeof(data), 0,
+ sockets->first_addr(), sockets->first_addr_size()),
+ SyscallSucceedsWithValue(sizeof(data)));
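+  // The receiving socket has SO_PASSCRED enabled, so the sender's credentials
+  // should arrive in an SCM_CREDENTIALS control message.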
+ ucred creds;
+ creds.pid = -1;
+ char buf[sizeof(data) + 1];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvCreds(sockets->first_fd(), &creds, buf, sizeof(buf), sizeof(data)));
+ EXPECT_EQ(0, memcmp(&data, buf, sizeof(data)));
+ EXPECT_THAT(getpid(), SyscallSucceedsWithValue(creds.pid));
+}
+
INSTANTIATE_TEST_SUITE_P(
AllUnixDomainSockets, UnboundDgramUnixSocketPairTest,
::testing::ValuesIn(VecCat<SocketPairKind>(
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index e3f9f9f9d..e95b644ac 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -751,6 +751,133 @@ TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) {
EXPECT_THAT(close(s.release()), SyscallSucceeds());
}
+// Test that setting a supported congestion control algorithm succeeds for an
+// unconnected TCP socket.
+TEST_P(SimpleTcpSocketTest, SetCongestionControlSucceedsForSupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ const char kSetCC[kTcpCaNameMax] = "reno";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+    // We ignore optlen here because the Linux kernel sets it to the smaller of
+    // the passed-in buffer size and kTcpCaNameMax, not to the length of the
+    // congestion control algorithm's actual name.
+    EXPECT_EQ(0, memcmp(got_cc, kSetCC, kTcpCaNameMax));
+ }
+ {
+ const char kSetCC[kTcpCaNameMax] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+    // We ignore optlen here because the Linux kernel sets it to the smaller of
+    // the passed-in buffer size and kTcpCaNameMax, not to the length of the
+    // congestion control algorithm's actual name.
+    EXPECT_EQ(0, memcmp(got_cc, kSetCC, kTcpCaNameMax));
+ }
+}
+
+// This test verifies that getsockopt(...TCP_CONGESTION) behaviour is
+// consistent between Linux and gVisor when the passed-in buffer is smaller
+// than kTcpCaNameMax.
+TEST_P(SimpleTcpSocketTest, SetGetTCPCongestionShortReadBuffer) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ // Verify that getsockopt/setsockopt work with buffers smaller than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[sizeof(kSetCC)];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(sizeof(got_cc), optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(got_cc)));
+ }
+}
+
+// This test verifies that getsockopt(...TCP_CONGESTION) behaviour is
+// consistent between Linux and gVisor when the passed-in buffer is larger
+// than kTcpCaNameMax.
+TEST_P(SimpleTcpSocketTest, SetGetTCPCongestionLargeReadBuffer) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ // Verify that getsockopt works with buffers larger than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax + 5];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+    // Linux copies at most the smaller of kTcpCaNameMax and the length of the
+    // passed-in buffer, and sets optlen to the number of bytes actually copied,
+    // irrespective of the actual length of the congestion control name.
+ EXPECT_EQ(kTcpCaNameMax, optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+// Test that setting an unsupported congestion control algorithm fails for an
+// unconnected TCP socket.
+TEST_P(SimpleTcpSocketTest, SetCongestionControlFailsForUnsupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
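+  // Capture the current congestion control algorithm so we can verify that it
+  // is left unchanged after the failed setsockopt.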
+ char old_cc[kTcpCaNameMax];
+ socklen_t optlen = sizeof(old_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &old_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+
+ const char kSetCC[] = "invalid_ca_kSetCC";
+ ASSERT_THAT(
+ setsockopt(s.get(), SOL_TCP, TCP_CONGESTION, &kSetCC, strlen(kSetCC)),
+ SyscallFailsWithErrno(ENOENT));
+
+ char got_cc[kTcpCaNameMax];
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+  // We ignore optlen here because the Linux kernel sets it to the smaller of
+  // the passed-in buffer size and kTcpCaNameMax, not to the length of the
+  // congestion control algorithm's actual name.
+  EXPECT_EQ(0, memcmp(got_cc, old_cc, kTcpCaNameMax));
+}
+
INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
::testing::Values(AF_INET, AF_INET6));
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index 9a8e0600b..476248184 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -47,6 +47,7 @@ var (
platform = flag.String("platform", "ptrace", "platform to run on")
useTmpfs = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
+ overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
parallel = flag.Bool("parallel", false, "run tests in parallel")
runscPath = flag.String("runsc", "", "path to runsc binary")
)
@@ -184,10 +185,13 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
"-platform", *platform,
"-root", rootDir,
"-file-access", *fileAccess,
- "--network=none",
+ "-network=none",
"-log-format=text",
"-TESTONLY-unsafe-nonroot=true",
- "--net-raw=true",
+ "-net-raw=true",
+ }
+ if *overlay {
+ args = append(args, "-overlay")
}
if *debug {
args = append(args, "-debug", "-log-packets=true")
@@ -196,7 +200,11 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
args = append(args, "-strace")
}
if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
- debugLogDir, err := ioutil.TempDir(outDir, "runsc")
+ tdir := filepath.Join(outDir, strings.Replace(tc.FullName(), "/", "_", -1))
+ if err := os.MkdirAll(tdir, 0755); err != nil {
+ t.Fatalf("could not create test dir: %v", err)
+ }
+ debugLogDir, err := ioutil.TempDir(tdir, "runsc")
if err != nil {
t.Fatalf("could not create temp dir: %v", err)
}
diff --git a/tools/go_branch.sh b/tools/go_branch.sh
new file mode 100755
index 000000000..8ea6a6d8d
--- /dev/null
+++ b/tools/go_branch.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eo pipefail
+
+# Discover the package name from the go.mod file.
+declare -r gomod="$(pwd)/go.mod"
+declare -r module=$(cat "${gomod}" | grep -E "^module" | cut -d' ' -f2)
+
+# Check that gopath has been built.
+declare -r gopath_dir="$(pwd)/bazel-bin/gopath/src/${module}"
+if ! [ -d "${gopath_dir}" ]; then
+ echo "No gopath directory found; build the :gopath target." >&2
+ exit 1
+fi
+
+# Create a temporary working directory, and ensure that this directory and all
+# subdirectories are cleaned up upon exit.
+declare -r tmp_dir=$(mktemp -d)
+finish() {
+ cd # Leave tmp_dir.
+ rm -rf "${tmp_dir}"
+}
+trap finish EXIT
+
+# Record the current working commit.
+declare -r head=$(git describe --always)
+
+# We expect to have an existing go branch that we will use as the basis for
+# this commit. That branch may be empty, but it must exist.
+declare -r go_branch=$(git show-ref --hash origin/go)
+
+# Clone the current repository into the temporary directory and check out the
+# existing go branch there. We move to the new repository for convenience.
+declare -r repo_orig="$(pwd)"
+declare -r repo_new="${tmp_dir}/repository"
+git clone . "${repo_new}"
+cd "${repo_new}"
+
+# Setup the repository and checkout the branch.
+git config user.email "gvisor-bot@google.com"
+git config user.name "gVisor bot"
+git fetch origin "${go_branch}"
+git checkout -b go "${go_branch}"
+
+# Start working on a merge commit that combines the previous history with the
+# current history. Note that we don't actually want any changes yet.
+git merge --allow-unrelated-histories --no-commit --strategy ours ${head}
+
+# Sync the entire gopath_dir and go.mod.
+rsync --recursive --verbose --delete --exclude .git --exclude README.md -L "${gopath_dir}/" .
+cp "${gomod}" .
+
+# There are a few solitary files that can get left behind due to the way bazel
+# constructs the gopath target. Note that we don't find all Go files here
+# because they may correspond to unused templates, etc.
+cp "${repo_orig}"/runsc/*.go runsc/
+
+# Update the current working set and commit.
+git add . && git commit -m "Merge ${head} (automated)"
+
+# Push the branch back to the original repository.
+git remote add orig "${repo_orig}" && git push -f orig go:go
diff --git a/tools/run_build.sh b/tools/run_build.sh
new file mode 100755
index 000000000..d49a1d4be
--- /dev/null
+++ b/tools/run_build.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fail on any error.
+set -e
+# Display commands to stderr.
+set -x
+
+# Install the latest version of Bazel and log the version.
+(which use_bazel.sh && use_bazel.sh latest) || which bazel
+bazel version
+
+# Switch into the workspace.
+if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
+ cd git/repo
+elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
+ cd github/repo
+fi
+
+# Build runsc.
+bazel build //runsc
+
+# Copy the runsc binary into the "latest" directory and into a directory named
+# with the current date.
+if [[ -v KOKORO_ARTIFACTS_DIR ]]; then
+ latest_dir="${KOKORO_ARTIFACTS_DIR}"/latest
+ today_dir="${KOKORO_ARTIFACTS_DIR}"/"$(date -Idate)"
+ mkdir -p "${latest_dir}" "${today_dir}"
+ cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc "${latest_dir}"
+ sha512sum "${latest_dir}"/runsc | awk '{print $1 " runsc"}' > "${latest_dir}"/runsc.sha512
+ cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc "${today_dir}"
+ sha512sum "${today_dir}"/runsc | awk '{print $1 " runsc"}' > "${today_dir}"/runsc.sha512
+fi
diff --git a/tools/run_tests.sh b/tools/run_tests.sh
new file mode 100755
index 000000000..7a1f889dd
--- /dev/null
+++ b/tools/run_tests.sh
@@ -0,0 +1,288 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fail on any error. Treat unset variables as error. Print commands as executed.
+set -eux
+
+###################
+# GLOBAL ENV VARS #
+###################
+
+if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
+ readonly WORKSPACE_DIR="${PWD}/git/repo"
+elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
+ readonly WORKSPACE_DIR="${PWD}/github/repo"
+else
+ readonly WORKSPACE_DIR="${PWD}"
+fi
+
+# Used to configure RBE.
+readonly CLOUD_PROJECT_ID="gvisor-rbe"
+readonly RBE_PROJECT_ID="projects/${CLOUD_PROJECT_ID}/instances/default_instance"
+
+# Random runtime name to avoid collisions.
+readonly RUNTIME="runsc_test_$((RANDOM))"
+
+# Packages that will be built and tested.
+readonly BUILD_PACKAGES=("//...")
+readonly TEST_PACKAGES=("//pkg/..." "//runsc/..." "//tools/...")
+
+#######################
+# BAZEL CONFIGURATION #
+#######################
+
+# Install the latest version of Bazel and log the version.
+(which use_bazel.sh && use_bazel.sh latest) || which bazel
+bazel version
+
+# Load the kvm module.
+sudo -n -E modprobe kvm
+
+# General Bazel build/test flags.
+BAZEL_BUILD_FLAGS=(
+ "--show_timestamps"
+ "--test_output=errors"
+ "--keep_going"
+ "--verbose_failures=true"
+)
+
+# Bazel build/test flags for RBE, a superset of BAZEL_BUILD_FLAGS.
+BAZEL_BUILD_RBE_FLAGS=(
+ "${BAZEL_BUILD_FLAGS[@]}"
+ "--config=remote"
+ "--project_id=${CLOUD_PROJECT_ID}"
+ "--remote_instance_name=${RBE_PROJECT_ID}"
+)
+if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
+ BAZEL_BUILD_RBE_FLAGS=(
+ "${BAZEL_BUILD_RBE_FLAGS[@]}"
+ "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
+ )
+fi
+
+####################
+# Helper Functions #
+####################
+
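+# sanity_checks verifies that WORKSPACE is in sync with go.mod: running
+# gazelle update-repos must not produce any diff.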
+sanity_checks() {
+ cd ${WORKSPACE_DIR}
+ bazel run //:gazelle -- update-repos -from_file=go.mod
+ git diff --exit-code WORKSPACE
+}
+
+build_everything() {
+ FLAVOR="${1}"
+
+ cd ${WORKSPACE_DIR}
+ bazel build \
+ -c "${FLAVOR}" "${BAZEL_BUILD_RBE_FLAGS[@]}" \
+ "${BUILD_PACKAGES[@]}"
+}
+
+# run_simple_tests runs the tests that require no special setup or
+# configuration.
+run_simple_tests() {
+ cd ${WORKSPACE_DIR}
+ bazel test \
+ "${BAZEL_BUILD_FLAGS[@]}" \
+ "${TEST_PACKAGES[@]}"
+}
+
+install_runtime() {
+ cd ${WORKSPACE_DIR}
+ sudo -n ${WORKSPACE_DIR}/runsc/test/install.sh --runtime ${RUNTIME}
+}
+
+install_helper() {
+ PACKAGE="${1}"
+ TAG="${2}"
+ GOPATH="${3}"
+
+ # Clone the repository.
+ mkdir -p "${GOPATH}"/src/$(dirname "${PACKAGE}") && \
+ git clone https://"${PACKAGE}" "${GOPATH}"/src/"${PACKAGE}"
+
+ # Checkout and build the repository.
+ (cd "${GOPATH}"/src/"${PACKAGE}" && \
+ git checkout "${TAG}" && \
+ GOPATH="${GOPATH}" make && \
+ sudo -n -E env GOPATH="${GOPATH}" make install)
+}
+
+# Install dependencies for the crictl tests.
+install_crictl_test_deps() {
+ sudo -n -E apt-get update
+ sudo -n -E apt-get install -y btrfs-tools libseccomp-dev
+
+ # Install containerd & cri-tools.
+ GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
+ install_helper github.com/containerd/containerd v1.2.2 "${GOPATH}"
+ install_helper github.com/kubernetes-sigs/cri-tools v1.11.0 "${GOPATH}"
+
+ # Install gvisor-containerd-shim.
+ local latest=/tmp/gvisor-containerd-shim-latest
+ local shim_path=/tmp/gvisor-containerd-shim
+ wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/latest -O ${latest}
+ wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
+ chmod +x ${shim_path}
+ sudo -n -E mv ${shim_path} /usr/local/bin
+
+ # Configure containerd-shim.
+ local shim_config_path=/etc/containerd
+ local shim_config_tmp_path=/tmp/gvisor-containerd-shim.toml
+ sudo -n -E mkdir -p ${shim_config_path}
+ cat > ${shim_config_tmp_path} <<-EOF
+ runc_shim = "/usr/local/bin/containerd-shim"
+
+ [runsc_config]
+ debug = "true"
+ debug-log = "/tmp/runsc-logs/"
+ strace = "true"
+ file-access = "shared"
+EOF
+ sudo mv ${shim_config_tmp_path} ${shim_config_path}
+
+ # Configure CNI.
+ (cd "${GOPATH}" && sudo -n -E env PATH="${PATH}" GOPATH="${GOPATH}" \
+ src/github.com/containerd/containerd/script/setup/install-cni)
+}
+
+# Run the tests that require docker.
+run_docker_tests() {
+ cd ${WORKSPACE_DIR}
+
+ # Run tests with a default runtime (runc).
+ bazel test \
+ "${BAZEL_BUILD_FLAGS[@]}" \
+ --test_env=RUNSC_RUNTIME="" \
+ --test_output=all \
+ //runsc/test/image:image_test
+
+  # These names are used to exclude tests not supported in certain
+  # configurations, e.g. save/restore is not supported with hostnet.
+ declare -a variations=("" "-kvm" "-hostnet" "-overlay")
+ for v in "${variations[@]}"; do
+ # Change test names otherwise each run of tests will overwrite logs and
+ # results of the previous run.
+ sed -i "s/name = \"integration_test.*\"/name = \"integration_test${v}\"/" runsc/test/integration/BUILD
+ sed -i "s/name = \"image_test.*\"/name = \"image_test${v}\"/" runsc/test/image/BUILD
+ # Run runsc tests with docker that are tagged manual.
+ bazel test \
+ "${BAZEL_BUILD_FLAGS[@]}" \
+ --test_env=RUNSC_RUNTIME="${RUNTIME}${v}" \
+ --test_output=all \
+ //runsc/test/image:image_test${v} \
+ //runsc/test/integration:integration_test${v}
+ done
+}
+
+# Run the tests that require root.
+run_root_tests() {
+ cd ${WORKSPACE_DIR}
+ bazel build //runsc/test/root:root_test
+ local root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__)
+ if [[ ! -f "${root_test}" ]]; then
+ echo "root_test executable not found"
+ exit 1
+ fi
+ sudo -n -E RUNSC_RUNTIME="${RUNTIME}" RUNSC_EXEC=/tmp/"${RUNTIME}"/runsc ${root_test}
+}
+
+# Run syscall unit tests.
+run_syscall_tests() {
+ cd ${WORKSPACE_DIR}
+ bazel test "${BAZEL_BUILD_RBE_FLAGS[@]}" \
+ --test_tag_filters=runsc_ptrace //test/syscalls/...
+}
+
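+# Exercise "runsc do", both rootless and with root privileges.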
+run_runsc_do_tests() {
+ local runsc=$(find bazel-bin/runsc -type f -executable -name "runsc" | head -n1)
+
+ # run runsc do without root privileges.
+ ${runsc} --rootless do true
+ ${runsc} --rootless --network=none do true
+
+ # run runsc do with root privileges.
+ sudo -n -E ${runsc} do true
+}
+
+# Find and rename all test xml and log files so that Sponge can pick them up.
+# XML files must be named sponge_log.xml, and log files must be named
+# sponge_log.log. We move all such files into KOKORO_ARTIFACTS_DIR, in a
+# subdirectory named with the test name.
+upload_test_artifacts() {
+ # Skip if no kokoro directory.
+ [[ -v KOKORO_ARTIFACTS_DIR ]] || return
+
+ cd ${WORKSPACE_DIR}
+ find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
+ tar --create --files-from - --transform 's/test\./sponge_log./' |
+ tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
+ if [[ -d "/tmp/${RUNTIME}/logs" ]]; then
+ tar --create --gzip "--file=${KOKORO_ARTIFACTS_DIR}/runsc-logs.tar.gz" -C /tmp/ ${RUNTIME}/logs
+ fi
+}
+
+# Finish runs at exit, even in the event of an error, and uploads all test
+# artifacts.
+finish() {
+ # Grab the last exit code, we will return it.
+ local exit_code=${?}
+ upload_test_artifacts
+ exit ${exit_code}
+}
+
+# Run bazel in a Docker container.
+build_in_docker() {
+ cd ${WORKSPACE_DIR}
+ bazel clean
+ bazel shutdown
+ make
+ make runsc
+ make bazel-shutdown
+}
+
+########
+# MAIN #
+########
+
+main() {
+ # Register finish to run at exit.
+ trap finish EXIT
+
+ # Build and run the simple tests.
+ sanity_checks
+ build_everything opt
+ run_simple_tests
+
+ # So far so good. Install more deps and run the integration tests.
+ install_runtime
+ install_crictl_test_deps
+ run_docker_tests
+ run_root_tests
+
+ run_syscall_tests
+ run_runsc_do_tests
+
+ # Build other flavors too.
+ build_everything dbg
+
+ build_in_docker
+ # No need to call "finish" here, it will happen at exit.
+}
+
+# Kick it off.
+main